Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/ARM/ARM.h | 23
-rw-r--r--  lib/Target/ARM/ARM.td | 33
-rw-r--r--  lib/Target/ARM/ARMAddressingModes.h | 109
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 1060
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 333
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 1360
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h | 148
-rw-r--r--  lib/Target/ARM/ARMCallingConv.td | 2
-rw-r--r--  lib/Target/ARM/ARMCodeEmitter.cpp | 211
-rw-r--r--  lib/Target/ARM/ARMConstantIslandPass.cpp | 775
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.cpp | 36
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.h | 33
-rw-r--r--  lib/Target/ARM/ARMFrameInfo.h | 4
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 1196
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1523
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 91
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 906
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 856
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.h | 243
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 682
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 2127
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 662
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 826
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 195
-rw-r--r--  lib/Target/ARM/ARMJITInfo.cpp | 68
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 562
-rw-r--r--  lib/Target/ARM/ARMMCAsmInfo.cpp | 72
-rw-r--r--  lib/Target/ARM/ARMMCAsmInfo.h | 31
-rw-r--r--  lib/Target/ARM/ARMMachineFunctionInfo.h | 19
-rw-r--r--  lib/Target/ARM/ARMPerfectShuffle.h | 6586
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.cpp | 1367
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.h | 121
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 99
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 149
-rw-r--r--  lib/Target/ARM/ARMScheduleV6.td | 12
-rw-r--r--  lib/Target/ARM/ARMScheduleV7.td | 587
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 77
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 31
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 218
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.h | 50
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.h | 39
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 618
-rw-r--r--  lib/Target/ARM/AsmParser/CMakeLists.txt | 6
-rw-r--r--  lib/Target/ARM/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 691
-rw-r--r--  lib/Target/ARM/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 7
-rw-r--r--  lib/Target/ARM/Makefile | 2
-rw-r--r--  lib/Target/ARM/NEONPreAllocPass.cpp | 394
-rw-r--r--  lib/Target/ARM/README-Thumb.txt | 28
-rw-r--r--  lib/Target/ARM/README-Thumb2.txt | 6
-rw-r--r--  lib/Target/ARM/README.txt | 63
-rw-r--r--  lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 23
-rw-r--r--  lib/Target/ARM/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/ARM/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.cpp | 185
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.h | 27
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.cpp | 323
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.h | 39
-rw-r--r--  lib/Target/ARM/Thumb2ITBlockPass.cpp | 158
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp | 635
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.h | 74
-rw-r--r--  lib/Target/ARM/Thumb2RegisterInfo.cpp | 724
-rw-r--r--  lib/Target/ARM/Thumb2RegisterInfo.h | 29
-rw-r--r--  lib/Target/ARM/Thumb2SizeReduction.cpp | 685
-rw-r--r--  lib/Target/Alpha/Alpha.h | 12
-rw-r--r--  lib/Target/Alpha/Alpha.td | 6
-rw-r--r--  lib/Target/Alpha/AlphaBranchSelector.cpp | 2
-rw-r--r--  lib/Target/Alpha/AlphaCallingConv.td | 37
-rw-r--r--  lib/Target/Alpha/AlphaCodeEmitter.cpp | 36
-rw-r--r--  lib/Target/Alpha/AlphaISelDAGToDAG.cpp | 168
-rw-r--r--  lib/Target/Alpha/AlphaISelLowering.cpp | 514
-rw-r--r--  lib/Target/Alpha/AlphaISelLowering.h | 42
-rw-r--r--  lib/Target/Alpha/AlphaInstrInfo.cpp | 50
-rw-r--r--  lib/Target/Alpha/AlphaInstrInfo.h | 10
-rw-r--r--  lib/Target/Alpha/AlphaInstrInfo.td | 12
-rw-r--r--  lib/Target/Alpha/AlphaJITInfo.cpp | 46
-rw-r--r--  lib/Target/Alpha/AlphaMCAsmInfo.cpp | 22
-rw-r--r--  lib/Target/Alpha/AlphaMCAsmInfo.h | 29
-rw-r--r--  lib/Target/Alpha/AlphaRegisterInfo.cpp | 39
-rw-r--r--  lib/Target/Alpha/AlphaRegisterInfo.h | 5
-rw-r--r--  lib/Target/Alpha/AlphaSubtarget.cpp | 2
-rw-r--r--  lib/Target/Alpha/AlphaSubtarget.h | 5
-rw-r--r--  lib/Target/Alpha/AlphaTargetMachine.cpp | 99
-rw-r--r--  lib/Target/Alpha/AlphaTargetMachine.h | 34
-rw-r--r--  lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp | 122
-rw-r--r--  lib/Target/Alpha/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/Alpha/CMakeLists.txt | 3
-rw-r--r--  lib/Target/Alpha/Makefile | 4
-rw-r--r--  lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp | 20
-rw-r--r--  lib/Target/Alpha/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Alpha/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp | 242
-rw-r--r--  lib/Target/Blackfin/AsmPrinter/CMakeLists.txt | 6
-rw-r--r--  lib/Target/Blackfin/AsmPrinter/Makefile | 16
-rw-r--r--  lib/Target/Blackfin/Blackfin.h | 38
-rw-r--r--  lib/Target/Blackfin/Blackfin.td | 201
-rw-r--r--  lib/Target/Blackfin/BlackfinCallingConv.td | 30
-rw-r--r--  lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp | 191
-rw-r--r--  lib/Target/Blackfin/BlackfinISelLowering.cpp | 614
-rw-r--r--  lib/Target/Blackfin/BlackfinISelLowering.h | 81
-rw-r--r--  lib/Target/Blackfin/BlackfinInstrFormats.td | 34
-rw-r--r--  lib/Target/Blackfin/BlackfinInstrInfo.cpp | 280
-rw-r--r--  lib/Target/Blackfin/BlackfinInstrInfo.h | 80
-rw-r--r--  lib/Target/Blackfin/BlackfinInstrInfo.td | 873
-rw-r--r--  lib/Target/Blackfin/BlackfinMCAsmInfo.cpp | 21
-rw-r--r--  lib/Target/Blackfin/BlackfinMCAsmInfo.h | 29
-rw-r--r--  lib/Target/Blackfin/BlackfinRegisterInfo.cpp | 472
-rw-r--r--  lib/Target/Blackfin/BlackfinRegisterInfo.h | 104
-rw-r--r--  lib/Target/Blackfin/BlackfinRegisterInfo.td | 385
-rw-r--r--  lib/Target/Blackfin/BlackfinSubtarget.cpp | 36
-rw-r--r--  lib/Target/Blackfin/BlackfinSubtarget.h | 45
-rw-r--r--  lib/Target/Blackfin/BlackfinTargetMachine.cpp | 42
-rw-r--r--  lib/Target/Blackfin/BlackfinTargetMachine.h | 54
-rw-r--r--  lib/Target/Blackfin/CMakeLists.txt | 21
-rw-r--r--  lib/Target/Blackfin/Makefile | 23
-rw-r--r--  lib/Target/Blackfin/README.txt | 244
-rw-r--r--  lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp | 21
-rw-r--r--  lib/Target/Blackfin/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Blackfin/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/CBackend/CBackend.cpp | 383
-rw-r--r--  lib/Target/CBackend/CTargetMachine.h | 17
-rw-r--r--  lib/Target/CBackend/Makefile | 3
-rw-r--r--  lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp | 19
-rw-r--r--  lib/Target/CBackend/TargetInfo/CMakeLists.txt | 6
-rw-r--r--  lib/Target/CBackend/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/CMakeLists.txt | 8
-rw-r--r--  lib/Target/CellSPU/AsmPrinter/CMakeLists.txt | 2
-rw-r--r--  lib/Target/CellSPU/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp | 160
-rw-r--r--  lib/Target/CellSPU/CMakeLists.txt | 2
-rw-r--r--  lib/Target/CellSPU/Makefile | 2
-rw-r--r--  lib/Target/CellSPU/SPU.h | 8
-rw-r--r--  lib/Target/CellSPU/SPUHazardRecognizers.cpp | 7
-rw-r--r--  lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 317
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.cpp | 615
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.h | 43
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.cpp | 103
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.h | 12
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.td | 7
-rw-r--r--  lib/Target/CellSPU/SPUMCAsmInfo.cpp | 40
-rw-r--r--  lib/Target/CellSPU/SPUMCAsmInfo.h | 28
-rw-r--r--  lib/Target/CellSPU/SPUNodes.td | 6
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.cpp | 28
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.h | 8
-rw-r--r--  lib/Target/CellSPU/SPUSubtarget.cpp | 6
-rw-r--r--  lib/Target/CellSPU/SPUSubtarget.h | 9
-rw-r--r--  lib/Target/CellSPU/SPUTargetMachine.cpp | 65
-rw-r--r--  lib/Target/CellSPU/SPUTargetMachine.h | 27
-rw-r--r--  lib/Target/CellSPU/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp | 20
-rw-r--r--  lib/Target/CellSPU/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/CppBackend/CPPBackend.cpp | 130
-rw-r--r--  lib/Target/CppBackend/CPPTargetMachine.h | 19
-rw-r--r--  lib/Target/CppBackend/Makefile | 3
-rw-r--r--  lib/Target/CppBackend/TargetInfo/CMakeLists.txt | 6
-rw-r--r--  lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp | 26
-rw-r--r--  lib/Target/CppBackend/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/MSIL/MSILWriter.cpp | 184
-rw-r--r--  lib/Target/MSIL/MSILWriter.h | 29
-rw-r--r--  lib/Target/MSIL/Makefile | 3
-rw-r--r--  lib/Target/MSIL/TargetInfo/CMakeLists.txt | 6
-rw-r--r--  lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp | 26
-rw-r--r--  lib/Target/MSIL/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/MSP430/AsmPrinter/CMakeLists.txt | 6
-rw-r--r--  lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp | 281
-rw-r--r--  lib/Target/MSP430/AsmPrinter/Makefile | 15
-rw-r--r--  lib/Target/MSP430/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MSP430/MSP430.h | 8
-rw-r--r--  lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 189
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.cpp | 262
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.h | 60
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.cpp | 7
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.td | 82
-rw-r--r--  lib/Target/MSP430/MSP430MCAsmInfo.cpp | 20
-rw-r--r--  lib/Target/MSP430/MSP430MCAsmInfo.h | 28
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.cpp | 35
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.h | 7
-rw-r--r--  lib/Target/MSP430/MSP430Subtarget.cpp | 4
-rw-r--r--  lib/Target/MSP430/MSP430Subtarget.h | 7
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.cpp | 57
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.h | 10
-rw-r--r--  lib/Target/MSP430/Makefile | 4
-rw-r--r--  lib/Target/MSP430/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp | 20
-rw-r--r--  lib/Target/MSP430/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/Mips/AsmPrinter/CMakeLists.txt | 2
-rw-r--r--  lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp | 236
-rw-r--r--  lib/Target/Mips/CMakeLists.txt | 3
-rw-r--r--  lib/Target/Mips/Makefile | 2
-rw-r--r--  lib/Target/Mips/Mips.h | 9
-rw-r--r--  lib/Target/Mips/MipsISelDAGToDAG.cpp | 82
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 302
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h | 48
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 50
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.h | 45
-rw-r--r--  lib/Target/Mips/MipsMCAsmInfo.cpp | 27
-rw-r--r--  lib/Target/Mips/MipsMCAsmInfo.h | 30
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.h | 4
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.cpp | 45
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.h | 7
-rw-r--r--  lib/Target/Mips/MipsSubtarget.cpp | 36
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 20
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.cpp | 101
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.h | 31
-rw-r--r--  lib/Target/Mips/MipsTargetObjectFile.cpp | 93
-rw-r--r--  lib/Target/Mips/MipsTargetObjectFile.h | 41
-rw-r--r--  lib/Target/Mips/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Mips/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp | 21
-rw-r--r--  lib/Target/PIC16/AsmPrinter/CMakeLists.txt | 9
-rw-r--r--  lib/Target/PIC16/AsmPrinter/Makefile | 15
-rw-r--r--  lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp | 484
-rw-r--r--  lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h | 80
-rw-r--r--  lib/Target/PIC16/CMakeLists.txt | 4
-rw-r--r--  lib/Target/PIC16/MCSectionPIC16.h | 88
-rw-r--r--  lib/Target/PIC16/Makefile | 4
-rw-r--r--  lib/Target/PIC16/PIC16.h | 50
-rw-r--r--  lib/Target/PIC16/PIC16DebugInfo.cpp | 197
-rw-r--r--  lib/Target/PIC16/PIC16DebugInfo.h | 16
-rw-r--r--  lib/Target/PIC16/PIC16ISelDAGToDAG.cpp | 2
-rw-r--r--  lib/Target/PIC16/PIC16ISelDAGToDAG.h | 2
-rw-r--r--  lib/Target/PIC16/PIC16ISelLowering.cpp | 431
-rw-r--r--  lib/Target/PIC16/PIC16ISelLowering.h | 65
-rw-r--r--  lib/Target/PIC16/PIC16InstrInfo.cpp | 5
-rw-r--r--  lib/Target/PIC16/PIC16InstrInfo.td | 11
-rw-r--r--  lib/Target/PIC16/PIC16MCAsmInfo.cpp | 58
-rw-r--r--  lib/Target/PIC16/PIC16MCAsmInfo.h | 35
-rw-r--r--  lib/Target/PIC16/PIC16MemSelOpt.cpp | 2
-rw-r--r--  lib/Target/PIC16/PIC16RegisterInfo.cpp | 17
-rw-r--r--  lib/Target/PIC16/PIC16RegisterInfo.h | 5
-rw-r--r--  lib/Target/PIC16/PIC16Subtarget.cpp | 2
-rw-r--r--  lib/Target/PIC16/PIC16Subtarget.h | 5
-rw-r--r--  lib/Target/PIC16/PIC16TargetMachine.cpp | 57
-rw-r--r--  lib/Target/PIC16/PIC16TargetMachine.h | 18
-rw-r--r--  lib/Target/PIC16/PIC16TargetObjectFile.cpp | 440
-rw-r--r--  lib/Target/PIC16/PIC16TargetObjectFile.h | 120
-rw-r--r--  lib/Target/PIC16/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/PIC16/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp | 21
-rw-r--r--  lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp | 565
-rw-r--r--  lib/Target/PowerPC/CMakeLists.txt | 2
-rw-r--r--  lib/Target/PowerPC/Makefile | 2
-rw-r--r--  lib/Target/PowerPC/PPC.h | 13
-rw-r--r--  lib/Target/PowerPC/PPCCodeEmitter.cpp | 37
-rw-r--r--  lib/Target/PowerPC/PPCFrameInfo.h | 285
-rw-r--r--  lib/Target/PowerPC/PPCHazardRecognizers.cpp | 10
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 271
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 836
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 142
-rw-r--r--  lib/Target/PowerPC/PPCInstr64Bit.td | 50
-rw-r--r--  lib/Target/PowerPC/PPCInstrBuilder.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 86
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.h | 10
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 63
-rw-r--r--  lib/Target/PowerPC/PPCJITInfo.cpp | 10
-rw-r--r--  lib/Target/PowerPC/PPCMCAsmInfo.cpp | 58
-rw-r--r--  lib/Target/PowerPC/PPCMCAsmInfo.h | 31
-rw-r--r--  lib/Target/PowerPC/PPCMachOWriterInfo.cpp | 5
-rw-r--r--  lib/Target/PowerPC/PPCPredicates.cpp | 3
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 149
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.h | 7
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.td | 18
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.cpp | 28
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.h | 25
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 177
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 45
-rw-r--r--  lib/Target/PowerPC/README.txt | 2
-rw-r--r--  lib/Target/PowerPC/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/PowerPC/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 23
-rw-r--r--  lib/Target/README.txt | 125
-rw-r--r--  lib/Target/Sparc/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp | 203
-rw-r--r--  lib/Target/Sparc/CMakeLists.txt | 2
-rw-r--r--  lib/Target/Sparc/FPMover.cpp | 10
-rw-r--r--  lib/Target/Sparc/Makefile | 2
-rw-r--r--  lib/Target/Sparc/Sparc.h | 10
-rw-r--r--  lib/Target/Sparc/SparcISelDAGToDAG.cpp | 33
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.cpp | 257
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.h | 39
-rw-r--r--  lib/Target/Sparc/SparcInstrInfo.cpp | 73
-rw-r--r--  lib/Target/Sparc/SparcInstrInfo.h | 12
-rw-r--r--  lib/Target/Sparc/SparcInstrInfo.td | 65
-rw-r--r--  lib/Target/Sparc/SparcMCAsmInfo.cpp | 38
-rw-r--r--  lib/Target/Sparc/SparcMCAsmInfo.h | 28
-rw-r--r--  lib/Target/Sparc/SparcMachineFunctionInfo.h | 32
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.cpp | 21
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.h | 5
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.td | 12
-rw-r--r--  lib/Target/Sparc/SparcSubtarget.cpp | 2
-rw-r--r--  lib/Target/Sparc/SparcSubtarget.h | 5
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.cpp | 66
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.h | 24
-rw-r--r--  lib/Target/Sparc/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Sparc/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 19
-rw-r--r--  lib/Target/SubtargetFeature.cpp | 45
-rw-r--r--  lib/Target/SystemZ/AsmPrinter/CMakeLists.txt | 6
-rw-r--r--  lib/Target/SystemZ/AsmPrinter/Makefile | 15
-rw-r--r--  lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp | 391
-rw-r--r--  lib/Target/SystemZ/CMakeLists.txt | 23
-rw-r--r--  lib/Target/SystemZ/Makefile | 22
-rw-r--r--  lib/Target/SystemZ/SystemZ.h | 61
-rw-r--r--  lib/Target/SystemZ/SystemZ.td | 61
-rw-r--r--  lib/Target/SystemZ/SystemZCallingConv.td | 46
-rw-r--r--  lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 829
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.cpp | 843
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.h | 141
-rw-r--r--  lib/Target/SystemZ/SystemZInstrBuilder.h | 128
-rw-r--r--  lib/Target/SystemZ/SystemZInstrFP.td | 340
-rw-r--r--  lib/Target/SystemZ/SystemZInstrFormats.td | 133
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.cpp | 648
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.h | 119
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.td | 1155
-rw-r--r--  lib/Target/SystemZ/SystemZMCAsmInfo.cpp | 26
-rw-r--r--  lib/Target/SystemZ/SystemZMCAsmInfo.h | 29
-rw-r--r--  lib/Target/SystemZ/SystemZMachineFunctionInfo.h | 50
-rw-r--r--  lib/Target/SystemZ/SystemZOperands.td | 306
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.cpp | 343
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.h | 82
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.td | 490
-rw-r--r--  lib/Target/SystemZ/SystemZSubtarget.cpp | 47
-rw-r--r--  lib/Target/SystemZ/SystemZSubtarget.h | 45
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.cpp | 44
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.h | 61
-rw-r--r--  lib/Target/SystemZ/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/SystemZ/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp | 19
-rw-r--r--  lib/Target/Target.cpp | 2
-rw-r--r--  lib/Target/TargetData.cpp | 90
-rw-r--r--  lib/Target/TargetInstrInfo.cpp | 69
-rw-r--r--  lib/Target/TargetLoweringObjectFile.cpp | 1089
-rw-r--r--  lib/Target/TargetMachine.cpp | 39
-rw-r--r--  lib/Target/TargetRegisterInfo.cpp | 10
-rw-r--r--  lib/Target/X86/AsmParser/CMakeLists.txt | 6
-rw-r--r--  lib/Target/X86/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 479
-rw-r--r--  lib/Target/X86/AsmPrinter/CMakeLists.txt | 6
-rw-r--r--  lib/Target/X86/AsmPrinter/Makefile | 2
-rw-r--r--  lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp | 84
-rw-r--r--  lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h | 86
-rw-r--r--  lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp | 947
-rw-r--r--  lib/Target/X86/AsmPrinter/X86AsmPrinter.h | 150
-rw-r--r--  lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp | 131
-rw-r--r--  lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h | 99
-rw-r--r--  lib/Target/X86/AsmPrinter/X86MCInstLower.cpp | 485
-rw-r--r--  lib/Target/X86/AsmPrinter/X86MCInstLower.h | 54
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 14
-rw-r--r--  lib/Target/X86/Makefile | 4
-rw-r--r--  lib/Target/X86/README-X86-64.txt | 49
-rw-r--r--  lib/Target/X86/README.txt | 20
-rw-r--r--  lib/Target/X86/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/X86/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 23
-rw-r--r--  lib/Target/X86/X86.h | 17
-rw-r--r--  lib/Target/X86/X86.td | 32
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.cpp | 123
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.h | 67
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 45
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 672
-rw-r--r--  lib/Target/X86/X86CompilationCallback_Win64.asm | 31
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.cpp | 93
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.h | 23
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 415
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 45
-rw-r--r--  lib/Target/X86/X86FloatingPointRegKill.cpp | 6
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 1124
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 2359
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 188
-rw-r--r--  lib/Target/X86/X86Instr64bit.td | 468
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h | 52
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td | 25
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 30
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 1336
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 122
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 802
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 121
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 560
-rw-r--r--  lib/Target/X86/X86JITInfo.cpp | 81
-rw-r--r--  lib/Target/X86/X86JITInfo.h | 7
-rw-r--r--  lib/Target/X86/X86MCAsmInfo.cpp | 123
-rw-r--r--  lib/Target/X86/X86MCAsmInfo.h | 42
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 616
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 27
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 344
-rw-r--r--  lib/Target/X86/X86Relocations.h | 30
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 232
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 107
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 279
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 50
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 65
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 40
-rw-r--r--  lib/Target/XCore/AsmPrinter/CMakeLists.txt | 6
-rw-r--r--  lib/Target/XCore/AsmPrinter/Makefile | 16
-rw-r--r--  lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp | 374
-rw-r--r--  lib/Target/XCore/CMakeLists.txt | 5
-rw-r--r--  lib/Target/XCore/MCSectionXCore.cpp | 35
-rw-r--r--  lib/Target/XCore/MCSectionXCore.h | 54
-rw-r--r--  lib/Target/XCore/Makefile | 4
-rw-r--r--  lib/Target/XCore/TargetInfo/CMakeLists.txt | 7
-rw-r--r--  lib/Target/XCore/TargetInfo/Makefile | 15
-rw-r--r--  lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp | 19
-rw-r--r--  lib/Target/XCore/XCore.h | 8
-rw-r--r--  lib/Target/XCore/XCore.td | 17
-rw-r--r--  lib/Target/XCore/XCoreISelDAGToDAG.cpp | 64
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.cpp | 606
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.h | 57
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.cpp | 47
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.h | 14
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.td | 68
-rw-r--r--  lib/Target/XCore/XCoreMCAsmInfo.cpp | 31
-rw-r--r--  lib/Target/XCore/XCoreMCAsmInfo.h | 29
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.cpp | 97
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.h | 5
-rw-r--r--  lib/Target/XCore/XCoreSubtarget.cpp | 10
-rw-r--r--  lib/Target/XCore/XCoreSubtarget.h | 11
-rw-r--r--  lib/Target/XCore/XCoreTargetMachine.cpp | 50
-rw-r--r--  lib/Target/XCore/XCoreTargetMachine.h | 13
-rw-r--r--  lib/Target/XCore/XCoreTargetObjectFile.cpp | 67
-rw-r--r--  lib/Target/XCore/XCoreTargetObjectFile.h | 26
421 files changed, 51402 insertions(+), 16713 deletions(-)
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 08dc07c..487ce1d 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,6 +15,7 @@
#ifndef TARGET_ARM_H
#define TARGET_ARM_H
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -24,7 +25,8 @@ class ARMBaseTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
class JITCodeEmitter;
-class raw_ostream;
+class ObjectCodeEmitter;
+class formatted_raw_ostream;
// Enums corresponding to ARM condition codes
namespace ARMCC {
@@ -50,7 +52,7 @@ namespace ARMCC {
inline static CondCodes getOppositeCondition(CondCodes CC){
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case EQ: return NE;
case NE: return EQ;
case HS: return LO;
@@ -71,7 +73,7 @@ namespace ARMCC {
inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case ARMCC::EQ: return "eq";
case ARMCC::NE: return "ne";
case ARMCC::HS: return "hs";
@@ -90,20 +92,23 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
}
}
-FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM);
-FunctionPass *createARMCodePrinterPass(raw_ostream &O,
- ARMBaseTargetMachine &TM,
- bool Verbose);
-FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
- MachineCodeEmitter &MCE);
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
JITCodeEmitter &JCE);
+FunctionPass *createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM,
+ ObjectCodeEmitter &OCE);
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMConstantIslandPass();
+FunctionPass *createNEONPreAllocPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createThumb2SizeReductionPass();
+
+extern Target TheARMTarget, TheThumbTarget;
} // end namespace llvm;
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 9001e50..8851fbb 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -89,27 +89,20 @@ def : ProcNoItin<"xscale", [ArchV5TE]>;
def : ProcNoItin<"iwmmxt", [ArchV5TE]>;
// V6 Processors.
-def : Processor<"arm1136j-s", V6Itineraries,
- [ArchV6]>;
-def : Processor<"arm1136jf-s", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
-def : Processor<"arm1176jz-s", V6Itineraries,
- [ArchV6]>;
-def : Processor<"arm1176jzf-s", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
-def : Processor<"mpcorenovfp", V6Itineraries,
- [ArchV6]>;
-def : Processor<"mpcore", V6Itineraries,
- [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"arm1136j-s", [ArchV6]>;
+def : ProcNoItin<"arm1136jf-s", [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"arm1176jz-s", [ArchV6]>;
+def : ProcNoItin<"arm1176jzf-s", [ArchV6, FeatureVFP2]>;
+def : ProcNoItin<"mpcorenovfp", [ArchV6]>;
+def : ProcNoItin<"mpcore", [ArchV6, FeatureVFP2]>;
// V6T2 Processors.
-def : Processor<"arm1156t2-s", V6Itineraries,
- [ArchV6T2, FeatureThumb2]>;
-def : Processor<"arm1156t2f-s", V6Itineraries,
- [ArchV6T2, FeatureThumb2, FeatureVFP2]>;
+def : ProcNoItin<"arm1156t2-s", [ArchV6T2, FeatureThumb2]>;
+def : ProcNoItin<"arm1156t2f-s", [ArchV6T2, FeatureThumb2, FeatureVFP2]>;
// V7 Processors.
-def : ProcNoItin<"cortex-a8", [ArchV7A, FeatureThumb2, FeatureNEON]>;
+def : Processor<"cortex-a8", CortexA8Itineraries,
+ [ArchV7A, FeatureThumb2, FeatureNEON]>;
def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>;
//===----------------------------------------------------------------------===//
@@ -131,13 +124,13 @@ def ARMInstrInfo : InstrInfo {
let TSFlagsFields = ["AddrModeBits",
"SizeFlag",
"IndexModeBits",
- "isUnaryDataProc",
- "Form"];
+ "Form",
+ "isUnaryDataProc"];
let TSFlagsShifts = [0,
4,
7,
9,
- 10];
+ 15];
}
//===----------------------------------------------------------------------===//
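The TSFlags change above moves Form ahead of isUnaryDataProc and widens it: with shifts [0, 4, 7, 9, 15], the fields occupy 4, 3, 2, 6 and 1 bits respectively. A minimal C++ sketch of unpacking that layout, with widths inferred from the shift list (the authoritative masks are the ARMII enums in ARMBaseInstrInfo.h):

  #include <cstdint>

  // Decodes the TSFlags layout declared by TSFlagsFields/TSFlagsShifts above.
  struct ARMTSFlagsView {
    unsigned AddrMode;        // bits 0-3
    unsigned Size;            // bits 4-6
    unsigned IndexMode;       // bits 7-8
    unsigned Form;            // bits 9-14 (now 6 bits wide)
    bool     IsUnaryDataProc; // bit 15 (moved above Form)
  };

  static ARMTSFlagsView decodeTSFlags(uint32_t TSFlags) {
    ARMTSFlagsView V;
    V.AddrMode        =  TSFlags        & 0xF;
    V.Size            = (TSFlags >> 4)  & 0x7;
    V.IndexMode       = (TSFlags >> 7)  & 0x3;
    V.Form            = (TSFlags >> 9)  & 0x3F;
    V.IsUnaryDataProc = ((TSFlags >> 15) & 0x1) != 0;
    return V;
  }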
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
index 15c9ec1..1839153 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -15,11 +15,12 @@
#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
namespace llvm {
-
+
/// ARM_AM - ARM Addressing Mode Stuff
namespace ARM_AM {
enum ShiftOpc {
@@ -30,14 +31,14 @@ namespace ARM_AM {
ror,
rrx
};
-
+
enum AddrOpc {
add = '+', sub = '-'
};
-
+
static inline const char *getShiftOpcStr(ShiftOpc Op) {
switch (Op) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::asr: return "asr";
case ARM_AM::lsl: return "lsl";
case ARM_AM::lsr: return "lsr";
@@ -45,7 +46,7 @@ namespace ARM_AM {
case ARM_AM::rrx: return "rrx";
}
}
-
+
static inline ShiftOpc getShiftOpcForNode(SDValue N) {
switch (N.getOpcode()) {
default: return ARM_AM::no_shift;
@@ -70,7 +71,7 @@ namespace ARM_AM {
static inline const char *getAMSubModeStr(AMSubMode Mode) {
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
+ default: llvm_unreachable("Unknown addressing sub-mode!");
case ARM_AM::ia: return "ia";
case ARM_AM::ib: return "ib";
case ARM_AM::da: return "da";
@@ -80,7 +81,7 @@ namespace ARM_AM {
static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
+ default: llvm_unreachable("Unknown addressing sub-mode!");
case ARM_AM::ia: return isLD ? "fd" : "ea";
case ARM_AM::ib: return isLD ? "ed" : "fa";
case ARM_AM::da: return isLD ? "fa" : "ed";
@@ -94,14 +95,14 @@ namespace ARM_AM {
assert(Amt < 32 && "Invalid rotate amount");
return (Val >> Amt) | (Val << ((32-Amt)&31));
}
-
+
/// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
///
static inline unsigned rotl32(unsigned Val, unsigned Amt) {
assert(Amt < 32 && "Invalid rotate amount");
return (Val << Amt) | (Val >> ((32-Amt)&31));
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #1: shift_operand with registers
//===--------------------------------------------------------------------===//
@@ -136,7 +137,7 @@ namespace ARM_AM {
static inline unsigned getSOImmValRot(unsigned Imm) {
return (Imm >> 8) * 2;
}
-
+
/// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
/// computing the rotate amount to use. If this immediate value cannot be
/// handled with a single shifter-op, determine a good rotate amount that will
@@ -145,14 +146,14 @@ namespace ARM_AM {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Imm & ~255U) == 0) return 0;
-
+
// Use CTZ to compute the rotate amount.
unsigned TZ = CountTrailingZeros_32(Imm);
-
+
// Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
// not 9.
unsigned RotAmt = TZ & ~1;
-
+
// If we can handle this spread, return it.
if ((rotr32(Imm, RotAmt) & ~255U) == 0)
return (32-RotAmt)&31; // HW rotates right, not left.
@@ -165,16 +166,16 @@ namespace ARM_AM {
// Restart the search for a high-order bit after the initial sequence of
// ones.
unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
-
+
// Rotate amount must be even.
unsigned RotAmt2 = TZ2 & ~1;
-
+
// If this fits, use it.
if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
return (32-RotAmt2)&31; // HW rotates right, not left.
}
}
-
+
// Otherwise, we have no way to cover this span of bits with a single
// shifter_op immediate. Return a chunk of bits that will be useful to
// handle.
@@ -188,17 +189,17 @@ namespace ARM_AM {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Arg & ~255U) == 0) return Arg;
-
+
unsigned RotAmt = getSOImmValRotate(Arg);
// If this cannot be handled with a single shifter_op, bail out.
if (rotr32(~255U, RotAmt) & Arg)
return -1;
-
+
// Encode this correctly.
return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
}
-
+
/// isSOImmTwoPartVal - Return true if the specified value can be obtained by
/// or'ing together two SOImmVal's.
static inline bool isSOImmTwoPartVal(unsigned V) {
@@ -206,12 +207,12 @@ namespace ARM_AM {
V = rotr32(~255U, getSOImmValRotate(V)) & V;
if (V == 0)
return false;
-
+
// If this can be handled with two shifter_op's, accept.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
return V == 0;
}
-
+
/// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
/// return the first chunk of it.
static inline unsigned getSOImmTwoPartFirst(unsigned V) {
@@ -221,14 +222,14 @@ namespace ARM_AM {
/// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
/// return the second chunk of it.
static inline unsigned getSOImmTwoPartSecond(unsigned V) {
- // Mask out the first hunk.
+ // Mask out the first hunk.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
-
+
// Take what's left.
assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
return V;
}
-
+
/// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
/// by a left shift. Returns the shift amount to use.
static inline unsigned getThumbImmValShift(unsigned Imm) {
@@ -243,7 +244,7 @@ namespace ARM_AM {
/// isThumbImmShiftedVal - Return true if the specified value can be obtained
/// by left shifting a 8-bit immediate.
static inline bool isThumbImmShiftedVal(unsigned V) {
- // If this can be handled with
+ // If this can be handled with
V = (~255U << getThumbImmValShift(V)) & V;
return V == 0;
}
@@ -259,10 +260,10 @@ namespace ARM_AM {
return CountTrailingZeros_32(Imm);
}
- /// isThumbImm16ShiftedVal - Return true if the specified value can be
+ /// isThumbImm16ShiftedVal - Return true if the specified value can be
/// obtained by left shifting a 16-bit immediate.
static inline bool isThumbImm16ShiftedVal(unsigned V) {
- // If this can be handled with
+ // If this can be handled with
V = (~65535U << getThumbImm16ValShift(V)) & V;
return V == 0;
}
@@ -273,28 +274,6 @@ namespace ARM_AM {
return V >> getThumbImmValShift(V);
}
- /// getT2SOImmValDecode - Given a 12-bit encoded Thumb-2 modified immediate,
- /// return the corresponding 32-bit immediate value.
- /// See ARM Reference Manual A6.3.2.
- static inline unsigned getT2SOImmValDecode(unsigned Imm) {
- unsigned Base = Imm & 0xff;
- switch ((Imm >> 8) & 0xf) {
- case 0:
- return Base;
- case 1:
- return Base | (Base << 16);
- case 2:
- return (Base << 8) | (Base << 24);
- case 3:
- return Base | (Base << 8) | (Base << 16) | (Base << 24);
- default:
- break;
- }
-
- // shifted immediate
- unsigned RotAmount = ((Imm >> 7) & 0x1f) - 8;
- return (Base | 0x80) << (24 - RotAmount);
- }
/// getT2SOImmValSplat - Return the 12-bit encoded representation
/// if the specified value can be obtained by splatting the low 8 bits
@@ -305,12 +284,12 @@ namespace ARM_AM {
/// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
/// Return -1 if none of the above apply.
/// See ARM Reference Manual A6.3.2.
- static inline int getT2SOImmValSplat(unsigned V) {
+ static inline int getT2SOImmValSplatVal(unsigned V) {
unsigned u, Vs, Imm;
// control = 0
- if ((V & 0xffffff00) == 0)
+ if ((V & 0xffffff00) == 0)
return V;
-
+
// If the value is zeroes in the first byte, just shift those off
Vs = ((V & 0xff) == 0) ? V >> 8 : V;
// Any passing value only has 8 bits of payload, splatted across the word
@@ -329,11 +308,11 @@ namespace ARM_AM {
return -1;
}
- /// getT2SOImmValRotate - Return the 12-bit encoded representation if the
+ /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
/// specified value is a rotated 8-bit value. Return -1 if no rotation
/// encoding is possible.
/// See ARM Reference Manual A6.3.2.
- static inline int getT2SOImmValRotate (unsigned V) {
+ static inline int getT2SOImmValRotateVal(unsigned V) {
unsigned RotAmt = CountLeadingZeros_32(V);
if (RotAmt >= 24)
return -1;
@@ -346,23 +325,23 @@ namespace ARM_AM {
}
/// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
- /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+ /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
/// encoding for it. If not, return -1.
/// See ARM Reference Manual A6.3.2.
static inline int getT2SOImmVal(unsigned Arg) {
// If 'Arg' is an 8-bit splat, then get the encoded value.
- int Splat = getT2SOImmValSplat(Arg);
+ int Splat = getT2SOImmValSplatVal(Arg);
if (Splat != -1)
return Splat;
-
+
// If 'Arg' can be handled with a single shifter_op return the value.
- int Rot = getT2SOImmValRotate(Arg);
+ int Rot = getT2SOImmValRotateVal(Arg);
if (Rot != -1)
return Rot;
return -1;
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #2
@@ -380,7 +359,7 @@ namespace ARM_AM {
// If this addressing mode is a frame index (before prolog/epilog insertion
// and code rewriting), this operand will have the form: FI#, reg0, <offs>
// with no shift amount for the frame offset.
- //
+ //
static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
assert(Imm12 < (1 << 12) && "Imm too large!");
bool isSub = Opc == sub;
@@ -395,8 +374,8 @@ namespace ARM_AM {
static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
return (ShiftOpc)(AM2Opc >> 13);
}
-
-
+
+
//===--------------------------------------------------------------------===//
// Addressing Mode #3
//===--------------------------------------------------------------------===//
@@ -409,7 +388,7 @@ namespace ARM_AM {
// The first operand is always a Reg. The second operand is a reg if in
// reg/reg form, otherwise it's reg#0. The third field encodes the operation
// in bit 8, the immediate in bits 0-7.
-
+
/// getAM3Opc - This function encodes the addrmode3 opc field.
static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
@@ -421,7 +400,7 @@ namespace ARM_AM {
static inline AddrOpc getAM3Op(unsigned AM3Opc) {
return ((AM3Opc >> 8) & 1) ? sub : add;
}
-
+
//===--------------------------------------------------------------------===//
// Addressing Mode #4
//===--------------------------------------------------------------------===//
@@ -469,7 +448,7 @@ namespace ARM_AM {
//
// IA - Increment after
// DB - Decrement before
-
+
/// getAM5Opc - This function encodes the addrmode5 opc field.
static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
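The AddrMode1 helpers above all revolve around the ARM "shifter operand" immediate: a value is encodable iff it is an 8-bit constant rotated right by an even amount, and the 12-bit encoding is imm8 | (rot/2) << 8. A self-contained brute-force encoder that demonstrates the scheme (illustration only; the in-tree getSOImmValRotate computes the rotation directly with CountTrailingZeros_32 rather than searching):

  #include <cstdint>
  #include <cstdio>

  static uint32_t rotl32(uint32_t V, unsigned Amt) {
    return (V << Amt) | (V >> ((32 - Amt) & 31));
  }

  // Returns the 12-bit so_imm encoding of V, or -1 if V is not encodable.
  // Rotating V left by the (even) rotate amount must leave an 8-bit value.
  static int encodeSOImm(uint32_t V) {
    for (unsigned Rot = 0; Rot < 32; Rot += 2)
      if ((rotl32(V, Rot) & ~255u) == 0)
        return (int)(rotl32(V, Rot) | (Rot / 2) << 8);
    return -1;
  }

  int main() {
    std::printf("%#x\n", encodeSOImm(0xFF000000)); // 0x4ff: imm8=0xff, rot=8
    std::printf("%d\n",  encodeSOImm(0x101));      // -1: bits 0 and 8 cannot
                                                   // share one 8-bit window
  }

Values like 0x101 are what isSOImmTwoPartVal is for: they need an OR of two shifter-operand immediates. The Thumb-2 getT2SOImmVal path is looser, additionally accepting the byte-splat patterns (0x00XY00XY, 0xXY00XY00, 0xXYXYXYXY) before falling back to a rotated 8-bit value.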
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 0000000..ecdf5a0
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,1060 @@
+//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+ cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+ARMBaseInstrInfo::ARMBaseInstrInfo()
+ : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) {
+}
+
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ // FIXME: Thumb2 support.
+
+ if (!EnableARM3Addr)
+ return NULL;
+
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ unsigned TSFlags = MI->getDesc().TSFlags;
+ bool isPre = false;
+ switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+ default: return NULL;
+ case ARMII::IndexModePre:
+ isPre = true;
+ break;
+ case ARMII::IndexModePost:
+ break;
+ }
+
+ // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+ // operation.
+ unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+ if (MemOpc == 0)
+ return NULL;
+
+ MachineInstr *UpdateMI = NULL;
+ MachineInstr *MemMI = NULL;
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned NumOps = TID.getNumOperands();
+ bool isLoad = !TID.mayStore();
+ const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+ const MachineOperand &Base = MI->getOperand(2);
+ const MachineOperand &Offset = MI->getOperand(NumOps-3);
+ unsigned WBReg = WB.getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned OffReg = Offset.getReg();
+ unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+ switch (AddrMode) {
+ default:
+ assert(false && "Unknown indexed op!");
+ return NULL;
+ case ARMII::AddrMode2: {
+ bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ if (OffReg == 0) {
+ if (ARM_AM::getSOImmVal(Amt) == -1)
+ // Can't encode it in a so_imm operand. This transformation will
+ // add more than 1 instruction. Abandon!
+ return NULL;
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(Amt)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else if (Amt != 0) {
+ ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+ .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+ .addImm(Pred).addReg(0).addReg(0);
+ } else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ case ARMII::AddrMode3 : {
+ bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+ unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+ if (OffReg == 0)
+ // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+ .addReg(BaseReg).addImm(Amt)
+ .addImm(Pred).addReg(0).addReg(0);
+ else
+ UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+ get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+ .addReg(BaseReg).addReg(OffReg)
+ .addImm(Pred).addReg(0).addReg(0);
+ break;
+ }
+ }
+
+ std::vector<MachineInstr*> NewMIs;
+ if (isPre) {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+ NewMIs.push_back(MemMI);
+ NewMIs.push_back(UpdateMI);
+ } else {
+ if (isLoad)
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc), MI->getOperand(0).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ else
+ MemMI = BuildMI(MF, MI->getDebugLoc(),
+ get(MemOpc)).addReg(MI->getOperand(1).getReg())
+ .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+ if (WB.isDead())
+ UpdateMI->getOperand(0).setIsDead();
+ NewMIs.push_back(UpdateMI);
+ NewMIs.push_back(MemMI);
+ }
+
+ // Transfer LiveVariables states, kill / dead info.
+ if (LV) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+ if (MO.isDef()) {
+ MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+ if (MO.isDead())
+ LV->addVirtualRegisterDead(Reg, NewMI);
+ }
+ if (MO.isUse() && MO.isKill()) {
+ for (unsigned j = 0; j < 2; ++j) {
+ // Look at the two new MI's in reverse order.
+ MachineInstr *NewMI = NewMIs[j];
+ if (!NewMI->readsRegister(Reg))
+ continue;
+ LV->addVirtualRegisterKilled(Reg, NewMI);
+ if (VI.removeKill(MI))
+ VI.Kills.push_back(NewMI);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ MFI->insert(MBBI, NewMIs[1]);
+ MFI->insert(MBBI, NewMIs[0]);
+ return NewMIs[0];
+}
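In instruction terms, the split performed above looks like this (hand-written illustration, not compiler output):

  pre-indexed:   ldr r0, [r1, #8]!   ->   add r1, r1, #8
                                          ldr r0, [r1]

  post-indexed:  ldr r0, [r1], #8    ->   ldr r0, [r1]
                                          add r1, r1, #8

The AddrMode2 case gives up when the offset does not re-encode as a so_imm for the ADD/SUB, since the conversion would then cost more than one extra instruction; AddrMode3 offsets are only 8 bits, so they always fit.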
+
+// Branch analysis.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(0).getMBB();
+ Cond.push_back(LastInst->getOperand(1));
+ Cond.push_back(LastInst->getOperand(2));
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(1));
+ Cond.push_back(SecondLastInst->getOperand(2));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // ...likewise if it ends with a branch table followed by an unconditional
+ // branch. The branch folder can create these, and we must get rid of them for
+ // correctness of Thumb constant islands.
+ if (isJumpTableBranchOpcode(SecondLastOpc) &&
+ isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
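AnalyzeBranch follows the generic TargetInstrInfo contract: return false when the terminators were understood (with TBB, FBB and Cond filled in), true to tell the caller to leave the block alone. A minimal sketch of how a client reads the results; TII and MBB are assumed to be in scope, and on ARM a non-empty Cond holds the two operands pushed above (condition-code immediate plus CPSR register):

  MachineBasicBlock *TBB = 0, *FBB = 0;
  SmallVector<MachineOperand, 4> Cond;
  if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
    // Cond.empty() && !TBB  : no terminators, block falls through
    // Cond.empty() && TBB   : unconditional branch to TBB
    // !Cond.empty() && !FBB : conditional branch to TBB, else fall through
    // !Cond.empty() && FBB  : conditional branch to TBB, unconditional to FBB
  }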
+
+
+unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (!isUncondBranchOpcode(I->getOpcode()) &&
+ !isCondBranchOpcode(I->getOpcode()))
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (!isCondBranchOpcode(I->getOpcode()))
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc argument
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+ int BOpc = !AFI->isThumbFunction()
+ ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+ int BccOpc = !AFI->isThumbFunction()
+ ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "ARM branch conditions have two components!");
+
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch?
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
+ else
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ return 1;
+ }
+
+ // Two-way conditional branch.
+ BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+ .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+ BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
+ return 2;
+}
+
+bool ARMBaseInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+ Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+ return false;
+}
+
+bool ARMBaseInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const {
+ unsigned Opc = MI->getOpcode();
+ if (isUncondBranchOpcode(Opc)) {
+ MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
+ MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
+ MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
+ return true;
+ }
+
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx != -1) {
+ MachineOperand &PMO = MI->getOperand(PIdx);
+ PMO.setImm(Pred[0].getImm());
+ MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+ return true;
+ }
+ return false;
+}
+
+bool ARMBaseInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const {
+ if (Pred1.size() > 2 || Pred2.size() > 2)
+ return false;
+
+ ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+ ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+ if (CC1 == CC2)
+ return true;
+
+ switch (CC1) {
+ default:
+ return false;
+ case ARMCC::AL:
+ return true;
+ case ARMCC::HS:
+ return CC2 == ARMCC::HI;
+ case ARMCC::LS:
+ return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+ case ARMCC::GE:
+ return CC2 == ARMCC::GT;
+ case ARMCC::LE:
+ return CC2 == ARMCC::LT;
+ }
+}
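The switch above encodes one-way implications between ARM condition codes. Worked through for one entry: HI is "unsigned greater" (C==1 and Z==0) while HS is "unsigned greater or equal" (C==1), so every flags state that satisfies HI also satisfies HS; an instruction predicated on HS can therefore stand in for one predicated on HI, but not the reverse. AL (always) subsumes everything, which is why it returns true unconditionally.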
+
+bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const {
+ // FIXME: This confuses implicit_def with optional CPSR def.
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
+ return false;
+
+ bool Found = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == ARM::CPSR) {
+ Pred.push_back(MO);
+ Found = true;
+ }
+ }
+
+ return Found;
+}
+
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) DISABLE_INLINE;
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+ unsigned JTI) {
+ return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const MachineBasicBlock &MBB = *MI->getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+ // Basic size info comes from the TSFlags field.
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned TSFlags = TID.TSFlags;
+
+ unsigned Opc = MI->getOpcode();
+ switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+ default: {
+ // If this machine instr is an inline asm, measure it.
+ if (MI->getOpcode() == ARM::INLINEASM)
+ return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+ if (MI->isLabel())
+ return 0;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown or unset size field for instr!");
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::KILL:
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ return 0;
+ }
+ break;
+ }
+ case ARMII::Size8Bytes: return 8; // ARM instruction x 2.
+ case ARMII::Size4Bytes: return 4; // ARM / Thumb2 instruction.
+ case ARMII::Size2Bytes: return 2; // Thumb1 instruction.
+ case ARMII::SizeSpecial: {
+ switch (Opc) {
+ case ARM::CONSTPOOL_ENTRY:
+ // If this machine instr is a constant pool entry, its size is recorded as
+ // operand #2.
+ return MI->getOperand(2).getImm();
+ case ARM::Int_eh_sjlj_setjmp:
+ return 24;
+ case ARM::t2Int_eh_sjlj_setjmp:
+ return 20;
+ case ARM::BR_JTr:
+ case ARM::BR_JTm:
+ case ARM::BR_JTadd:
+ case ARM::tBR_JTr:
+ case ARM::t2BR_JT:
+ case ARM::t2TBB:
+ case ARM::t2TBH: {
+ // These are jumptable branches, i.e. a branch followed by an inlined
+ // jumptable. The size is 4 + 4 * number of entries. For TBB, each
+ // entry is one byte; TBH two byte each.
+ unsigned EntrySize = (Opc == ARM::t2TBB)
+ ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4);
+ unsigned NumOps = TID.getNumOperands();
+ MachineOperand JTOP =
+ MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
+ unsigned JTI = JTOP.getIndex();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ assert(JTI < JT.size());
+ // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+ // aligned. The assembler / linker may add 2 byte padding just before
+ // the JT entries. The size does not include this padding; the
+ // constant islands pass does separate bookkeeping for it.
+ // FIXME: If we know the size of the function is less than (1 << 16) *2
+ // bytes, we can use 16-bit entries instead. Then there won't be an
+ // alignment issue.
+ unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
+ unsigned NumEntries = getNumJTEntries(JT, JTI);
+ if (Opc == ARM::t2TBB && (NumEntries & 1))
+ // Make sure the instruction that follows TBB is 2-byte aligned.
+ // FIXME: Constant island pass should insert an "ALIGN" instruction
+ // instead.
+ ++NumEntries;
+ return NumEntries * EntrySize + InstSize;
+ }
+ default:
+ // Otherwise, pseudo-instruction sizes are zero.
+ return 0;
+ }
+ }
+ }
+ return 0; // Not reached
+}
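The jump-table arm of the size computation, worked through with illustrative numbers: a t2TBB with 5 entries has EntrySize = 1 and InstSize = 4, and the odd entry count is rounded up to 6 so the instruction after the table stays 2-byte aligned, giving 6 * 1 + 4 = 10 bytes. A 5-entry ARM BR_JTr is 5 * 4 + 4 = 24 bytes, since its entries are words and the branch itself is a 4-byte instruction.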
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool
+ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
+ SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+ switch (MI.getOpcode()) {
+ default: break;
+ case ARM::FCPYS:
+ case ARM::FCPYD:
+ case ARM::VMOVD:
+ case ARM::VMOVQ: {
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ case ARM::MOVr:
+ case ARM::tMOVr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2gpr:
+ case ARM::t2MOVr: {
+ assert(MI.getDesc().getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "Invalid ARM MOV instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::LDR:
+ case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame.
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::t2LDRi12:
+ case ARM::tRestore:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FLDD:
+ case ARM::FLDS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case ARM::STR:
+ case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isImm() &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::t2STRi12:
+ case ARM::tSpill:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ case ARM::FSTD:
+ case ARM::FSTS:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+bool
+ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ if (DestRC != SrcRC) {
+ // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies
+ // Allow QPR / QPR_VFP2 cross-class copies
+ if (DestRC == ARM::DPRRegisterClass) {
+ if (SrcRC == ARM::DPR_VFP2RegisterClass ||
+ SrcRC == ARM::DPR_8RegisterClass) {
+ } else
+ return false;
+ } else if (DestRC == ARM::DPR_VFP2RegisterClass) {
+ if (SrcRC == ARM::DPRRegisterClass ||
+ SrcRC == ARM::DPR_8RegisterClass) {
+ } else
+ return false;
+ } else if (DestRC == ARM::DPR_8RegisterClass) {
+ if (SrcRC == ARM::DPRRegisterClass ||
+ SrcRC == ARM::DPR_VFP2RegisterClass) {
+ } else
+ return false;
+ } else if ((DestRC == ARM::QPRRegisterClass &&
+ SrcRC == ARM::QPR_VFP2RegisterClass) ||
+ (DestRC == ARM::QPR_VFP2RegisterClass &&
+ SrcRC == ARM::QPRRegisterClass)) {
+ } else
+ return false;
+ }
+
+ if (DestRC == ARM::GPRRegisterClass) {
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr),
+ DestReg).addReg(SrcReg)));
+ } else if (DestRC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
+ .addReg(SrcReg));
+ } else if ((DestRC == ARM::DPRRegisterClass) ||
+ (DestRC == ARM::DPR_VFP2RegisterClass) ||
+ (DestRC == ARM::DPR_8RegisterClass)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
+ .addReg(SrcReg));
+ } else if (DestRC == ARM::QPRRegisterClass ||
+ DestRC == ARM::QPR_VFP2RegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ MachineMemOperand::MOStore, 0,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::DPRRegisterClass ||
+ RC == ARM::DPR_VFP2RegisterClass ||
+ RC == ARM::DPR_8RegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else {
+ assert((RC == ARM::QPRRegisterClass ||
+ RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!");
+ // FIXME: Neon instructions should support predicates
+ BuildMI(MBB, I, DL, get(ARM::VSTRQ)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ }
+}
+
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ MachineMemOperand::MOLoad, 0,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
+ .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::DPRRegisterClass ||
+ RC == ARM::DPR_VFP2RegisterClass ||
+ RC == ARM::DPR_8RegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (RC == ARM::SPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg)
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else {
+ assert((RC == ARM::QPRRegisterClass ||
+ RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!");
+ // FIXME: Neon instructions should support predicates
+ BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg).addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+ }
+}
+
+MachineInstr *ARMBaseInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops, int FI) const {
+ if (Ops.size() != 1) return NULL;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ MachineInstr *NewMI = NULL;
+ if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+ // If it is updating CPSR, then it cannot be folded.
+ if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead())
+ return NULL;
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ if (Opc == ARM::MOVr)
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ else // ARM::t2MOVr
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ if (Opc == ARM::MOVr)
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+ else // ARM::t2MOVr
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ } else if (Opc == ARM::tMOVgpr2gpr ||
+ Opc == ARM::tMOVtgpr2gpr ||
+ Opc == ARM::tMOVgpr2tgpr) {
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+ }
+ } else if (Opc == ARM::FCPYS) {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI)
+ .addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ }
+ else if (Opc == ARM::FCPYD) {
+ unsigned Pred = MI->getOperand(2).getImm();
+ unsigned PredReg = MI->getOperand(3).getReg();
+ if (OpNum == 0) { // move -> store
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool isKill = MI->getOperand(1).isKill();
+ bool isUndef = MI->getOperand(1).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD))
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ } else { // move -> load
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isUndef = MI->getOperand(0).isUndef();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(isDead) |
+ getUndefRegState(isUndef))
+ .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+ }
+ }
+
+ return NewMI;
+}
+
+MachineInstr*
+ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const {
+ // FIXME
+ return 0;
+}
+
+bool
+ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ if (Ops.size() != 1) return false;
+
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+ // If it is updating CPSR, then it cannot be folded.
+ return MI->getOperand(4).getReg() != ARM::CPSR ||
+ MI->getOperand(4).isDead();
+ } else if (Opc == ARM::tMOVgpr2gpr ||
+ Opc == ARM::tMOVtgpr2gpr ||
+ Opc == ARM::tMOVgpr2tgpr) {
+ return true;
+ } else if (Opc == ARM::FCPYS || Opc == ARM::FCPYD) {
+ return true;
+ } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVQ) {
+ return false; // FIXME
+ }
+
+ return false;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+ int PIdx = MI->findFirstPredOperandIdx();
+ if (PIdx == -1) {
+ PredReg = 0;
+ return ARMCC::AL;
+ }
+
+ PredReg = MI->getOperand(PIdx+1).getReg();
+ return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+
+int llvm::getMatchingCondBranchOpcode(int Opc) {
+ if (Opc == ARM::B)
+ return ARM::Bcc;
+ else if (Opc == ARM::tB)
+ return ARM::tBcc;
+ else if (Opc == ARM::t2B)
+ return ARM::t2Bcc;
+
+ llvm_unreachable("Unknown unconditional branch opcode!");
+ return 0;
+}
+
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ if (isSub) NumBytes = -NumBytes;
+
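+  // Illustrative example: NumBytes = 0x1004 cannot be encoded as a single
+  // so_imm (bits 2 and 12 do not fit in one rotated 8-bit field), so the
+  // loop below emits two instructions, one adding #0x4 and one adding
+  // #0x1000.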
+ while (NumBytes) {
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+ unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+ assert(ThisVal && "Didn't extract field correctly");
+
+ // We will handle these bits from offset, clear them.
+ NumBytes &= ~ThisVal;
+
+ assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+ // Build the new ADD / SUB.
+ unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ BaseReg = DestReg;
+ }
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) {
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrMode2.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrMode2;
+
+ if (Opcode == ARM::ADDri) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::MOVr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(FrameRegIdx+1);
+ Offset = 0;
+ return true;
+ } else if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(ARM::SUBri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getSOImmVal(Offset) != -1) {
+ // Replace the FrameIndex with sp / fp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+
+    // Otherwise, pull as much of the immediate into this ADDri/SUBri
+ // as possible.
+ unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+ // We will handle these bits from offset, clear them.
+ Offset &= ~ThisImmVal;
+
+ // Get the properly encoded SOImmVal field.
+ assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+ unsigned ImmIdx = 0;
+ int InstrOffs = 0;
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ switch (AddrMode) {
+ case ARMII::AddrMode2: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 12;
+ break;
+ }
+ case ARMII::AddrMode3: {
+ ImmIdx = FrameRegIdx+2;
+ InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ break;
+ }
+ case ARMII::AddrMode4:
+ // Can't fold any offset even if it's zero.
+ return false;
+ case ARMII::AddrMode5: {
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+ if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported addressing mode!");
+ break;
+ }
+
+ Offset += InstrOffs * Scale;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+
+ // Attempt to fold address comp. if opcode has offset bits
+ if (NumBits > 0) {
+ // Common case: small offset, fits into instruction.
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
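+      // Illustrative bounds: AddrMode2 has NumBits = 12 and Scale = 1, so
+      // offsets up to 4095 fold directly; AddrMode5 has NumBits = 8 and
+      // Scale = 4, so word-aligned offsets up to 1020 fold directly.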
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub)
+ ImmedOffset |= 1 << NumBits;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
+ }
+ }
+
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
+}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 0000000..a13155b
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,333 @@
+//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINSTRUCTIONINFO_H
+#define ARMBASEINSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction Flags.
+
+ //===------------------------------------------------------------------===//
+ // This four-bit field describes the addressing mode used.
+
+ AddrModeMask = 0xf,
+ AddrModeNone = 0,
+ AddrMode1 = 1,
+ AddrMode2 = 2,
+ AddrMode3 = 3,
+ AddrMode4 = 4,
+ AddrMode5 = 5,
+ AddrMode6 = 6,
+ AddrModeT1_1 = 7,
+ AddrModeT1_2 = 8,
+ AddrModeT1_4 = 9,
+ AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data
+ AddrModeT2_i12 = 11,
+ AddrModeT2_i8 = 12,
+ AddrModeT2_so = 13,
+ AddrModeT2_pc = 14, // +/- i12 for pc relative data
+ AddrModeT2_i8s4 = 15, // i8 * 4
+
+ // Size* - Flags to keep track of the size of an instruction.
+ SizeShift = 4,
+ SizeMask = 7 << SizeShift,
+ SizeSpecial = 1, // 0 byte pseudo or special case.
+ Size8Bytes = 2,
+ Size4Bytes = 3,
+ Size2Bytes = 4,
+
+    // IndexMode - Unindexed, pre-indexed, or post-indexed. Only valid for
+    // load and store ops.
+ IndexModeShift = 7,
+ IndexModeMask = 3 << IndexModeShift,
+ IndexModePre = 1,
+ IndexModePost = 2,
+
+ //===------------------------------------------------------------------===//
+ // Instruction encoding formats.
+ //
+ FormShift = 9,
+ FormMask = 0x3f << FormShift,
+
+ // Pseudo instructions
+ Pseudo = 0 << FormShift,
+
+ // Multiply instructions
+ MulFrm = 1 << FormShift,
+
+ // Branch instructions
+ BrFrm = 2 << FormShift,
+ BrMiscFrm = 3 << FormShift,
+
+ // Data Processing instructions
+ DPFrm = 4 << FormShift,
+ DPSoRegFrm = 5 << FormShift,
+
+ // Load and Store
+ LdFrm = 6 << FormShift,
+ StFrm = 7 << FormShift,
+ LdMiscFrm = 8 << FormShift,
+ StMiscFrm = 9 << FormShift,
+ LdStMulFrm = 10 << FormShift,
+
+ // Miscellaneous arithmetic instructions
+ ArithMiscFrm = 11 << FormShift,
+
+ // Extend instructions
+ ExtFrm = 12 << FormShift,
+
+ // VFP formats
+ VFPUnaryFrm = 13 << FormShift,
+ VFPBinaryFrm = 14 << FormShift,
+ VFPConv1Frm = 15 << FormShift,
+ VFPConv2Frm = 16 << FormShift,
+ VFPConv3Frm = 17 << FormShift,
+ VFPConv4Frm = 18 << FormShift,
+ VFPConv5Frm = 19 << FormShift,
+ VFPLdStFrm = 20 << FormShift,
+ VFPLdStMulFrm = 21 << FormShift,
+ VFPMiscFrm = 22 << FormShift,
+
+ // Thumb format
+ ThumbFrm = 23 << FormShift,
+
+ // NEON format
+ NEONFrm = 24 << FormShift,
+ NEONGetLnFrm = 25 << FormShift,
+ NEONSetLnFrm = 26 << FormShift,
+ NEONDupFrm = 27 << FormShift,
+
+ //===------------------------------------------------------------------===//
+ // Misc flags.
+
+ // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+ // it doesn't have a Rn operand.
+ UnaryDP = 1 << 15,
+
+ // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+ // a 16-bit Thumb instruction if certain conditions are met.
+ Xform16Bit = 1 << 16,
+
+ //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set fields while generating
+    // machine instructions.
+ M_BitShift = 5,
+ ShiftImmShift = 5,
+ ShiftShift = 7,
+ N_BitShift = 7,
+ ImmHiShift = 8,
+ SoRotImmShift = 8,
+ RegRsShift = 8,
+ ExtRotImmShift = 10,
+ RegRdLoShift = 12,
+ RegRdShift = 12,
+ RegRdHiShift = 16,
+ RegRnShift = 16,
+ S_BitShift = 20,
+ W_BitShift = 21,
+ AM3_I_BitShift = 22,
+ D_BitShift = 22,
+ U_BitShift = 23,
+ P_BitShift = 24,
+ I_BitShift = 25,
+ CondShift = 28
+ };
+}
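+
+// A minimal sketch (illustrative; the size decode assumes the Size* values
+// are stored shifted into the SizeMask bits) of how these TSFlags fields are
+// decoded from a TargetInstrDesc:
+//   unsigned AddrMode = Desc.TSFlags & ARMII::AddrModeMask;
+//   bool IsBranch = (Desc.TSFlags & ARMII::FormMask) == ARMII::BrFrm;
+//   unsigned Size = (Desc.TSFlags & ARMII::SizeMask) >> ARMII::SizeShift;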
+
+class ARMBaseInstrInfo : public TargetInstrInfoImpl {
+protected:
+  // Can only be subclassed.
+ explicit ARMBaseInstrInfo();
+public:
+  // Return the non-pre/post-incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+  virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
+
+ // Return true if the block does not fall through.
+  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const = 0;
+
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+
+ // Branch analysis.
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ // Predication support.
+ bool isPredicated(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+ }
+
+ ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+ int PIdx = MI->findFirstPredOperandIdx();
+ return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+ : ARMCC::AL;
+ }
+
+ virtual
+ bool PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const;
+
+ virtual
+ bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+ const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+ virtual bool DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const;
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual bool canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+ return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
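+
+// For example (mirroring its use in ARMBaseInstrInfo.cpp), wrapping
+//   BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg).addReg(SrcReg)
+// in AddDefaultPred appends the 'always' condition (ARMCC::AL) and a null
+// condition-code register operand.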
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+ bool isDead = false) {
+ return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+ return MIB.addReg(0);
+}
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+ return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+ return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+ return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+ Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
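+// Typical use (illustrative):
+//   unsigned PredReg = 0;
+//   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);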
+
+int getMatchingCondBranchOpcode(int Opc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII);
+
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII);
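+// Illustrative use, mirroring eliminateFrameIndex in ARMBaseRegisterInfo.cpp:
+// when either rewrite returns false, the caller materializes
+// FrameReg + Offset into a scratch register and rewrites the frame-index
+// operand to use it.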
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 0000000..42ef183
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,1360 @@
+//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
+ bool *isSPVFP) {
+ if (isSPVFP)
+ *isSPVFP = false;
+
+ using namespace ARM;
+ switch (RegEnum) {
+ default:
+ llvm_unreachable("Unknown ARM register!");
+ case R0: case D0: case Q0: return 0;
+ case R1: case D1: case Q1: return 1;
+ case R2: case D2: case Q2: return 2;
+ case R3: case D3: case Q3: return 3;
+ case R4: case D4: case Q4: return 4;
+ case R5: case D5: case Q5: return 5;
+ case R6: case D6: case Q6: return 6;
+ case R7: case D7: case Q7: return 7;
+ case R8: case D8: case Q8: return 8;
+ case R9: case D9: case Q9: return 9;
+ case R10: case D10: case Q10: return 10;
+ case R11: case D11: case Q11: return 11;
+ case R12: case D12: case Q12: return 12;
+ case SP: case D13: case Q13: return 13;
+ case LR: case D14: case Q14: return 14;
+ case PC: case D15: case Q15: return 15;
+
+ case D16: return 16;
+ case D17: return 17;
+ case D18: return 18;
+ case D19: return 19;
+ case D20: return 20;
+ case D21: return 21;
+ case D22: return 22;
+ case D23: return 23;
+ case D24: return 24;
+ case D25: return 25;
+  case D26: return 26;
+ case D27: return 27;
+ case D28: return 28;
+ case D29: return 29;
+ case D30: return 30;
+ case D31: return 31;
+
+ case S0: case S1: case S2: case S3:
+ case S4: case S5: case S6: case S7:
+ case S8: case S9: case S10: case S11:
+ case S12: case S13: case S14: case S15:
+ case S16: case S17: case S18: case S19:
+ case S20: case S21: case S22: case S23:
+ case S24: case S25: case S26: case S27:
+ case S28: case S29: case S30: case S31: {
+ if (isSPVFP)
+ *isSPVFP = true;
+ switch (RegEnum) {
+ default: return 0; // Avoid compile time warning.
+ case S0: return 0;
+ case S1: return 1;
+ case S2: return 2;
+ case S3: return 3;
+ case S4: return 4;
+ case S5: return 5;
+ case S6: return 6;
+ case S7: return 7;
+ case S8: return 8;
+ case S9: return 9;
+ case S10: return 10;
+ case S11: return 11;
+ case S12: return 12;
+ case S13: return 13;
+ case S14: return 14;
+ case S15: return 15;
+ case S16: return 16;
+ case S17: return 17;
+ case S18: return 18;
+ case S19: return 19;
+ case S20: return 20;
+ case S21: return 21;
+ case S22: return 22;
+ case S23: return 23;
+ case S24: return 24;
+ case S25: return 25;
+ case S26: return 26;
+ case S27: return 27;
+ case S28: return 28;
+ case S29: return 29;
+ case S30: return 30;
+ case S31: return 31;
+ }
+ }
+ }
+}
+
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+ const ARMSubtarget &sti)
+ : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ TII(tii), STI(sti),
+ FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
+}
+
+const unsigned*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = {
+ ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+ ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+
+ static const unsigned DarwinCalleeSavedRegs[] = {
+ // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved
+ // register.
+ ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
+ ARM::R11, ARM::R10, ARM::R8,
+
+ ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+ ARM::D11, ARM::D10, ARM::D9, ARM::D8,
+ 0
+ };
+ return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const *
+ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+  static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ if (STI.isThumb1Only()) {
+ return STI.isTargetDarwin()
+ ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
+ }
+ return STI.isTargetDarwin()
+ ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
+}
+
+BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  // FIXME: avoid re-calculating this every time.
+ BitVector Reserved(getNumRegs());
+ Reserved.set(ARM::SP);
+ Reserved.set(ARM::PC);
+ if (STI.isTargetDarwin() || hasFP(MF))
+ Reserved.set(FramePtr);
+ // Some targets reserve R9.
+ if (STI.isR9Reserved())
+ Reserved.set(ARM::R9);
+ return Reserved;
+}
+
+bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
+ unsigned Reg) const {
+ switch (Reg) {
+ default: break;
+ case ARM::SP:
+ case ARM::PC:
+ return true;
+ case ARM::R7:
+ case ARM::R11:
+ if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
+ return true;
+ break;
+ case ARM::R9:
+ return STI.isR9Reserved();
+ }
+
+ return false;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
+ return ARM::GPRRegisterClass;
+}
+
+/// getAllocationOrder - Returns the register allocation order for a specified
+/// register class in the form of a pair of TargetRegisterClass iterators.
+std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
+ unsigned HintType, unsigned HintReg,
+ const MachineFunction &MF) const {
+ // Alternative register allocation orders when favoring even / odd registers
+ // of register pairs.
+
+ // No FP, R9 is available.
+ static const unsigned GPREven1[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+ ARM::R9, ARM::R11
+ };
+ static const unsigned GPROdd1[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+ ARM::R8, ARM::R10
+ };
+
+ // FP is R7, R9 is available.
+ static const unsigned GPREven2[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6,
+ ARM::R9, ARM::R11
+ };
+ static const unsigned GPROdd2[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+ ARM::R8, ARM::R10
+ };
+
+ // FP is R11, R9 is available.
+ static const unsigned GPREven3[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+ ARM::R9
+ };
+ static const unsigned GPROdd3[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9,
+ ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7,
+ ARM::R8
+ };
+
+ // No FP, R9 is not available.
+ static const unsigned GPREven4[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8,
+ ARM::R11
+ };
+ static const unsigned GPROdd4[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R10
+ };
+
+ // FP is R7, R9 is not available.
+ static const unsigned GPREven5[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R10,
+ ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8,
+ ARM::R11
+ };
+ static const unsigned GPROdd5[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R11,
+ ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+ ARM::R10
+ };
+
+ // FP is R11, R9 is not available.
+ static const unsigned GPREven6[] = {
+ ARM::R0, ARM::R2, ARM::R4, ARM::R6,
+ ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8
+ };
+ static const unsigned GPROdd6[] = {
+ ARM::R1, ARM::R3, ARM::R5, ARM::R7,
+ ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8
+ };
+
+
+ if (HintType == ARMRI::RegPairEven) {
+ if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
+ // It's no longer possible to fulfill this hint. Return the default
+ // allocation order.
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+
+ if (!STI.isTargetDarwin() && !hasFP(MF)) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven1,
+ GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven4,
+ GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
+ } else if (FramePtr == ARM::R7) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven2,
+ GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven5,
+ GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
+ } else { // FramePtr == ARM::R11
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPREven3,
+ GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPREven6,
+ GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
+ }
+ } else if (HintType == ARMRI::RegPairOdd) {
+ if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
+ // It's no longer possible to fulfill this hint. Return the default
+ // allocation order.
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+
+ if (!STI.isTargetDarwin() && !hasFP(MF)) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd1,
+ GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd4,
+ GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
+ } else if (FramePtr == ARM::R7) {
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd2,
+ GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd5,
+ GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
+ } else { // FramePtr == ARM::R11
+ if (!STI.isR9Reserved())
+ return std::make_pair(GPROdd3,
+ GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
+ else
+ return std::make_pair(GPROdd6,
+ GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
+ }
+ }
+ return std::make_pair(RC->allocation_order_begin(MF),
+ RC->allocation_order_end(MF));
+}
+
+/// ResolveRegAllocHint - Resolves the specified register allocation hint
+/// to a physical register. Returns the physical register if it is successful.
+unsigned
+ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg,
+ const MachineFunction &MF) const {
+ if (Reg == 0 || !isPhysicalRegister(Reg))
+ return 0;
+ if (Type == 0)
+ return Reg;
+ else if (Type == (unsigned)ARMRI::RegPairOdd)
+ // Odd register.
+ return getRegisterPairOdd(Reg, MF);
+ else if (Type == (unsigned)ARMRI::RegPairEven)
+ // Even register.
+ return getRegisterPairEven(Reg, MF);
+ return 0;
+}
+
+void
+ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+ MachineFunction &MF) const {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+ if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+ Hint.first == (unsigned)ARMRI::RegPairEven) &&
+ Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+    // If 'Reg' is one of an even / odd register pair and it has now been
+    // changed (e.g. coalesced) into a different register, the allocation
+    // hint of the other register of the pair must be updated to reflect
+    // the relationship change.
+ unsigned OtherReg = Hint.second;
+ Hint = MRI->getRegAllocationHint(OtherReg);
+ if (Hint.second == Reg)
+ // Make sure the pair has not already divorced.
+ MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+ }
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+///
+bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return (NoFramePointerElim ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
+}
+
+bool ARMBaseRegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ if (NoFramePointerElim && MFI->hasCalls())
+ return true;
+ return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// estimateStackSize - Estimate and return the size of the frame.
+static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ int Offset = 0;
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -FFI->getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ Offset += FFI->getObjectSize(i);
+ unsigned Align = FFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
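+    // (e.g. Offset = 13 with Align = 8 rounds up to 16)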
+ Offset = (Offset+Align-1)/Align*Align;
+ }
+ return (unsigned)Offset;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their later expansion.
+unsigned
+ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const {
+ unsigned Limit = (1 << 12) - 1;
+ for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ if (!I->getOperand(i).isFI()) continue;
+
+ const TargetInstrDesc &Desc = TII.get(I->getOpcode());
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ if (AddrMode == ARMII::AddrMode3 ||
+ AddrMode == ARMII::AddrModeT2_i8)
+ return (1 << 8) - 1;
+
+ if (AddrMode == ARMII::AddrMode5 ||
+ AddrMode == ARMII::AddrModeT2_i8s4)
+ Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+
+ if (AddrMode == ARMII::AddrModeT2_i12 && hasFP(MF))
+ // When the stack offset is negative, we will end up using
+ // the i8 instructions instead.
+ return (1 << 8) - 1;
+ break; // At most one FI per instruction
+ }
+ }
+ }
+
+ return Limit;
+}
+
+void
+ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage of the eliminateFrameIndex machinery. This also ensures it
+ // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+ // to combine multiple loads / stores.
+ bool CanEliminateFrame = true;
+ bool CS1Spilled = false;
+ bool LRSpilled = false;
+ unsigned NumGPRSpills = 0;
+ SmallVector<unsigned, 4> UnspilledCS1GPRs;
+ SmallVector<unsigned, 4> UnspilledCS2GPRs;
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ // Don't spill FP if the frame can be eliminated. This is determined
+ // by scanning the callee-save registers to see if any is used.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ bool Spilled = false;
+ if (MF.getRegInfo().isPhysRegUsed(Reg)) {
+ AFI->setCSRegisterIsSpilled(Reg);
+ Spilled = true;
+ CanEliminateFrame = false;
+ } else {
+ // Check alias registers too.
+ for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
+ if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
+ Spilled = true;
+ CanEliminateFrame = false;
+ }
+ }
+ }
+
+ if (CSRegClasses[i] == ARM::GPRRegisterClass ||
+ CSRegClasses[i] == ARM::tGPRRegisterClass) {
+ if (Spilled) {
+ NumGPRSpills++;
+
+ if (!STI.isTargetDarwin()) {
+ if (Reg == ARM::LR)
+ LRSpilled = true;
+ CS1Spilled = true;
+ continue;
+ }
+
+        // Keep track of whether LR and any of R4, R5, R6, and R7 are spilled.
+ switch (Reg) {
+ case ARM::LR:
+ LRSpilled = true;
+ // Fallthrough
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ CS1Spilled = true;
+ break;
+ default:
+ break;
+ }
+ } else {
+ if (!STI.isTargetDarwin()) {
+ UnspilledCS1GPRs.push_back(Reg);
+ continue;
+ }
+
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ UnspilledCS1GPRs.push_back(Reg);
+ break;
+ default:
+ UnspilledCS2GPRs.push_back(Reg);
+ break;
+ }
+ }
+ }
+ }
+
+ bool ForceLRSpill = false;
+ if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+ unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
+    // Force LR to be spilled if the Thumb function size is > 2048. This
+    // enables the use of BL to implement a far jump. If it turns out that
+    // it's not needed, the branch fix-up path will undo it.
+ if (FnSize >= (1 << 11)) {
+ CanEliminateFrame = false;
+ ForceLRSpill = true;
+ }
+ }
+
+ bool ExtraCSSpill = false;
+ if (!CanEliminateFrame || cannotEliminateFrame(MF)) {
+ AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled,
+    // spill LR as well so we can fold BX_RET into the register restore (LDM).
+ if (!LRSpilled && CS1Spilled) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ NumGPRSpills++;
+ UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+ UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+ ForceLRSpill = false;
+ ExtraCSSpill = true;
+ }
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ MF.getRegInfo().setPhysRegUsed(FramePtr);
+ NumGPRSpills++;
+ }
+
+    // If the stack and doubles are 8-byte aligned and we are spilling an odd
+    // number of GPRs, spill one extra callee-saved GPR so we won't have to
+    // pad between the integer and double callee-save areas.
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+ if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+ if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+ for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+ unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill a high register if the function is Thumb1.
+ if (!AFI->isThumb1OnlyFunction() ||
+ isARMLowRegister(Reg) || Reg == ARM::LR) {
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ break;
+ }
+ }
+ } else if (!UnspilledCS2GPRs.empty() &&
+ !AFI->isThumb1OnlyFunction()) {
+ unsigned Reg = UnspilledCS2GPRs.front();
+ MF.getRegInfo().setPhysRegUsed(Reg);
+ AFI->setCSRegisterIsSpilled(Reg);
+ if (!isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ }
+ }
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 also needs a spill slot for stack pointer
+    // adjustments, even when the frame itself is small.
+ if (RS && !ExtraCSSpill) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ // If any of the stack slot references may be out of range of an
+ // immediate offset, make sure a register (or a spill slot) is
+ // available for the register scavenger. Note that if we're indexing
+ // off the frame pointer, the effective stack size is 4 bytes larger
+ // since the FP points to the stack slot of the previous FP.
+ if (estimateStackSize(MF, MFI) + (hasFP(MF) ? 4 : 0)
+ >= estimateRSStackSizeLimit(MF)) {
+ // If any non-reserved CS register isn't spilled, just spill one or two
+ // extra. That should take care of it!
+ unsigned NumExtras = TargetAlign / 4;
+ SmallVector<unsigned, 2> Extras;
+ while (NumExtras && !UnspilledCS1GPRs.empty()) {
+ unsigned Reg = UnspilledCS1GPRs.back();
+ UnspilledCS1GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ // For non-Thumb1 functions, also check for hi-reg CS registers
+ if (!AFI->isThumb1OnlyFunction()) {
+ while (NumExtras && !UnspilledCS2GPRs.empty()) {
+ unsigned Reg = UnspilledCS2GPRs.back();
+ UnspilledCS2GPRs.pop_back();
+ if (!isReservedReg(MF, Reg)) {
+ Extras.push_back(Reg);
+ NumExtras--;
+ }
+ }
+ }
+ if (Extras.size() && NumExtras == 0) {
+ for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+ MF.getRegInfo().setPhysRegUsed(Extras[i]);
+ AFI->setCSRegisterIsSpilled(Extras[i]);
+ }
+ } else if (!AFI->isThumb1OnlyFunction()) {
+ // note: Thumb1 functions spill to R12, not the stack.
+ // Reserve a slot closest to SP or frame pointer.
+ const TargetRegisterClass *RC = ARM::GPRRegisterClass;
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+ }
+ }
+ }
+
+ if (ForceLRSpill) {
+ MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ AFI->setCSRegisterIsSpilled(ARM::LR);
+ AFI->setLRIsSpilledForFarJump(true);
+ }
+}
+
+unsigned ARMBaseRegisterInfo::getRARegister() const {
+ return ARM::LR;
+}
+
+unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ if (STI.isTargetDarwin() || hasFP(MF))
+ return FramePtr;
+ return ARM::SP;
+}
+
+unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("What is the exception register");
+ return 0;
+}
+
+unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("What is the exception handler register");
+ return 0;
+}
+
+int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
+ const MachineFunction &MF) const {
+ switch (Reg) {
+ default: break;
+ // Return 0 if either register of the pair is a special register.
+ // So no R12, etc.
+ case ARM::R1:
+ return ARM::R0;
+ case ARM::R3:
+ // FIXME!
+ return STI.isThumb1Only() ? 0 : ARM::R2;
+ case ARM::R5:
+ return ARM::R4;
+ case ARM::R7:
+ return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6;
+ case ARM::R9:
+    return isReservedReg(MF, ARM::R9) ? 0 : ARM::R8;
+ case ARM::R11:
+ return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+
+ case ARM::S1:
+ return ARM::S0;
+ case ARM::S3:
+ return ARM::S2;
+ case ARM::S5:
+ return ARM::S4;
+ case ARM::S7:
+ return ARM::S6;
+ case ARM::S9:
+ return ARM::S8;
+ case ARM::S11:
+ return ARM::S10;
+ case ARM::S13:
+ return ARM::S12;
+ case ARM::S15:
+ return ARM::S14;
+ case ARM::S17:
+ return ARM::S16;
+ case ARM::S19:
+ return ARM::S18;
+ case ARM::S21:
+ return ARM::S20;
+ case ARM::S23:
+ return ARM::S22;
+ case ARM::S25:
+ return ARM::S24;
+ case ARM::S27:
+ return ARM::S26;
+ case ARM::S29:
+ return ARM::S28;
+ case ARM::S31:
+ return ARM::S30;
+
+ case ARM::D1:
+ return ARM::D0;
+ case ARM::D3:
+ return ARM::D2;
+ case ARM::D5:
+ return ARM::D4;
+ case ARM::D7:
+ return ARM::D6;
+ case ARM::D9:
+ return ARM::D8;
+ case ARM::D11:
+ return ARM::D10;
+ case ARM::D13:
+ return ARM::D12;
+ case ARM::D15:
+ return ARM::D14;
+ case ARM::D17:
+ return ARM::D16;
+ case ARM::D19:
+ return ARM::D18;
+ case ARM::D21:
+ return ARM::D20;
+ case ARM::D23:
+ return ARM::D22;
+ case ARM::D25:
+ return ARM::D24;
+ case ARM::D27:
+ return ARM::D26;
+ case ARM::D29:
+ return ARM::D28;
+ case ARM::D31:
+ return ARM::D30;
+ }
+
+ return 0;
+}
+
+unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
+ const MachineFunction &MF) const {
+ switch (Reg) {
+ default: break;
+ // Return 0 if either register of the pair is a special register.
+ // So no R12, etc.
+ case ARM::R0:
+ return ARM::R1;
+ case ARM::R2:
+ // FIXME!
+ return STI.isThumb1Only() ? 0 : ARM::R3;
+ case ARM::R4:
+ return ARM::R5;
+ case ARM::R6:
+ return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7;
+ case ARM::R8:
+    return isReservedReg(MF, ARM::R9) ? 0 : ARM::R9;
+ case ARM::R10:
+ return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+
+ case ARM::S0:
+ return ARM::S1;
+ case ARM::S2:
+ return ARM::S3;
+ case ARM::S4:
+ return ARM::S5;
+ case ARM::S6:
+ return ARM::S7;
+ case ARM::S8:
+ return ARM::S9;
+ case ARM::S10:
+ return ARM::S11;
+ case ARM::S12:
+ return ARM::S13;
+ case ARM::S14:
+ return ARM::S15;
+ case ARM::S16:
+ return ARM::S17;
+ case ARM::S18:
+ return ARM::S19;
+ case ARM::S20:
+ return ARM::S21;
+ case ARM::S22:
+ return ARM::S23;
+ case ARM::S24:
+ return ARM::S25;
+ case ARM::S26:
+ return ARM::S27;
+ case ARM::S28:
+ return ARM::S29;
+ case ARM::S30:
+ return ARM::S31;
+
+ case ARM::D0:
+ return ARM::D1;
+ case ARM::D2:
+ return ARM::D3;
+ case ARM::D4:
+ return ARM::D5;
+ case ARM::D6:
+ return ARM::D7;
+ case ARM::D8:
+ return ARM::D9;
+ case ARM::D10:
+ return ARM::D11;
+ case ARM::D12:
+ return ARM::D13;
+ case ARM::D14:
+ return ARM::D15;
+ case ARM::D16:
+ return ARM::D17;
+ case ARM::D18:
+ return ARM::D19;
+ case ARM::D20:
+ return ARM::D21;
+ case ARM::D22:
+ return ARM::D23;
+ case ARM::D24:
+ return ARM::D25;
+ case ARM::D26:
+ return ARM::D27;
+ case ARM::D28:
+ return ARM::D29;
+ case ARM::D30:
+ return ARM::D31;
+ }
+
+ return 0;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ARMBaseRegisterInfo::
+emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineConstantPool *ConstantPool = MF.getConstantPool();
+ Constant *C =
+ ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+}
+
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+// not required, we reserve argument space for call sites in the function
+// immediately on entry to the current function. This eliminates the need for
+// add/sub sp brackets around call sites. Returns true if the call frame is
+// included as part of the stack frame.
+bool ARMBaseRegisterInfo::
+hasReservedCallFrame(MachineFunction &MF) const {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ unsigned CFSize = FFI->getMaxCallFrameSize();
+ // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has only small immediate offsets for
+  // addressing the stack frame, so a large call frame can cause poor codegen
+  // and may even make it impossible to scavenge a register.
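+  // For reference, ((1 << 12) - 1) / 2 is 2047 bytes.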
+ if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
+ return false;
+
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static void
+emitSPUpdate(bool isARM,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl, const ARMBaseInstrInfo &TII,
+ int NumBytes,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+ if (isARM)
+ emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ Pred, PredReg, TII);
+ else
+ emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ Pred, PredReg, TII);
+}
+
+
+void ARMBaseRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ // If we have alloca, convert as follows:
+ // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+ // ADJCALLSTACKUP -> add, sp, sp, amount
+ MachineInstr *Old = I;
+ DebugLoc dl = Old->getDebugLoc();
+ unsigned Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
+
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateCallFramePseudoInstr does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+
+ // Replace the pseudo instruction with a new instruction...
+ unsigned Opc = Old->getOpcode();
+ ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
+ // FIXME: Thumb2 version of ADJCALLSTACKUP and ADJCALLSTACKDOWN?
+ if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+ // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+ unsigned PredReg = Old->getOperand(2).getReg();
+ emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg);
+ } else {
+ // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+ unsigned PredReg = Old->getOperand(3).getReg();
+ assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+ emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg);
+ }
+ }
+ }
+ MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' ARM register. If register scavenger
+/// is not being used, R12 is available. Otherwise, try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+static
+unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
+ ARMFunctionInfo *AFI) {
+ unsigned Reg = RS ? RS->FindUnusedReg(RC) : (unsigned) ARM::R12;
+ assert(!AFI->isThumb1OnlyFunction());
+ return Reg;
+}
+
+unsigned
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This eliminateFrameIndex does not support Thumb1!");
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ unsigned FrameReg = ARM::SP;
+ int FrameIndex = MI.getOperand(i).getIndex();
+ int Offset = MFI->getObjectOffset(FrameIndex) + MFI->getStackSize() + SPAdj;
+
+ if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea1Offset();
+ else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+ Offset -= AFI->getGPRCalleeSavedArea2Offset();
+ else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
+ Offset -= AFI->getDPRCalleeSavedAreaOffset();
+ else if (hasFP(MF) && AFI->hasStackFrame()) {
+ assert(SPAdj == 0 && "Unexpected stack offset!");
+    // Use the frame pointer to reference fixed objects unless this is a
+    // frameless function.
+ FrameReg = getFrameRegister(MF);
+ Offset -= AFI->getFramePtrSpillOffset();
+ }
+
+ // modify MI as necessary to handle as much of 'Offset' as possible
+ bool Done = false;
+ if (!AFI->isThumbFunction())
+ Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII);
+ }
+ if (Done)
+ return 0;
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above, handle the rest, providing a register that is
+ // SP+LargeImm.
+ assert((Offset ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4) &&
+ "This code isn't needed if offset already handled!");
+
+  // Materialize the full address in a scratch register: scratch = sp + offset.
+  // If the offset we have is too large to fit into the instruction, we need
+  // to form it with a series of ADDri's, taking 8-bit chunks out of 'Offset'.
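+  // For example (illustrative values), Offset = 0x404 is not encodable as a
+  // single rotated 8-bit immediate, so it would be emitted as two adds:
+  // add scratch, sp, #0x400 followed by add scratch, scratch, #0x4.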
+ unsigned ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
+ if (ScratchReg == 0)
+ // No register is "free". Scavenge a register.
+ ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+ int PIdx = MI.findFirstPredOperandIdx();
+ ARMCC::CondCodes Pred = (PIdx == -1)
+ ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+ unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+ if (Offset == 0)
+ // Must be addrmode4.
+ MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
+ else {
+ if (!AFI->isThumbFunction())
+ emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ else {
+ assert(AFI->isThumb2Function());
+ emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+ Offset, Pred, PredReg, TII);
+ }
+ MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+ }
+ return 0;
+}
+
+/// Move the iterator past the next run of callee-saved load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
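+/// For example, on Darwin a store of R8 falls in area 2, while on other
+/// targets it falls in area 1 together with R4-R7 and LR.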
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ int Opc1, int Opc2, unsigned Area,
+ const ARMSubtarget &STI) {
+ while (MBBI != MBB.end() &&
+ ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) &&
+ MBBI->getOperand(1).isFI()) {
+ if (Area != 0) {
+ bool Done = false;
+ unsigned Category = 0;
+ switch (MBBI->getOperand(0).getReg()) {
+ case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
+ case ARM::LR:
+ Category = 1;
+ break;
+ case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
+ Category = STI.isTargetDarwin() ? 2 : 1;
+ break;
+ case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
+ case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+ Category = 3;
+ break;
+ default:
+ Done = true;
+ break;
+ }
+ if (Done || Category != Area)
+ break;
+ }
+
+ ++MBBI;
+ }
+}
+
+void ARMBaseRegisterInfo::
+emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitPrologue does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ unsigned NumBytes = MFI->getStackSize();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ DebugLoc dl = (MBBI != MBB.end() ?
+ MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Determine the size of each callee-save spill area and record which frame
+  // index belongs to which area.
+ unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+ int FramePtrSpillFI = 0;
+
+ // Allocate the vararg register save area. This is not counted in NumBytes.
+ if (VARegSaveSize)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+ return;
+ }
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ int FI = CSI[i].getFrameIdx();
+ switch (Reg) {
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ break;
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ if (Reg == FramePtr)
+ FramePtrSpillFI = FI;
+ if (STI.isTargetDarwin()) {
+ AFI->addGPRCalleeSavedArea2Frame(FI);
+ GPRCS2Size += 4;
+ } else {
+ AFI->addGPRCalleeSavedArea1Frame(FI);
+ GPRCS1Size += 4;
+ }
+ break;
+ default:
+ AFI->addDPRCalleeSavedAreaFrame(FI);
+ DPRCSSize += 8;
+ }
+ }
+
+ // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size);
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI);
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For Darwin, FP is R7, which has now been stored in spill area 1.
+  // Otherwise (non-Darwin), all the callee-saved registers go into spill
+  // area 1, including the FP in R11. In either case, it is now safe to
+  // emit this assignment.
+ if (STI.isTargetDarwin() || hasFP(MF)) {
+ unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+ .addFrameIndex(FramePtrSpillFI).addImm(0);
+ AddDefaultCC(AddDefaultPred(MIB));
+ }
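+  // This resolves to something like "add r7, sp, #8" on Darwin (FP is R7)
+  // or "add r11, sp, #<off>" elsewhere; the offsets here are hypothetical,
+  // and the frame index operand is rewritten to a real offset later.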
+
+ // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size);
+
+ // Build the new SUBri to adjust SP for FP callee-save spill area.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize);
+
+ // Determine starting offsets of spill areas.
+ unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+ unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+ unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
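+  // For example (hypothetical sizes): NumBytes = 64, GPRCS1Size = 20,
+  // GPRCS2Size = 8 and DPRCSSize = 16 give DPRCSOffset = 20,
+  // GPRCS2Offset = 36 and GPRCS1Offset = 44.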
+ AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+ AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+ AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+ AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+ NumBytes = DPRCSOffset;
+ if (NumBytes) {
+    // Insert the SP adjustment for the locals after all the callee-save spills.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 0, 3, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+ }
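+  // The resulting frame layout, from higher to lower addresses, is roughly:
+  //   [ vararg save ][ GPR area 1 ][ GPR area 2 ][ DPR area ][ locals ] <- SP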
+
+ if (STI.isTargetELF() && hasFP(MF)) {
+ MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+ AFI->getFramePtrSpillOffset());
+ }
+
+ AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+ AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+ AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+
+static bool isCSRestore(MachineInstr *MI,
+ const ARMBaseInstrInfo &TII,
+ const unsigned *CSRegs) {
+ return ((MI->getOpcode() == (int)ARM::FLDD ||
+ MI->getOpcode() == (int)ARM::LDR ||
+ MI->getOpcode() == (int)ARM::t2LDRi12) &&
+ MI->getOperand(1).isFI() &&
+ isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
+}
+
+void ARMBaseRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ assert(MBBI->getDesc().isReturn() &&
+ "Can only insert epilog into returning blocks");
+ DebugLoc dl = MBBI->getDebugLoc();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ assert(!AFI->isThumb1OnlyFunction() &&
+ "This emitEpilogue does not suppor Thumb1!");
+ bool isARM = !AFI->isThumbFunction();
+
+ unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+ int NumBytes = (int)MFI->getStackSize();
+
+ if (!AFI->hasStackFrame()) {
+ if (NumBytes != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ } else {
+ // Unwind MBBI to point to first LDR / FLDD.
+ const unsigned *CSRegs = getCalleeSavedRegs();
+ if (MBBI != MBB.begin()) {
+ do
+ --MBBI;
+ while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+ if (!isCSRestore(MBBI, TII, CSRegs))
+ ++MBBI;
+ }
+
+ // Move SP to start of FP callee save spill area.
+ NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+ AFI->getGPRCalleeSavedArea2Size() +
+ AFI->getDPRCalleeSavedAreaSize());
+
+ // Darwin ABI requires FP to point to the stack slot that contains the
+ // previous FP.
+ bool HasFP = hasFP(MF);
+ if ((STI.isTargetDarwin() && NumBytes) || HasFP) {
+ NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+ // Reset SP based on frame pointer only if the stack frame extends beyond
+ // frame pointer stack slot or target is ELF and the function has FP.
+ if (HasFP ||
+ AFI->getGPRCalleeSavedArea2Size() ||
+ AFI->getDPRCalleeSavedAreaSize() ||
+ AFI->getDPRCalleeSavedAreaOffset()) {
+ if (NumBytes) {
+ if (isARM)
+ emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ else
+ emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+ ARMCC::AL, 0, TII);
+ } else {
+ // Thumb2 or ARM.
+ if (isARM)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+ .addReg(FramePtr)
+ .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ else
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
+ .addReg(FramePtr);
+ }
+ }
+ } else if (NumBytes)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+ // Move SP to start of integer callee save spill area 2.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 0, 3, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize());
+
+ // Move SP to start of integer callee save spill area 1.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size());
+
+    // Restore SP to its value upon entry to the function.
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size());
+ }
+
+ if (VARegSaveSize)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
+}
+
+#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 0000000..da703fb
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,148 @@
+//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEREGISTERINFO_H
+#define ARMBASEREGISTERINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "ARMGenRegisterInfo.h.inc"
+
+namespace llvm {
+ class ARMSubtarget;
+ class ARMBaseInstrInfo;
+ class Type;
+
+/// Register allocation hints.
+namespace ARMRI {
+ enum {
+ RegPairOdd = 1,
+ RegPairEven = 2
+ };
+}
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
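+/// For example, ARM::R5 is a low register while ARM::R8 and ARM::SP are not.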
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+ using namespace ARM;
+ switch (Reg) {
+ case R0: case R1: case R2: case R3:
+ case R4: case R5: case R6: case R7:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+ const ARMBaseInstrInfo &TII;
+ const ARMSubtarget &STI;
+
+ /// FramePtr - ARM physical register used as frame ptr.
+ unsigned FramePtr;
+
+ // Can be only subclassed.
+ explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+ const ARMSubtarget &STI);
+
+  // Return the opcode that implements 'Op', or 0 if there is no such opcode.
+ unsigned getOpcode(int Op) const;
+
+public:
+ /// getRegisterNumbering - Given the enum value for some register, e.g.
+ /// ARM::LR, return the number that it corresponds to (e.g. 14). It
+ /// also returns true in isSPVFP if the register is a single precision
+ /// VFP register.
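+  /// For example, getRegisterNumbering(ARM::LR) returns 14, and
+  /// getRegisterNumbering(ARM::S1, &IsSP) returns 1 with IsSP set to true.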
+ static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
+ std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+ getAllocationOrder(const TargetRegisterClass *RC,
+ unsigned HintType, unsigned HintReg,
+ const MachineFunction &MF) const;
+
+ unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
+ const MachineFunction &MF) const;
+
+ void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+ MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ bool isLowRegister(unsigned Reg) const;
+
+
+ /// emitLoadConstPool - Emits a load from constpool to materialize the
+ /// specified immediate.
+ virtual void emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
+
+ /// Code Generation virtual methods...
+ virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+ virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ virtual bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
+
+ virtual void emitPrologue(MachineFunction &MF) const;
+ virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+private:
+ unsigned estimateRSStackSizeLimit(MachineFunction &MF) const;
+
+ unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
+
+ unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 8a4c741..7161639 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -111,6 +111,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@@ -122,6 +123,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index f295761..6f1c624 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -26,14 +26,18 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#ifndef NDEBUG
#include <iomanip>
#endif
@@ -57,12 +61,18 @@ namespace {
ARMJITInfo *JTI;
const ARMInstrInfo *II;
const TargetData *TD;
+ const ARMSubtarget *Subtarget;
TargetMachine &TM;
CodeEmitter &MCE;
const std::vector<MachineConstantPoolEntry> *MCPEs;
const std::vector<MachineJumpTableEntry> *MJTEs;
bool IsPIC;
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
public:
static char ID;
explicit Emitter(TargetMachine &tm, CodeEmitter &mce)
@@ -160,7 +170,7 @@ namespace {
/// Routines that handle operands which add machine relocations which are
/// fixed up by the relocation stage.
void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
- bool NeedStub, intptr_t ACPV = 0);
+ bool NeedStub, bool Indirect, intptr_t ACPV = 0);
void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
void emitConstPoolAddress(unsigned CPI, unsigned Reloc);
void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc);
@@ -174,36 +184,39 @@ namespace {
/// createARMCodeEmitterPass - Return a pass that emits the collected ARM code
/// to the specified MCE object.
-namespace llvm {
-
-FunctionPass *createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
- MachineCodeEmitter &MCE) {
+FunctionPass *llvm::createARMCodeEmitterPass(ARMBaseTargetMachine &TM,
+ MachineCodeEmitter &MCE) {
return new Emitter<MachineCodeEmitter>(TM, MCE);
}
-FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
- JITCodeEmitter &JCE) {
+FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+ JITCodeEmitter &JCE) {
return new Emitter<JITCodeEmitter>(TM, JCE);
}
-
-} // end namespace llvm
+FunctionPass *llvm::createARMObjectCodeEmitterPass(ARMBaseTargetMachine &TM,
+ ObjectCodeEmitter &OCE) {
+ return new Emitter<ObjectCodeEmitter>(TM, OCE);
+}
template<class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
          MF.getTarget().getRelocationModel() == Reloc::Static) &&
"JIT relocation model must be set to static or default!");
+ JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo();
TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData();
- JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
+ Subtarget = &TM.getSubtarget<ARMSubtarget>();
MCPEs = &MF.getConstantPool()->getConstants();
MJTEs = &MF.getJumpTableInfo()->getJumpTables();
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
JTI->Initialize(MF, IsPIC);
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
do {
- DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ DEBUG(errs() << "JITTing function '"
+ << MF.getFunction()->getName() << "'\n");
MCE.startFunction(MF);
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
MCE.StartMachineBasicBlock(MBB);
for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
@@ -220,7 +233,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
template<class CodeEmitter>
unsigned Emitter<CodeEmitter>::getShiftOp(unsigned Imm) const {
switch (ARM_AM::getAM2ShiftOpc(Imm)) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::asr: return 2;
case ARM_AM::lsl: return 0;
case ARM_AM::lsr: return 1;
@@ -240,7 +253,7 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
else if (MO.isGlobal())
- emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true);
+ emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
else if (MO.isSymbol())
emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
else if (MO.isCPI()) {
@@ -254,8 +267,10 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
else if (MO.isMBB())
emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
else {
- cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << MO;
+#endif
+ llvm_unreachable(0);
}
return 0;
}
@@ -264,9 +279,14 @@ unsigned Emitter<CodeEmitter>::getMachineOpValue(const MachineInstr &MI,
///
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
- bool NeedStub, intptr_t ACPV) {
- MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
- GV, ACPV, NeedStub));
+ bool NeedStub, bool Indirect,
+ intptr_t ACPV) {
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ GV, ACPV, NeedStub)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, ACPV, NeedStub);
+ MCE.addRelocation(MR);
}
/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
@@ -294,7 +314,7 @@ void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI,
/// be emitted to the current location in the function, and allow it to be PC
/// relative.
template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex,
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTIndex,
unsigned Reloc) {
MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
Reloc, JTIndex, 0, true));
@@ -310,32 +330,28 @@ void Emitter<CodeEmitter>::emitMachineBasicBlock(MachineBasicBlock *BB,
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitWordLE(unsigned Binary) {
-#ifndef NDEBUG
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << Binary << std::dec << "\n";
-#endif
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Binary) << "\n");
MCE.emitWordLE(Binary);
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitDWordLE(uint64_t Binary) {
-#ifndef NDEBUG
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << (unsigned)Binary << std::dec << "\n";
- DOUT << " 0x" << std::hex << std::setw(8) << std::setfill('0')
- << (unsigned)(Binary >> 32) << std::dec << "\n";
-#endif
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Binary) << "\n");
MCE.emitDWordLE(Binary);
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) {
- DOUT << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI;
+ DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
NumEmitted++; // Keep track of the # of mi's emitted
switch (MI.getDesc().TSFlags & ARMII::FormMask) {
default: {
- assert(0 && "Unhandled instruction encoding format!");
+ llvm_unreachable("Unhandled instruction encoding format!");
break;
}
case ARMII::Pseudo:
@@ -393,6 +409,7 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI) {
emitMiscInstruction(MI);
break;
}
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
}
template<class CodeEmitter>
@@ -400,7 +417,7 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) {
unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index.
unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
-
+
// Remember the CONSTPOOL_ENTRY address for later relocation.
JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
@@ -410,55 +427,49 @@ void Emitter<CodeEmitter>::emitConstPoolInstruction(const MachineInstr &MI) {
ARMConstantPoolValue *ACPV =
static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
- DOUT << " ** ARM constant pool #" << CPI << " @ "
- << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n';
+ DEBUG(errs() << " ** ARM constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
GlobalValue *GV = ACPV->getGV();
if (GV) {
- assert(!ACPV->isStub() && "Don't know how to deal this yet!");
- if (ACPV->isNonLazyPointer())
- MCE.addRelocation(MachineRelocation::getIndirectSymbol(
- MCE.getCurrentPCOffset(), ARM::reloc_arm_machine_cp_entry, GV,
- (intptr_t)ACPV, false));
- else
- emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
- ACPV->isStub() || isa<Function>(GV), (intptr_t)ACPV);
+ Reloc::Model RelocM = TM.getRelocationModel();
+ emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+ isa<Function>(GV),
+ Subtarget->GVIsIndirectSymbol(GV, RelocM),
+ (intptr_t)ACPV);
} else {
- assert(!ACPV->isNonLazyPointer() && "Don't know how to deal this yet!");
emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
}
emitWordLE(0);
} else {
Constant *CV = MCPE.Val.ConstVal;
-#ifndef NDEBUG
- DOUT << " ** Constant pool #" << CPI << " @ "
- << (void*)MCE.getCurrentPCValue() << " ";
- if (const Function *F = dyn_cast<Function>(CV))
- DOUT << F->getName();
- else
- DOUT << *CV;
- DOUT << '\n';
-#endif
+ DEBUG({
+ errs() << " ** Constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " ";
+ if (const Function *F = dyn_cast<Function>(CV))
+ errs() << F->getName();
+ else
+ errs() << *CV;
+ errs() << '\n';
+ });
if (GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
- emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV));
+ emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
emitWordLE(0);
} else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
emitWordLE(Val);
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- if (CFP->getType() == Type::FloatTy)
+ if (CFP->getType()->isFloatTy())
emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
- else if (CFP->getType() == Type::DoubleTy)
+ else if (CFP->getType()->isDoubleTy())
emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
else {
- assert(0 && "Unable to handle this constantpool entry!");
- abort();
+ llvm_unreachable("Unable to handle this constantpool entry!");
}
} else {
- assert(0 && "Unable to handle this constantpool entry!");
- abort();
+ llvm_unreachable("Unable to handle this constantpool entry!");
}
}
}
@@ -467,7 +478,8 @@ template<class CodeEmitter>
void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
const MachineOperand &MO0 = MI.getOperand(0);
const MachineOperand &MO1 = MI.getOperand(1);
- assert(MO1.isImm() && "Not a valid so_imm value!");
+  assert(MO1.isImm() && ARM_AM::getSOImmVal(MO1.getImm()) != -1 &&
+ "Not a valid so_imm value!");
unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
@@ -483,7 +495,7 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
// Encode so_imm.
// Set bit I(25) to identify this is the immediate form of <shifter_op>
Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V1));
+ Binary |= getMachineSoImmOpValue(V1);
emitWordLE(Binary);
// Now the 'orr' instruction.
@@ -501,14 +513,14 @@ void Emitter<CodeEmitter>::emitMOVi2piecesInstruction(const MachineInstr &MI) {
// Encode so_imm.
// Set bit I(25) to identify this is the immediate form of <shifter_op>
Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(ARM_AM::getSOImmVal(V2));
+ Binary |= getMachineSoImmOpValue(V2);
emitWordLE(Binary);
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) {
// It's basically add r, pc, (LJTI - $+8)
-
+
const TargetInstrDesc &TID = MI.getDesc();
// Emit the 'add' instruction.
@@ -527,7 +539,6 @@ void Emitter<CodeEmitter>::emitLEApcrelJTInstruction(const MachineInstr &MI) {
Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
// Encode the displacement.
- // Set bit I(25) to identify this is the immediate form of <shifter_op>.
Binary |= 1 << ARMII::I_BitShift;
emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
@@ -576,8 +587,8 @@ void Emitter<CodeEmitter>::emitPseudoMoveInstruction(const MachineInstr &MI) {
template<class CodeEmitter>
void Emitter<CodeEmitter>::addPCLabel(unsigned LabelID) {
- DOUT << " ** LPC" << LabelID << " @ "
- << (void*)MCE.getCurrentPCValue() << '\n';
+ DEBUG(errs() << " ** LPC" << LabelID << " @ "
+ << (void*)MCE.getCurrentPCValue() << '\n');
JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
}
@@ -586,13 +597,13 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) {
unsigned Opcode = MI.getDesc().Opcode;
switch (Opcode) {
default:
- abort(); // FIXME:
+ llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
+ // FIXME: Add support for MOVimm32.
case TargetInstrInfo::INLINEASM: {
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
if (MI.getOperand(0).getSymbolName()[0]) {
- assert(0 && "JIT does not support inline asm!\n");
- abort();
+ llvm_report_error("JIT does not support inline asm!");
}
break;
}
@@ -601,7 +612,7 @@ void Emitter<CodeEmitter>::emitPseudoInstruction(const MachineInstr &MI) {
MCE.emitLabel(MI.getOperand(0).getImm());
break;
case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
+ case TargetInstrInfo::KILL:
case ARM::DWARF_LOC:
// Do nothing.
break;
@@ -674,7 +685,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
// ROR - 0111
// RRX - 0110 and bit[11:8] clear.
switch (SOpc) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::lsl: SBits = 0x1; break;
case ARM_AM::lsr: SBits = 0x3; break;
case ARM_AM::asr: SBits = 0x5; break;
@@ -688,7 +699,7 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
// ASR - 100
// ROR - 110
switch (SOpc) {
- default: assert(0 && "Unknown shift opc!");
+ default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::lsl: SBits = 0x0; break;
case ARM_AM::lsr: SBits = 0x2; break;
case ARM_AM::asr: SBits = 0x4; break;
@@ -713,12 +724,15 @@ unsigned Emitter<CodeEmitter>::getMachineSoRegOpValue(
template<class CodeEmitter>
unsigned Emitter<CodeEmitter>::getMachineSoImmOpValue(unsigned SoImm) {
+ int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+ assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
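+  // For example, SoImm = 0x3F0 is immed_8 = 0x3F rotated right by 28 bits,
+  // so it encodes with rotate_imm = 14.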
// Encode rotate_imm.
- unsigned Binary = (ARM_AM::getSOImmValRot(SoImm) >> 1)
+ unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
<< ARMII::SoRotImmShift;
// Encode immed_8.
- Binary |= ARM_AM::getSOImmValImm(SoImm);
+ Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
return Binary;
}
@@ -740,6 +754,10 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction(
unsigned ImplicitRn) {
const TargetInstrDesc &TID = MI.getDesc();
+ if (TID.Opcode == ARM::BFC) {
+ llvm_report_error("ARMv6t2 JIT is not yet supported.");
+ }
+
  // Part of binary is determined by TableGen.
unsigned Binary = getBinaryCodeForInstr(MI);
@@ -791,9 +809,7 @@ void Emitter<CodeEmitter>::emitDataProcessingInstruction(
}
// Encode so_imm.
- // Set bit I(25) to identify this is the immediate form of <shifter_op>.
- Binary |= 1 << ARMII::I_BitShift;
- Binary |= getMachineSoImmOpValue(MO.getImm());
+ Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
emitWordLE(Binary);
}
@@ -952,8 +968,8 @@ static unsigned getAddrModeUPBits(unsigned Mode) {
// DA - Decrement after - bit U = 0 and bit P = 0
// DB - Decrement before - bit U = 0 and bit P = 1
switch (Mode) {
- default: assert(0 && "Unknown addressing sub-mode!");
- case ARM_AM::da: break;
+ default: llvm_unreachable("Unknown addressing sub-mode!");
+ case ARM_AM::da: break;
case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
@@ -983,7 +999,7 @@ void Emitter<CodeEmitter>::emitLoadStoreMultipleInstruction(
Binary |= 0x1 << ARMII::W_BitShift;
// Set registers
- for (unsigned i = 4, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
break;
@@ -1107,7 +1123,7 @@ void Emitter<CodeEmitter>::emitMiscArithInstruction(const MachineInstr &MI) {
unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
Binary |= ShiftAmt << ARMII::ShiftShift;
-
+
emitWordLE(Binary);
}
@@ -1115,8 +1131,9 @@ template<class CodeEmitter>
void Emitter<CodeEmitter>::emitBranchInstruction(const MachineInstr &MI) {
const TargetInstrDesc &TID = MI.getDesc();
- if (TID.Opcode == ARM::TPsoft)
- abort(); // FIXME
+ if (TID.Opcode == ARM::TPsoft) {
+ llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
+ }
  // Part of binary is determined by TableGen.
unsigned Binary = getBinaryCodeForInstr(MI);
@@ -1135,7 +1152,8 @@ void Emitter<CodeEmitter>::emitInlineJumpTable(unsigned JTIndex) {
// Remember the base address of the inline jump table.
uintptr_t JTBase = MCE.getCurrentPCValue();
JTI->addJumpTableBaseAddr(JTIndex, JTBase);
- DOUT << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase << '\n';
+ DEBUG(errs() << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
+ << '\n');
// Now emit the jump table entries.
const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
@@ -1155,17 +1173,17 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) {
const TargetInstrDesc &TID = MI.getDesc();
// Handle jump tables.
- if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd ||
- TID.Opcode == ARM::t2BR_JTr || TID.Opcode == ARM::t2BR_JTadd) {
+ if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) {
// First emit a ldr pc, [] instruction.
emitDataProcessingInstruction(MI, ARM::PC);
// Then emit the inline jump table.
- unsigned JTIndex = (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::t2BR_JTr)
+ unsigned JTIndex =
+ (TID.Opcode == ARM::BR_JTr)
? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
emitInlineJumpTable(JTIndex);
return;
- } else if (TID.Opcode == ARM::BR_JTm || TID.Opcode == ARM::t2BR_JTm) {
+ } else if (TID.Opcode == ARM::BR_JTm) {
// First emit a ldr pc, [] instruction.
emitLoadStoreInstruction(MI, ARM::PC);
@@ -1183,7 +1201,7 @@ void Emitter<CodeEmitter>::emitMiscBranchInstruction(const MachineInstr &MI) {
if (TID.Opcode == ARM::BX_RET)
// The return register is LR.
Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR);
- else
+ else
// otherwise, set the return register
Binary |= getMachineOpValue(MI, 0);
@@ -1194,7 +1212,7 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegD = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegD = ARMRegisterInfo::getRegisterNumbering(RegD, isSPVFP);
+ RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP);
if (!isSPVFP)
Binary |= RegD << ARMII::RegRdShift;
else {
@@ -1208,7 +1226,7 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegN = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegN = ARMRegisterInfo::getRegisterNumbering(RegN, isSPVFP);
+ RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP);
if (!isSPVFP)
Binary |= RegN << ARMII::RegRnShift;
else {
@@ -1222,7 +1240,7 @@ static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
unsigned RegM = MI.getOperand(OpIdx).getReg();
unsigned Binary = 0;
bool isSPVFP = false;
- RegM = ARMRegisterInfo::getRegisterNumbering(RegM, isSPVFP);
+ RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP);
if (!isSPVFP)
Binary |= RegM;
else {
@@ -1268,7 +1286,7 @@ void Emitter<CodeEmitter>::emitVFPArithInstruction(const MachineInstr &MI) {
// Encode Dm / Sm.
Binary |= encodeVFPRm(MI, OpIdx);
-
+
emitWordLE(Binary);
}
@@ -1386,11 +1404,11 @@ void Emitter<CodeEmitter>::emitVFPLoadStoreMultipleInstruction(
Binary |= 0x1 << ARMII::W_BitShift;
// First register is encoded in Dd.
- Binary |= encodeVFPRd(MI, 4);
+ Binary |= encodeVFPRd(MI, 5);
// Number of registers are encoded in offset field.
unsigned NumRegs = 1;
- for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 6, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
break;
@@ -1413,4 +1431,3 @@ void Emitter<CodeEmitter>::emitMiscInstruction(const MachineInstr &MI) {
}
#include "ARMGenCodeEmitter.inc"
-
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 9fedaa4..309e3ba 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -15,24 +15,31 @@
#define DEBUG_TYPE "arm-cp-islands"
#include "ARM.h"
+#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMInstrInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
-STATISTIC(NumCPEs, "Number of constpool entries");
-STATISTIC(NumSplit, "Number of uncond branches inserted");
-STATISTIC(NumCBrFixed, "Number of cond branches fixed");
-STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+STATISTIC(NumTBs, "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
namespace {
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
@@ -63,6 +70,8 @@ namespace {
/// to a return, unreachable, or unconditional branch).
std::vector<MachineBasicBlock*> WaterList;
+ typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
/// CPUser - One user of a constant pool, keeping the machine instruction
/// pointer, the constant pool being referenced, and the max displacement
/// allowed from the instruction to the CP.
@@ -70,8 +79,11 @@ namespace {
MachineInstr *MI;
MachineInstr *CPEMI;
unsigned MaxDisp;
- CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
- : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
+ bool NegOk;
+ bool IsSoImm;
+ CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+ bool neg, bool soimm)
+ : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) {}
};
/// CPUsers - Keep track of all of the machine instructions that use various
@@ -117,29 +129,34 @@ namespace {
///
SmallVector<MachineInstr*, 4> PushPopMIs;
+ /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+ SmallVector<MachineInstr*, 4> T2JumpTables;
+
/// HasFarJump - True if any far jump instruction has been emitted during
/// the branch fix up pass.
bool HasFarJump;
const TargetInstrInfo *TII;
+ const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
bool isThumb;
+ bool isThumb1;
bool isThumb2;
public:
static char ID;
ARMConstantIslands() : MachineFunctionPass(&ID) {}
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ virtual bool runOnMachineFunction(MachineFunction &MF);
virtual const char *getPassName() const {
return "ARM constant island placement and branch shortening pass";
}
private:
- void DoInitialPlacement(MachineFunction &Fn,
+ void DoInitialPlacement(MachineFunction &MF,
std::vector<MachineInstr*> &CPEMIs);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- void InitialFunctionScan(MachineFunction &Fn,
+ void InitialFunctionScan(MachineFunction &MF,
const std::vector<MachineInstr*> &CPEMIs);
MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
@@ -147,58 +164,62 @@ namespace {
bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
bool LookForWater(CPUser&U, unsigned UserOffset,
- MachineBasicBlock** NewMBB);
- MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB,
- std::vector<MachineBasicBlock*>::iterator IP);
+ MachineBasicBlock *&NewMBB);
+ MachineBasicBlock *AcceptWater(water_iterator IP);
void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
- MachineBasicBlock** NewMBB);
- bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex);
+ MachineBasicBlock *&NewMBB);
+ bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex);
void RemoveDeadCPEMI(MachineInstr *CPEMI);
bool RemoveUnusedCPEntries();
bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
- MachineInstr *CPEMI, unsigned Disp,
- bool DoDump);
+ MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+ bool DoDump = false);
bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
CPUser &U);
bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
- unsigned Disp, bool NegativeOK);
+ unsigned Disp, bool NegativeOK, bool IsSoImm = false);
bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
- bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br);
- bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br);
- bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br);
+ bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br);
+ bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br);
+ bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br);
bool UndoLRSpillRestore();
+ bool OptimizeThumb2Instructions(MachineFunction &MF);
+ bool OptimizeThumb2Branches(MachineFunction &MF);
+ bool OptimizeThumb2JumpTables(MachineFunction &MF);
unsigned GetOffsetOf(MachineInstr *MI) const;
void dumpBBs();
- void verify(MachineFunction &Fn);
+ void verify(MachineFunction &MF);
};
char ARMConstantIslands::ID = 0;
}
/// verify - check BBOffsets, BBSizes, alignment of islands
-void ARMConstantIslands::verify(MachineFunction &Fn) {
+void ARMConstantIslands::verify(MachineFunction &MF) {
assert(BBOffsets.size() == BBSizes.size());
for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
- if (isThumb) {
- for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = MBBI;
- if (!MBB->empty() &&
- MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY)
- assert((BBOffsets[MBB->getNumber()]%4 == 0 &&
- BBSizes[MBB->getNumber()]%4 == 0) ||
- (BBOffsets[MBB->getNumber()]%4 != 0 &&
- BBSizes[MBB->getNumber()]%4 != 0));
+ if (!isThumb)
+ return;
+#ifndef NDEBUG
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty() &&
+ MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+ unsigned MBBId = MBB->getNumber();
+ assert((BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) ||
+ (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0));
}
}
+#endif
}
/// print block size and offset information - debugging
void ARMConstantIslands::dumpBBs() {
for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
- DOUT << "block " << J << " offset " << BBOffsets[J] <<
- " size " << BBSizes[J] << "\n";
+ DEBUG(errs() << "block " << J << " offset " << BBOffsets[J]
+ << " size " << BBSizes[J] << "\n");
}
}
@@ -208,31 +229,36 @@ FunctionPass *llvm::createARMConstantIslandPass() {
return new ARMConstantIslands();
}
-bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
- MachineConstantPool &MCP = *Fn.getConstantPool();
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
+ MachineConstantPool &MCP = *MF.getConstantPool();
+
+ TII = MF.getTarget().getInstrInfo();
+ AFI = MF.getInfo<ARMFunctionInfo>();
+ STI = &MF.getTarget().getSubtarget<ARMSubtarget>();
- TII = Fn.getTarget().getInstrInfo();
- AFI = Fn.getInfo<ARMFunctionInfo>();
isThumb = AFI->isThumbFunction();
+ isThumb1 = AFI->isThumb1OnlyFunction();
isThumb2 = AFI->isThumb2Function();
HasFarJump = false;
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
- Fn.RenumberBlocks();
+ MF.RenumberBlocks();
+
+ // Thumb1 functions containing constant pools get 4-byte alignment.
+ // This is so we can keep exact track of where the alignment padding goes.
- /// Thumb functions containing constant pools get 2-byte alignment.
- /// This is so we can keep exact track of where the alignment padding goes.
- /// Set default.
- AFI->setAlign(isThumb ? 1U : 2U);
+  // Set default. Thumb1 functions are 2-byte aligned; ARM and Thumb2
+  // functions are 4-byte aligned.
+ AFI->setAlign(isThumb1 ? 1U : 2U);
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
std::vector<MachineInstr*> CPEMIs;
if (!MCP.isEmpty()) {
- DoInitialPlacement(Fn, CPEMIs);
- if (isThumb)
+ DoInitialPlacement(MF, CPEMIs);
+ if (isThumb1)
AFI->setAlign(2U);
}
@@ -242,7 +268,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
// Do the initial scan of the function, building up information about the
// sizes of each block, the location of all the water, and finding all of the
// constant pool users.
- InitialFunctionScan(Fn, CPEMIs);
+ InitialFunctionScan(MF, CPEMIs);
CPEMIs.clear();
/// Remove dead constant pool entries.
@@ -251,25 +277,37 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
// Iteratively place constant pool entries and fix up branches until there
// is no change.
bool MadeChange = false;
+ unsigned NoCPIters = 0, NoBRIters = 0;
while (true) {
- bool Change = false;
+ bool CPChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
- Change |= HandleConstantPoolUser(Fn, i);
+ CPChange |= HandleConstantPoolUser(MF, i);
+ if (CPChange && ++NoCPIters > 30)
+ llvm_unreachable("Constant Island pass failed to converge!");
DEBUG(dumpBBs());
+
+ bool BRChange = false;
for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
- Change |= FixUpImmediateBr(Fn, ImmBranches[i]);
+ BRChange |= FixUpImmediateBr(MF, ImmBranches[i]);
+ if (BRChange && ++NoBRIters > 30)
+ llvm_unreachable("Branch Fix Up pass failed to converge!");
DEBUG(dumpBBs());
- if (!Change)
+
+ if (!CPChange && !BRChange)
break;
MadeChange = true;
}
+ // Shrink 32-bit Thumb2 branch, load, and store instructions.
+ if (isThumb2)
+ MadeChange |= OptimizeThumb2Instructions(MF);
+
// After a while, this might be made debug-only, but it is not expensive.
- verify(Fn);
+ verify(MF);
  // If LR has been force-spilled and no far jump (i.e. BL) has been issued,
  // undo the spill / restore of LR if possible.
- if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb)
+ if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
MadeChange |= UndoLRSpillRestore();
BBSizes.clear();
@@ -279,24 +317,25 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
CPEntries.clear();
ImmBranches.clear();
PushPopMIs.clear();
+ T2JumpTables.clear();
return MadeChange;
}
/// DoInitialPlacement - Perform the initial placement of the constant pool
/// entries. To start with, we put them all at the end of the function.
-void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
+void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF,
std::vector<MachineInstr*> &CPEMIs) {
// Create the basic block to hold the CPE's.
- MachineBasicBlock *BB = Fn.CreateMachineBasicBlock();
- Fn.push_back(BB);
+ MachineBasicBlock *BB = MF.CreateMachineBasicBlock();
+ MF.push_back(BB);
// Add all of the constants from the constant pool to the end block, use an
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs =
- Fn.getConstantPool()->getConstants();
+ MF.getConstantPool()->getConstants();
- const TargetData &TD = *Fn.getTarget().getTargetData();
+ const TargetData &TD = *MF.getTarget().getTargetData();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
// Verify that all constant pool entries are a multiple of 4 bytes. If not,
@@ -313,7 +352,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
CPEs.push_back(CPEntry(CPEMI, i));
CPEntries.push_back(CPEs);
NumCPEs++;
- DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n";
+ DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i
+ << "\n");
}
}
@@ -352,10 +392,10 @@ ARMConstantIslands::CPEntry
/// InitialFunctionScan - Do the initial scan of the function, building up
/// information about the sizes of each block, the location of all the water,
/// and finding all of the constant pool users.
-void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
+void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
const std::vector<MachineInstr*> &CPEMIs) {
unsigned Offset = 0;
- for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
MBBI != E; ++MBBI) {
MachineBasicBlock &MBB = *MBBI;
@@ -377,18 +417,19 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
unsigned Scale = 1;
int UOpc = Opc;
switch (Opc) {
+ default:
+ continue; // Ignore other JT branches
case ARM::tBR_JTr:
- case ARM::t2BR_JTr:
- case ARM::t2BR_JTm:
- case ARM::t2BR_JTadd:
- // A Thumb table jump may involve padding; for the offsets to
+ // A Thumb1 table jump may involve padding; for the offsets to
// be right, functions containing these must be 4-byte aligned.
AFI->setAlign(2U);
if ((Offset+MBBSize)%4 != 0)
+ // FIXME: Add a pseudo ALIGN instruction instead.
MBBSize += 2; // padding
continue; // Does not get an entry in ImmBranches
- default:
- continue; // Ignore other JT branches
+ case ARM::t2BR_JT:
+ T2JumpTables.push_back(I);
+ continue; // Does not get an entry in ImmBranches
case ARM::Bcc:
isCond = true;
UOpc = ARM::B;
@@ -427,6 +468,9 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
PushPopMIs.push_back(I);
+ if (Opc == ARM::CONSTPOOL_ENTRY)
+ continue;
+
// Scan the instructions for constant pool operands.
for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
if (I->getOperand(op).isCPI()) {
@@ -436,50 +480,52 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
// Basic size info comes from the TSFlags field.
unsigned Bits = 0;
unsigned Scale = 1;
- unsigned TSFlags = I->getDesc().TSFlags;
- switch (TSFlags & ARMII::AddrModeMask) {
+ bool NegOk = false;
+ bool IsSoImm = false;
+
+ switch (Opc) {
default:
- // Constant pool entries can reach anything.
- if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
- continue;
- if (I->getOpcode() == ARM::tLEApcrel) {
- Bits = 8; // Taking the address of a CP entry.
- break;
- }
- assert(0 && "Unknown addressing mode for CP reference!");
- case ARMII::AddrMode1: // AM1: 8 bits << 2
- Bits = 8;
- Scale = 4; // Taking the address of a CP entry.
- break;
- case ARMII::AddrMode2:
- Bits = 12; // +-offset_12
- break;
- case ARMII::AddrMode3:
- Bits = 8; // +-offset_8
+ llvm_unreachable("Unknown addressing mode for CP reference!");
break;
- // addrmode4 has no immediate offset.
- case ARMII::AddrMode5:
+
+ // Taking the address of a CP entry.
+ case ARM::LEApcrel:
+      // This takes a SoImm, which is an 8-bit immediate rotated. We'll
+      // pretend the maximum offset is 255 * 4. Since each instruction is
+      // 4 bytes wide, this is always correct. We'll check for other
+      // displacements that fit in a SoImm as well.
Bits = 8;
- Scale = 4; // +-(offset_8*4)
+ Scale = 4;
+ NegOk = true;
+ IsSoImm = true;
break;
- // addrmode6 has no immediate offset.
- case ARMII::AddrModeT1_1:
- Bits = 5; // +offset_5
+ case ARM::t2LEApcrel:
+ Bits = 12;
+ NegOk = true;
break;
- case ARMII::AddrModeT1_2:
- Bits = 5;
- Scale = 2; // +(offset_5*2)
+ case ARM::tLEApcrel:
+ Bits = 8;
+ Scale = 4;
break;
- case ARMII::AddrModeT1_4:
- Bits = 5;
- Scale = 4; // +(offset_5*4)
+
+ case ARM::LDR:
+ case ARM::LDRcp:
+ case ARM::t2LDRpci:
+ Bits = 12; // +-offset_12
+ NegOk = true;
break;
- case ARMII::AddrModeT1_s:
+
+ case ARM::tLDRpci:
+ case ARM::tLDRcp:
Bits = 8;
Scale = 4; // +(offset_8*4)
break;
- case ARMII::AddrModeT2_pc:
- Bits = 12; // +-offset_12
+
+ case ARM::FLDD:
+ case ARM::FLDS:
+ Bits = 8;
+ Scale = 4; // +-(offset_8*4)
+ NegOk = true;
break;
}
@@ -487,7 +533,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
unsigned CPI = I->getOperand(op).getIndex();
MachineInstr *CPEMI = CPEMIs[CPI];
unsigned MaxOffs = ((1 << Bits)-1) * Scale;
- CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
+ CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
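+        // e.g. for tLDRpci above, Bits = 8 and Scale = 4, so
+        // MaxOffs = 255*4 = 1020.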
// Increment corresponding CPEntry reference count.
CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
@@ -563,7 +609,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
// Next, update WaterList. Specifically, we need to add NewMBB as having
// available water after it.
- std::vector<MachineBasicBlock*>::iterator IP =
+ water_iterator IP =
std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
CompareMBBNumbers);
WaterList.insert(IP, NewBB);
@@ -590,8 +636,8 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// Note the new unconditional branch is not being recorded.
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond to anything in the source.
- BuildMI(OrigBB, DebugLoc::getUnknownLoc(),
- TII->get(isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B)).addMBB(NewBB);
+ unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+ BuildMI(OrigBB, DebugLoc::getUnknownLoc(), TII->get(Opc)).addMBB(NewBB);
NumSplit++;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
@@ -625,7 +671,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// available water after it (but not if it's already there, which happens
// when splitting before a conditional branch that is followed by an
// unconditional branch - in that case we want to insert NewBB).
- std::vector<MachineBasicBlock*>::iterator IP =
+ water_iterator IP =
std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
@@ -648,7 +694,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// We removed instructions from UserMBB, subtract that off from its size.
// Add 2 or 4 to the block to count the unconditional branch we added to it.
- unsigned delta = isThumb ? 2 : 4;
+ int delta = isThumb1 ? 2 : 4;
BBSizes[OrigBBI] -= NewBBSize - delta;
// ...and adjust BBOffsets for NewBB accordingly.
@@ -664,24 +710,39 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
/// reference) is within MaxDisp of TrialOffset (a proposed location of a
/// constant pool entry).
bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
- unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) {
+ unsigned TrialOffset, unsigned MaxDisp,
+ bool NegativeOK, bool IsSoImm) {
// On Thumb offsets==2 mod 4 are rounded down by the hardware for
// purposes of the displacement computation; compensate for that here.
// Effectively, the valid range of displacements is 2 bytes smaller for such
// references.
- if (isThumb && UserOffset%4 !=0)
+ unsigned TotalAdj = 0;
+ if (isThumb && UserOffset%4 !=0) {
UserOffset -= 2;
+ TotalAdj = 2;
+ }
// CPEs will be rounded up to a multiple of 4.
- if (isThumb && TrialOffset%4 != 0)
+ if (isThumb && TrialOffset%4 != 0) {
TrialOffset += 2;
+ TotalAdj += 2;
+ }
+
+  // In Thumb2 mode, later branch adjustments can shift instructions up and
+  // change the alignment. In the worst case this can decrease the user's
+  // effective address by 2 and increase the CPE's address by 2.
+ if (isThumb2 && TotalAdj != 4)
+ MaxDisp -= (4 - TotalAdj);
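+  // For example, if only one of the two offsets was rounded above
+  // (TotalAdj == 2), two bytes of extra slack are reserved: MaxDisp -= 2.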
if (UserOffset <= TrialOffset) {
// User before the Trial.
- if (TrialOffset-UserOffset <= MaxDisp)
+ if (TrialOffset - UserOffset <= MaxDisp)
return true;
+    // FIXME: Make use of the full range of soimm values.
} else if (NegativeOK) {
- if (UserOffset-TrialOffset <= MaxDisp)
+ if (UserOffset - TrialOffset <= MaxDisp)
return true;
+    // FIXME: Make use of the full range of soimm values.
}
return false;
}
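The range test above folds the Thumb rounding rules into both endpoints before comparing displacements. A minimal standalone sketch of the same logic, assuming the isThumb/isThumb2 flags that the pass keeps as members (a hypothetical free function, not the pass's actual method):

    // Sketch: Thumb-adjusted displacement check.
    static bool offsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
                                unsigned MaxDisp, bool NegativeOK,
                                bool isThumb, bool isThumb2) {
      unsigned TotalAdj = 0;
      if (isThumb && UserOffset % 4 != 0) { // hardware rounds the user PC down
        UserOffset -= 2;
        TotalAdj = 2;
      }
      if (isThumb && TrialOffset % 4 != 0) { // CPEs are rounded up to 4 bytes
        TrialOffset += 2;
        TotalAdj += 2;
      }
      if (isThumb2 && TotalAdj != 4) // worst-case Thumb2 realignment slack
        MaxDisp -= (4 - TotalAdj);
      if (UserOffset <= TrialOffset) // user before the trial location
        return TrialOffset - UserOffset <= MaxDisp;
      return NegativeOK && UserOffset - TrialOffset <= MaxDisp;
    }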
@@ -690,39 +751,36 @@ bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
/// Water (a basic block) will be in range for the specific MI.
bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
- MachineBasicBlock* Water, CPUser &U)
-{
+ MachineBasicBlock* Water, CPUser &U) {
unsigned MaxDisp = U.MaxDisp;
- MachineFunction::iterator I = next(MachineFunction::iterator(Water));
unsigned CPEOffset = BBOffsets[Water->getNumber()] +
BBSizes[Water->getNumber()];
// If the CPE is to be inserted before the instruction, that will raise
- // the offset of the instruction. (Currently applies only to ARM, so
- // no alignment compensation attempted here.)
+ // the offset of the instruction.
if (CPEOffset < UserOffset)
UserOffset += U.CPEMI->getOperand(2).getImm();
- return OffsetIsInRange (UserOffset, CPEOffset, MaxDisp, !isThumb);
+ return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm);
}
/// CPEIsInRange - Returns true if the distance between specific MI and
/// specific ConstPool entry instruction can fit in MI's displacement field.
bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
- MachineInstr *CPEMI,
- unsigned MaxDisp, bool DoDump) {
+ MachineInstr *CPEMI, unsigned MaxDisp,
+ bool NegOk, bool DoDump) {
unsigned CPEOffset = GetOffsetOf(CPEMI);
assert(CPEOffset%4 == 0 && "Misaligned CPE");
if (DoDump) {
- DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm()
- << " max delta=" << MaxDisp
- << " insn address=" << UserOffset
- << " CPE address=" << CPEOffset
- << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI;
+ DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << " insn address=" << UserOffset
+ << " CPE address=" << CPEOffset
+ << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI);
}
- return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+ return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
}
#ifndef NDEBUG
@@ -745,52 +803,48 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
int delta) {
MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
- for(unsigned i=BB->getNumber()+1; i<BB->getParent()->getNumBlockIDs(); i++) {
+ for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs();
+ i < e; ++i) {
BBOffsets[i] += delta;
// If some existing blocks have padding, adjust it as needed; this is a
// bit tricky because delta can be negative, so don't use % on it.
- if (isThumb) {
- MachineBasicBlock *MBB = MBBI;
- if (!MBB->empty()) {
- // Constant pool entries require padding.
- if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
- unsigned oldOffset = BBOffsets[i] - delta;
- if (oldOffset%4==0 && BBOffsets[i]%4!=0) {
- // add new padding
- BBSizes[i] += 2;
- delta += 2;
- } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) {
- // remove existing padding
- BBSizes[i] -=2;
- delta -= 2;
- }
+ if (!isThumb)
+ continue;
+ MachineBasicBlock *MBB = MBBI;
+ if (!MBB->empty()) {
+ // Constant pool entries require padding.
+ if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+ unsigned OldOffset = BBOffsets[i] - delta;
+ if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
+ } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) {
+ // remove existing padding
+ BBSizes[i] -= 2;
+ delta -= 2;
}
- // Thumb jump tables require padding. They should be at the end;
- // following unconditional branches are removed by AnalyzeBranch.
- MachineInstr *ThumbJTMI = NULL;
- if ((prior(MBB->end())->getOpcode() == ARM::tBR_JTr)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTr)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTm)
- || (prior(MBB->end())->getOpcode() == ARM::t2BR_JTadd))
- ThumbJTMI = prior(MBB->end());
- if (ThumbJTMI) {
- unsigned newMIOffset = GetOffsetOf(ThumbJTMI);
- unsigned oldMIOffset = newMIOffset - delta;
- if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) {
- // remove existing padding
- BBSizes[i] -= 2;
- delta -= 2;
- } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) {
- // add new padding
- BBSizes[i] += 2;
- delta += 2;
- }
+ }
+ // Thumb1 jump tables require padding. They should be at the end;
+ // following unconditional branches are removed by AnalyzeBranch.
+ MachineInstr *ThumbJTMI = prior(MBB->end());
+ if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) {
+ unsigned NewMIOffset = GetOffsetOf(ThumbJTMI);
+ unsigned OldMIOffset = NewMIOffset - delta;
+ if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) {
+ // remove existing padding
+ BBSizes[i] -= 2;
+ delta -= 2;
+ } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) {
+ // add new padding
+ BBSizes[i] += 2;
+ delta += 2;
}
- if (delta==0)
- return;
}
- MBBI = next(MBBI);
+ if (delta==0)
+ return;
}
+ MBBI = next(MBBI);
}
}
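The padding bookkeeping in this loop reduces to one rule: a delta that moves a 4-byte-aligned entity to a misaligned offset costs two bytes of padding, and the reverse move gives them back. A small sketch of that rule in isolation (hypothetical helper, not part of the pass):

    // Returns the padding change (+2, -2, or 0) implied by shifting an
    // alignment-sensitive offset by Delta; fold the result back into Delta.
    static int paddingAdjustment(unsigned NewOffset, int Delta) {
      unsigned OldOffset = NewOffset - Delta;
      if (OldOffset % 4 == 0 && NewOffset % 4 != 0)
        return +2; // alignment lost: insert padding
      if (OldOffset % 4 != 0 && NewOffset % 4 == 0)
        return -2; // alignment regained: drop padding
      return 0;
    }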
@@ -824,8 +878,8 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
MachineInstr *CPEMI = U.CPEMI;
// Check to see if the CPE is already in-range.
- if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) {
- DOUT << "In range\n";
+ if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) {
+ DEBUG(errs() << "In range\n");
return 1;
}
@@ -839,8 +893,9 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
// Removing CPEs can leave empty entries, skip
if (CPEs[i].CPEMI == NULL)
continue;
- if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) {
- DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n";
+ if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) {
+ DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
// Point the CPUser node to the replacement
U.CPEMI = CPEs[i].CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
@@ -870,15 +925,15 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
default:
break;
}
-
+
return ((1<<23)-1)*4;
}
/// AcceptWater - Small amount of common code factored out of the following.
-
-MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
- std::vector<MachineBasicBlock*>::iterator IP) {
- DOUT << "found water in range\n";
+///
+MachineBasicBlock *ARMConstantIslands::AcceptWater(water_iterator IP) {
+ DEBUG(errs() << "found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
// Remove the original WaterList entry; we want subsequent
// insertions in this vicinity to go after the one we're
// about to insert. This considerably reduces the number
@@ -890,41 +945,44 @@ MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
/// LookForWater - look for an existing entry in the WaterList in which
/// we can place the CPE referenced from U so it's within range of U's MI.
-/// Returns true if found, false if not. If it returns true, *NewMBB
-/// is set to the WaterList entry.
-/// For ARM, we prefer the water that's farthest away. For Thumb, prefer
-/// water that will not introduce padding to water that will; within each
-/// group, prefer the water that's farthest away.
-
+/// Returns true if found, false if not. If it returns true, NewMBB
+/// is set to the WaterList entry. For Thumb, prefer water that will not
+/// introduce padding to water that will. To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
- MachineBasicBlock** NewMBB) {
- std::vector<MachineBasicBlock*>::iterator IPThatWouldPad;
- MachineBasicBlock* WaterBBThatWouldPad = NULL;
- if (!WaterList.empty()) {
- for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()),
- B = WaterList.begin();; --IP) {
- MachineBasicBlock* WaterBB = *IP;
- if (WaterIsInRange(UserOffset, WaterBB, U)) {
- if (isThumb &&
- (BBOffsets[WaterBB->getNumber()] +
- BBSizes[WaterBB->getNumber()])%4 != 0) {
- // This is valid Water, but would introduce padding. Remember
- // it in case we don't find any Water that doesn't do this.
- if (!WaterBBThatWouldPad) {
- WaterBBThatWouldPad = WaterBB;
- IPThatWouldPad = IP;
- }
- } else {
- *NewMBB = AcceptWater(WaterBB, IP);
- return true;
+ MachineBasicBlock *&NewMBB) {
+ if (WaterList.empty())
+ return false;
+
+ bool FoundWaterThatWouldPad = false;
+ water_iterator IPThatWouldPad;
+ for (water_iterator IP = prior(WaterList.end()),
+ B = WaterList.begin();; --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ // Check if water is in range and at a lower address than the current one.
+ if (WaterIsInRange(UserOffset, WaterBB, U) &&
+ WaterBB->getNumber() < U.CPEMI->getParent()->getNumber()) {
+ unsigned WBBId = WaterBB->getNumber();
+ if (isThumb &&
+ (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) {
+ // This is valid Water, but would introduce padding. Remember
+ // it in case we don't find any Water that doesn't do this.
+ if (!FoundWaterThatWouldPad) {
+ FoundWaterThatWouldPad = true;
+ IPThatWouldPad = IP;
}
+ } else {
+ NewMBB = AcceptWater(IP);
+ return true;
+ }
}
- if (IP == B)
- break;
- }
+ if (IP == B)
+ break;
}
- if (isThumb && WaterBBThatWouldPad) {
- *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad);
+ if (FoundWaterThatWouldPad) {
+ NewMBB = AcceptWater(IPThatWouldPad);
return true;
}
return false;
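The scan above walks the water list from highest to lowest address, takes the first in-range block that introduces no padding, and otherwise falls back to the first padding one it saw. A generic sketch of that search pattern, with hypothetical inRange/wouldPad predicates standing in for WaterIsInRange and the mod-4 alignment test:

    #include <vector>

    template <typename T, typename InRange, typename WouldPad>
    static bool pickWater(const std::vector<T> &List, InRange inRange,
                          WouldPad wouldPad, T &Result) {
      bool HavePadFallback = false;
      T PadFallback = T();
      for (auto It = List.rbegin(), E = List.rend(); It != E; ++It) {
        if (!inRange(*It))
          continue;
        if (wouldPad(*It)) {
          if (!HavePadFallback) { HavePadFallback = true; PadFallback = *It; }
        } else {
          Result = *It; // first in-range, non-padding water wins
          return true;
        }
      }
      if (HavePadFallback) { Result = PadFallback; return true; }
      return false;
    }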
@@ -934,12 +992,12 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
/// block is used if in range, and the conditional branch munged so control
/// flow is correct. Otherwise the block is split to create a hole with an
-/// unconditional branch around it. In either case *NewMBB is set to a
+/// unconditional branch around it. In either case NewMBB is set to a
/// block following which the new island can be inserted (the WaterList
/// is not adjusted).
-
void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
- unsigned UserOffset, MachineBasicBlock** NewMBB) {
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
@@ -950,18 +1008,18 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// If the use is at the end of the block, or the end of the block
// is within range, make new water there. (The addition below is
- // for the unconditional branch we will be adding: 4 bytes on ARM,
- // 2 on Thumb. Possible Thumb alignment padding is allowed for
+ // for the unconditional branch we will be adding: 4 bytes on ARM and Thumb2,
+ // 2 on Thumb1. Possible Thumb1 alignment padding is accounted for
// inside OffsetIsInRange.
// If the block ends in an unconditional branch already, it is water,
// and is known to be out of range, so we'll always be adding a branch.)
if (&UserMBB->back() == UserMI ||
- OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 2: 4),
- U.MaxDisp, !isThumb)) {
- DOUT << "Split at end of block\n";
+ OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4),
+ U.MaxDisp, U.NegOk, U.IsSoImm)) {
+ DEBUG(errs() << "Split at end of block\n");
if (&UserMBB->back() == UserMI)
assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
- *NewMBB = next(MachineFunction::iterator(UserMBB));
+ NewMBB = next(MachineFunction::iterator(UserMBB));
// Add an unconditional branch from UserMBB to fallthrough block.
// Record it for branch lengthening; this new branch will not get out of
// range, but if the preceding conditional branch is out of range, the
@@ -969,16 +1027,16 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// range, so the machinery has to know about it.
int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
BuildMI(UserMBB, DebugLoc::getUnknownLoc(),
- TII->get(UncondBr)).addMBB(*NewMBB);
+ TII->get(UncondBr)).addMBB(NewMBB);
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
- int delta = isThumb ? 2 : 4;
+ int delta = isThumb1 ? 2 : 4;
BBSizes[UserMBB->getNumber()] += delta;
AdjustBBOffsetsAfter(UserMBB, delta);
} else {
// What a big block. Find a place within the block to split it.
- // This is a little tricky on Thumb since instructions are 2 bytes
+ // This is a little tricky on Thumb1 since instructions are 2 bytes
// and constant pool entries are 4 bytes: if instruction I references
// island CPE, and instruction I+1 references CPE', it will
// not work well to put CPE as far forward as possible, since then
@@ -991,7 +1049,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// if not, we back up the insertion point.
// The 4 in the following is for the unconditional branch we'll be
- // inserting (allows for long branch on Thumb). Alignment of the
+ // inserting (allows for long branch on Thumb1). Alignment of the
// island is handled inside OffsetIsInRange.
unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4;
// This could point off the end of the block if we've already got
@@ -1000,7 +1058,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// conditional and a maximally long unconditional).
if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1])
BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] -
- (isThumb ? 6 : 8);
+ (isThumb1 ? 6 : 8);
unsigned EndInsertOffset = BaseInsertOffset +
CPEMI->getOperand(2).getImm();
MachineBasicBlock::iterator MI = UserMI;
@@ -1011,10 +1069,11 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
Offset += TII->GetInstSizeInBytes(MI),
MI = next(MI)) {
if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
+ CPUser &U = CPUsers[CPUIndex];
if (!OffsetIsInRange(Offset, EndInsertOffset,
- CPUsers[CPUIndex].MaxDisp, !isThumb)) {
- BaseInsertOffset -= (isThumb ? 2 : 4);
- EndInsertOffset -= (isThumb ? 2 : 4);
+ U.MaxDisp, U.NegOk, U.IsSoImm)) {
+ BaseInsertOffset -= (isThumb1 ? 2 : 4);
+ EndInsertOffset -= (isThumb1 ? 2 : 4);
}
// This is overly conservative, as we don't account for CPEMIs
// being reused within the block, but it doesn't matter much.
@@ -1022,8 +1081,8 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
CPUIndex++;
}
}
- DOUT << "Split in middle of big block\n";
- *NewMBB = SplitBlockBeforeInstr(prior(MI));
+ DEBUG(errs() << "Split in middle of big block\n");
+ NewMBB = SplitBlockBeforeInstr(prior(MI));
}
}
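For the big-block case, the split offset starts as far forward as the user's range allows and is then clamped to stay inside the block. Stripped of the CPUser back-up scan, the arithmetic is roughly the following (constants as in the patch; a sketch, not the pass itself):

    // 4 bytes for the unconditional branch we insert; 6 (Thumb1) or 8 bytes
    // of tail slack so a conditional plus a long unconditional branch fit.
    static unsigned chooseSplitOffset(unsigned UserOffset, unsigned MaxDisp,
                                      unsigned NextBlockOffset, bool isThumb1) {
      unsigned BaseInsertOffset = UserOffset + MaxDisp - 4;
      if (BaseInsertOffset >= NextBlockOffset)
        BaseInsertOffset = NextBlockOffset - (isThumb1 ? 6 : 8);
      return BaseInsertOffset;
    }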
@@ -1031,7 +1090,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
/// is out-of-range. If so, pick up the constant pool value and move it some
/// place in-range. Return true if we changed any addresses (thus must run
/// another pass of branch lengthening), false otherwise.
-bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
+bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
unsigned CPUserIndex) {
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
@@ -1040,14 +1099,9 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
unsigned Size = CPEMI->getOperand(2).getImm();
MachineBasicBlock *NewMBB;
// Compute this only once, it's expensive. The 4 or 8 is the value the
- // hardware keeps in the PC (2 insns ahead of the reference).
+ // hardware keeps in the PC.
unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
- // Special case: tLEApcrel are two instructions MI's. The actual user is the
- // second instruction.
- if (UserMI->getOpcode() == ARM::tLEApcrel)
- UserOffset += 2;
-
// See if the current entry is within range, or there is a clone of it
// in range.
int result = LookForExistingCPEntry(U, UserOffset);
@@ -1058,19 +1112,16 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
// We will be generating a new clone. Get a UID for it.
unsigned ID = AFI->createConstPoolEntryUId();
- // Look for water where we can place this CPE. We look for the farthest one
- // away that will work. Forward references only for now (although later
- // we might find some that are backwards).
-
- if (!LookForWater(U, UserOffset, &NewMBB)) {
+ // Look for water where we can place this CPE.
+ if (!LookForWater(U, UserOffset, NewMBB)) {
// No water found.
- DOUT << "No water found\n";
- CreateNewWater(CPUserIndex, UserOffset, &NewMBB);
+ DEBUG(errs() << "No water found\n");
+ CreateNewWater(CPUserIndex, UserOffset, NewMBB);
}
// Okay, we know we can put an island before NewMBB now, do it!
- MachineBasicBlock *NewIsland = Fn.CreateMachineBasicBlock();
- Fn.insert(NewMBB, NewIsland);
+ MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
+ MF.insert(NewMBB, NewIsland);
// Update internal data structures to account for the newly inserted MBB.
UpdateForInsertedWaterBlock(NewIsland);
@@ -1101,7 +1152,8 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn,
break;
}
- DOUT << " Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI;
+ DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << '\t' << *UserMI);
return true;
}
@@ -1115,7 +1167,7 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) {
BBSizes[CPEBB->getNumber()] -= Size;
// All succeeding offsets have the current size value added in, fix this.
if (CPEBB->empty()) {
- // In thumb mode, the size of island may be padded by two to compensate for
+ // In Thumb1 mode, the size of the island may be padded by two to compensate for
// the alignment requirement. The size will then be 2 when the block is
// empty, so fix this.
// All succeeding offsets have the current size value added in, fix this.
@@ -1157,11 +1209,11 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
unsigned BrOffset = GetOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBOffsets[DestBB->getNumber()];
- DOUT << "Branch of destination BB#" << DestBB->getNumber()
- << " from BB#" << MI->getParent()->getNumber()
- << " max delta=" << MaxDisp
- << " from " << GetOffsetOf(MI) << " to " << DestOffset
- << " offset " << int(DestOffset-BrOffset) << "\t" << *MI;
+ DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxDisp
+ << " from " << GetOffsetOf(MI) << " to " << DestOffset
+ << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
if (BrOffset <= DestOffset) {
// Branch before the Dest.
@@ -1176,7 +1228,7 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
/// away to fit in its displacement field.
-bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
@@ -1185,8 +1237,8 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
return false;
if (!Br.isCond)
- return FixUpUnconditionalBr(Fn, Br);
- return FixUpConditionalBr(Fn, Br);
+ return FixUpUnconditionalBr(MF, Br);
+ return FixUpConditionalBr(MF, Br);
}
/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
@@ -1194,10 +1246,11 @@ bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
/// spilled in the epilogue, then we can use BL to implement a far jump.
/// Otherwise, add an intermediate branch instruction to a branch.
bool
-ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *MBB = MI->getParent();
- assert(isThumb && !isThumb2 && "Expected a Thumb-1 function!");
+ if (!isThumb1)
+ llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!");
// Use BL to implement far jump.
Br.MaxDisp = (1 << 21) * 2;
@@ -1207,7 +1260,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
HasFarJump = true;
NumUBrFixed++;
- DOUT << " Changed B to long jump " << *MI;
+ DEBUG(errs() << " Changed B to long jump " << *MI);
return true;
}
@@ -1216,7 +1269,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
/// far away to fit in its displacement field. It is converted to an inverse
/// conditional branch + an unconditional branch to the destination.
bool
-ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
@@ -1251,7 +1304,8 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
// b L1
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
if (BBIsInRange(MI, NewDest, Br.MaxDisp)) {
- DOUT << " Invert Bcc condition and swap its destination with " << *BMI;
+ DEBUG(errs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
BMI->getOperand(0).setMBB(DestBB);
MI->getOperand(0).setMBB(NewDest);
MI->getOperand(1).setImm(CC);
@@ -1273,9 +1327,9 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
}
MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
- DOUT << " Insert B to BB#" << DestBB->getNumber()
- << " also invert condition and change dest. to BB#"
- << NextBB->getNumber() << "\n";
+ DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
@@ -1300,14 +1354,17 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
}
/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spills
-/// LR / restores LR to pc.
+/// LR / restores LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
bool ARMConstantIslands::UndoLRSpillRestore() {
bool MadeChange = false;
for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
MachineInstr *MI = PushPopMIs[i];
+ // The first two operands are predicates; the third is zero since there
+ // is no writeback.
if (MI->getOpcode() == ARM::tPOP_RET &&
- MI->getOperand(0).getReg() == ARM::PC &&
- MI->getNumExplicitOperands() == 1) {
+ MI->getOperand(3).getReg() == ARM::PC &&
+ MI->getNumExplicitOperands() == 4) {
BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET));
MI->eraseFromParent();
MadeChange = true;
@@ -1315,3 +1372,201 @@ bool ARMConstantIslands::UndoLRSpillRestore() {
}
return MadeChange;
}
+
+bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ // Shrink ADR and LDR from constantpool.
+ for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+ CPUser &U = CPUsers[i];
+ unsigned Opcode = U.MI->getOpcode();
+ unsigned NewOpc = 0;
+ unsigned Scale = 1;
+ unsigned Bits = 0;
+ switch (Opcode) {
+ default: break;
+ case ARM::t2LEApcrel:
+ if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+ NewOpc = ARM::tLEApcrel;
+ Bits = 8;
+ Scale = 4;
+ }
+ break;
+ case ARM::t2LDRpci:
+ if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+ NewOpc = ARM::tLDRpci;
+ Bits = 8;
+ Scale = 4;
+ }
+ break;
+ }
+
+ if (!NewOpc)
+ continue;
+
+ unsigned UserOffset = GetOffsetOf(U.MI) + 4;
+ unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+ // FIXME: Check that the offset is a multiple of the scale when the scale is not 4.
+ if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+ U.MI->setDesc(TII->get(NewOpc));
+ MachineBasicBlock *MBB = U.MI->getParent();
+ BBSizes[MBB->getNumber()] -= 2;
+ AdjustBBOffsetsAfter(MBB, -2);
+ ++NumT2CPShrunk;
+ MadeChange = true;
+ }
+ }
+
+ MadeChange |= OptimizeThumb2Branches(MF);
+ MadeChange |= OptimizeThumb2JumpTables(MF);
+ return MadeChange;
+}
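The shrink test above uses an unsigned, scaled immediate: tLDRpci and tLEApcrel take 8 bits scaled by 4, so the pool entry must lie within 1020 bytes above the adjusted PC. As a one-line restatement of the bound:

    // Max positive offset of an unsigned, scaled immediate field.
    static unsigned maxUnsignedOffset(unsigned Bits, unsigned Scale) {
      return ((1u << Bits) - 1) * Scale; // e.g. Bits=8, Scale=4 -> 1020
    }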
+
+bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) {
+ ImmBranch &Br = ImmBranches[i];
+ unsigned Opcode = Br.MI->getOpcode();
+ unsigned NewOpc = 0;
+ unsigned Scale = 1;
+ unsigned Bits = 0;
+ switch (Opcode) {
+ default: break;
+ case ARM::t2B:
+ NewOpc = ARM::tB;
+ Bits = 11;
+ Scale = 2;
+ break;
+ case ARM::t2Bcc:
+ NewOpc = ARM::tBcc;
+ Bits = 8;
+ Scale = 2;
+ break;
+ }
+ if (!NewOpc)
+ continue;
+
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ if (BBIsInRange(Br.MI, DestBB, MaxOffs)) {
+ Br.MI->setDesc(TII->get(NewOpc));
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ BBSizes[MBB->getNumber()] -= 2;
+ AdjustBBOffsetsAfter(MBB, -2);
+ ++NumT2BrShrunk;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
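Branches differ from the constant-pool loads above in that their displacement is signed, so one bit goes to the sign. With the values in the switch, tB (11 bits, scale 2) reaches 2046 bytes and tBcc (8 bits, scale 2) reaches 254 bytes in each direction by this conservative formula:

    // Max magnitude of a signed, scaled branch displacement.
    static unsigned maxBranchOffset(unsigned Bits, unsigned Scale) {
      return ((1u << (Bits - 1)) - 1) * Scale;
    }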
+
+
+/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ // FIXME: After the tables are shrunk, can we get rid of some of the
+ // constantpool tables?
+ const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+ MachineInstr *MI = T2JumpTables[i];
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned NumOps = TID.getNumOperands();
+ unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2);
+ MachineOperand JTOP = MI->getOperand(JTOpIdx);
+ unsigned JTI = JTOP.getIndex();
+ assert(JTI < JT.size());
+
+ bool ByteOk = true;
+ bool HalfWordOk = true;
+ unsigned JTOffset = GetOffsetOf(MI) + 4;
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+ MachineBasicBlock *MBB = JTBBs[j];
+ unsigned DstOffset = BBOffsets[MBB->getNumber()];
+ // Negative offsets are not OK. FIXME: We should change the BB layout to make
+ // sure all the branches are forward.
+ if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+ ByteOk = false;
+ unsigned TBHLimit = ((1<<16)-1)*2;
+ if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+ HalfWordOk = false;
+ if (!ByteOk && !HalfWordOk)
+ break;
+ }
+
+ if (ByteOk || HalfWordOk) {
+ MachineBasicBlock *MBB = MI->getParent();
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ bool BaseRegKill = MI->getOperand(0).isKill();
+ if (!BaseRegKill)
+ continue;
+ unsigned IdxReg = MI->getOperand(1).getReg();
+ bool IdxRegKill = MI->getOperand(1).isKill();
+ MachineBasicBlock::iterator PrevI = MI;
+ if (PrevI == MBB->begin())
+ continue;
+
+ MachineInstr *AddrMI = --PrevI;
+ bool OptOk = true;
+ // Examine the instruction that calculates the jumptable entry address.
+ // If it's not the one just before the t2BR_JT, we won't be able to
+ // delete it, so the optimization isn't worth doing.
+ for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
+ const MachineOperand &MO = AddrMI->getOperand(k);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() != BaseReg) {
+ OptOk = false;
+ break;
+ }
+ if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) {
+ OptOk = false;
+ break;
+ }
+ }
+ if (!OptOk)
+ continue;
+
+ // The previous instruction should be a tLEApcrelJT or t2LEApcrelJT; we want
+ // to delete it as well.
+ MachineInstr *LeaMI = --PrevI;
+ if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
+ LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
+ LeaMI->getOperand(0).getReg() != BaseReg)
+ OptOk = false;
+
+ if (!OptOk)
+ continue;
+
+ unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH;
+ MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+ .addReg(IdxReg, getKillRegState(IdxRegKill))
+ .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+ .addImm(MI->getOperand(JTOpIdx+1).getImm());
+ // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
+ // is 2-byte aligned. For now, the asm printer will fix it up.
+ unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+ unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI);
+ OrigSize += TII->GetInstSizeInBytes(LeaMI);
+ OrigSize += TII->GetInstSizeInBytes(MI);
+
+ AddrMI->eraseFromParent();
+ LeaMI->eraseFromParent();
+ MI->eraseFromParent();
+
+ int delta = OrigSize - NewSize;
+ BBSizes[MBB->getNumber()] -= delta;
+ AdjustBBOffsetsAfter(MBB, -delta);
+
+ ++NumTBs;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
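The ByteOk/HalfWordOk tests bound the forward distance from the table to each destination: TBB entries are bytes scaled by two (510 bytes max) and TBH entries are halfwords scaled by two (131070 bytes max). A compact restatement of the limit check (hypothetical helper):

    static bool fitsInTB(unsigned FwdDist, bool ByteTable) {
      unsigned Limit = ByteTable ? ((1u << 8) - 1) * 2    // TBB: 510
                                 : ((1u << 16) - 1) * 2;  // TBH: 131070
      return FwdDist <= Limit;
    }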
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index a75ed3b..7170089 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -15,33 +15,31 @@
#include "llvm/ADT/FoldingSet.h"
#include "llvm/GlobalValue.h"
#include "llvm/Type.h"
-#include "llvm/Support/Streams.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
using namespace llvm;
ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id,
- ARMCP::ARMCPKind k,
+ ARMCP::ARMCPKind K,
unsigned char PCAdj,
const char *Modif,
bool AddCA)
: MachineConstantPoolValue((const Type*)gv->getType()),
- GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ GV(gv), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
Modifier(Modif), AddCurrentAddress(AddCA) {}
-ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id,
- ARMCP::ARMCPKind k,
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
+ const char *s, unsigned id,
unsigned char PCAdj,
const char *Modif,
bool AddCA)
- : MachineConstantPoolValue((const Type*)Type::Int32Ty),
- GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj),
+ : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
+ GV(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPValue), PCAdjust(PCAdj),
Modifier(Modif), AddCurrentAddress(AddCA) {}
-ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv,
- ARMCP::ARMCPKind k,
- const char *Modif)
- : MachineConstantPoolValue((const Type*)Type::Int32Ty),
- GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0),
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, const char *Modif)
+ : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
+ GV(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
Modifier(Modif) {}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
@@ -56,7 +54,6 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
if (CPV->GV == GV &&
CPV->S == S &&
CPV->LabelId == LabelId &&
- CPV->Kind == Kind &&
CPV->PCAdjust == PCAdjust)
return i;
}
@@ -65,31 +62,28 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
return -1;
}
+ARMConstantPoolValue::~ARMConstantPoolValue() {
+ free((void*)S);
+}
+
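With the strdup in the constructor, the value now owns a private copy of the symbol name and releases it here, so callers may pass transient strings. The ownership pattern in isolation (a hypothetical holder, with copying left unimplemented to avoid a double free):

    #include <cstdlib>
    #include <cstring>

    struct SymbolHolder {
      const char *S;
      explicit SymbolHolder(const char *s) : S(strdup(s)) {}
      ~SymbolHolder() { free((void*)S); }
    private:
      SymbolHolder(const SymbolHolder&);            // not implemented
      SymbolHolder &operator=(const SymbolHolder&); // not implemented
    };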
void
ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
ID.AddPointer(GV);
ID.AddPointer(S);
ID.AddInteger(LabelId);
- ID.AddInteger((unsigned)Kind);
ID.AddInteger(PCAdjust);
}
void ARMConstantPoolValue::dump() const {
- cerr << " " << *this;
+ errs() << " " << *this;
}
-void ARMConstantPoolValue::print(std::ostream &O) const {
- raw_os_ostream RawOS(O);
- print(RawOS);
-}
void ARMConstantPoolValue::print(raw_ostream &O) const {
if (GV)
O << GV->getName();
else
O << S;
- if (isNonLazyPointer()) O << "$non_lazy_ptr";
- else if (isStub()) O << "$stub";
if (Modifier) O << "(" << Modifier << ")";
if (PCAdjust != 0) {
O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index d2b9066..00c4808 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,17 +15,16 @@
#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
#include "llvm/CodeGen/MachineConstantPool.h"
-#include <iosfwd>
namespace llvm {
class GlobalValue;
+class LLVMContext;
namespace ARMCP {
enum ARMCPKind {
CPValue,
- CPNonLazyPtr,
- CPStub
+ CPLSDA
};
}
@@ -36,7 +35,7 @@ class ARMConstantPoolValue : public MachineConstantPoolValue {
GlobalValue *GV; // GlobalValue being loaded.
const char *S; // ExtSymbol being loaded.
unsigned LabelId; // Label id of the load.
- ARMCP::ARMCPKind Kind; // non_lazy_ptr or stub?
+ ARMCP::ARMCPKind Kind; // Value or LSDA?
unsigned char PCAdjust; // Extra adjustment if constantpool is pc relative.
// 8 for ARM, 4 for Thumb.
const char *Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8))
@@ -47,12 +46,12 @@ public:
ARMCP::ARMCPKind Kind = ARMCP::CPValue,
unsigned char PCAdj = 0, const char *Modifier = NULL,
bool AddCurrentAddress = false);
- ARMConstantPoolValue(const char *s, unsigned id,
- ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+ ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
unsigned char PCAdj = 0, const char *Modifier = NULL,
bool AddCurrentAddress = false);
- ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind,
- const char *Modifier);
+ ARMConstantPoolValue(GlobalValue *GV, const char *Modifier);
+ ARMConstantPoolValue();
+ ~ARMConstantPoolValue();
GlobalValue *getGV() const { return GV; }
@@ -61,27 +60,27 @@ public:
bool hasModifier() const { return Modifier != NULL; }
bool mustAddCurrentAddress() const { return AddCurrentAddress; }
unsigned getLabelId() const { return LabelId; }
- bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; }
- bool isStub() const { return Kind == ARMCP::CPStub; }
unsigned char getPCAdjustment() const { return PCAdjust; }
+ bool isLSDA() { return Kind == ARMCP::CPLSDA; }
+
+ virtual unsigned getRelocationInfo() const {
+ // FIXME: This is conservatively claiming that these entries require a
+ // relocation; we may be able to do better than this.
+ return 2;
+ }
+
virtual int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment);
virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
- void print(std::ostream *O) const { if (O) print(*O); }
- void print(std::ostream &O) const;
void print(raw_ostream *O) const { if (O) print(*O); }
void print(raw_ostream &O) const;
void dump() const;
};
- inline std::ostream &operator<<(std::ostream &O, const ARMConstantPoolValue &V) {
- V.print(O);
- return O;
-}
-
+
inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
V.print(O);
return O;
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
index 405b8f2..d5dae24 100644
--- a/lib/Target/ARM/ARMFrameInfo.h
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -15,15 +15,15 @@
#define ARM_FRAMEINFO_H
#include "ARM.h"
-#include "llvm/Target/TargetFrameInfo.h"
#include "ARMSubtarget.h"
+#include "llvm/Target/TargetFrameInfo.h"
namespace llvm {
class ARMFrameInfo : public TargetFrameInfo {
public:
explicit ARMFrameInfo(const ARMSubtarget &ST)
- : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) {
+ : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) {
}
};
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6485fc1..bebf4e8 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -21,6 +21,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,10 +31,10 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-using namespace llvm;
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
-static const unsigned arm_dsubreg_0 = 5;
-static const unsigned arm_dsubreg_1 = 6;
+using namespace llvm;
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
@@ -48,8 +49,9 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
const ARMSubtarget *Subtarget;
public:
- explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm)
- : SelectionDAGISel(tm), TM(tm),
+ explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm),
Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
}
@@ -57,7 +59,8 @@ public:
return "ARM Instruction Selection";
}
- /// getI32Imm - Return a target constant with the specified value, of type i32.
+ /// getI32Imm - Return a target constant of type i32 with the specified
+ /// value.
inline SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
@@ -74,6 +77,8 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDValue Op, SDValue N,
SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode4(SDValue Op, SDValue N, SDValue &Addr,
+ SDValue &Mode);
bool SelectAddrMode5(SDValue Op, SDValue N, SDValue &Base,
SDValue &Offset);
bool SelectAddrMode6(SDValue Op, SDValue N, SDValue &Addr, SDValue &Update,
@@ -118,15 +123,63 @@ private:
SDNode *SelectARMIndexedLoad(SDValue Op);
SDNode *SelectT2IndexedLoad(SDValue Op);
+ /// SelectDYN_ALLOC - Select a dynamic stack allocation for Thumb.
+ SDNode *SelectDYN_ALLOC(SDValue Op);
+
+ /// SelectVLD - Select NEON load intrinsics. NumVecs should
+ /// be 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// loads of D registers and even subregs and odd subregs of Q registers.
+ /// For NumVecs == 2, QOpcodes1 is not used.
+ SDNode *SelectVLD(SDValue Op, unsigned NumVecs, unsigned *DOpcodes,
+ unsigned *QOpcodes0, unsigned *QOpcodes1);
+
+ /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
+ /// be 2, 3 or 4. The opcode arrays specify the instructions used for
+ /// load/store of D registers and even subregs and odd subregs of Q registers.
+ SDNode *SelectVLDSTLane(SDValue Op, bool IsLoad, unsigned NumVecs,
+ unsigned *DOpcodes, unsigned *QOpcodes0,
+ unsigned *QOpcodes1);
+
+ /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
+ SDNode *SelectV6T2BitfieldExtractOp(SDValue Op, unsigned Opc);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
std::vector<SDValue> &OutOps);
+
+ /// PairDRegs - Insert a pair of double registers into an implicit def to
+ /// form a quad register.
+ SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1);
};
}
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+ if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+ Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+ return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node has the
+// specified opcode and an immediate integer right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+ return N->getOpcode() == Opc &&
+ isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+
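These predicates are the usual building blocks for pattern matches such as the bitfield-extract selection declared earlier. A toy illustration of the intended call shape, on a stand-in node type rather than SDNode (all names here are hypothetical):

    // Hypothetical stand-in: test opcode-plus-constant-RHS in one call.
    struct Node { unsigned Opcode; Node *Ops[2]; bool IsConst; unsigned Val; };

    static bool isInt32Imm(const Node *N, unsigned &Imm) {
      if (N && N->IsConst) { Imm = N->Val; return true; }
      return false;
    }

    static bool isOpcWithIntImm(const Node *N, unsigned Opc, unsigned &Imm) {
      return N && N->Opcode == Opc && isInt32Imm(N->Ops[1], Imm);
    }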
void ARMDAGToDAGISel::InstructionSelect() {
DEBUG(BB->dump());
@@ -144,7 +197,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue Op,
// Don't match base register only case. That is matched to a separate
// lower complexity pattern with explicit register operand.
if (ShOpcVal == ARM_AM::no_shift) return false;
-
+
BaseReg = N.getOperand(0);
unsigned ShImmVal = 0;
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
@@ -198,7 +251,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
MVT::i32);
return true;
}
-
+
// Match simple R +/- imm12 operands.
if (N.getOpcode() == ISD::ADD)
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
@@ -223,15 +276,15 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
return true;
}
}
-
+
// Otherwise this is R +/- [possibly shifted] R
ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
unsigned ShAmt = 0;
-
+
Base = N.getOperand(0);
Offset = N.getOperand(1);
-
+
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't fold
// it.
@@ -243,7 +296,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
ShOpcVal = ARM_AM::no_shift;
}
}
-
+
// Try matching (R shl C) + (R).
if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
@@ -260,7 +313,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
}
}
}
-
+
Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
MVT::i32);
return true;
@@ -315,7 +368,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
return true;
}
-
+
if (N.getOpcode() != ISD::ADD) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -326,7 +379,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
return true;
}
-
+
// If the RHS is +/- imm8, fold into addr mode.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
@@ -348,7 +401,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue Op, SDValue N,
return true;
}
}
-
+
Base = N.getOperand(0);
Offset = N.getOperand(1);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
@@ -377,6 +430,12 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDValue Op, SDValue N,
return true;
}
+bool ARMDAGToDAGISel::SelectAddrMode4(SDValue Op, SDValue N,
+ SDValue &Addr, SDValue &Mode) {
+ Addr = N;
+ Mode = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
SDValue &Base, SDValue &Offset) {
@@ -392,7 +451,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
MVT::i32);
return true;
}
-
+
// If the RHS is +/- imm8, fold into addr mode.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
@@ -417,7 +476,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
}
}
}
-
+
Base = N;
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
MVT::i32);
@@ -428,14 +487,14 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDValue Op, SDValue N,
SDValue &Addr, SDValue &Update,
SDValue &Opc) {
Addr = N;
- // The optional writeback is handled in ARMLoadStoreOpt.
+ // Default to no writeback.
Update = CurDAG->getRegister(0, MVT::i32);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(false), MVT::i32);
return true;
}
bool ARMDAGToDAGISel::SelectAddrModePC(SDValue Op, SDValue N,
- SDValue &Offset, SDValue &Label) {
+ SDValue &Offset, SDValue &Label) {
if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
Offset = N.getOperand(0);
SDValue N1 = N.getOperand(1);
@@ -451,13 +510,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue Op, SDValue N,
// FIXME dl should come from the parent load or store, not the address
DebugLoc dl = Op.getDebugLoc();
if (N.getOpcode() != ISD::ADD) {
- Base = N;
- // We must materialize a zero in a reg! Returning a constant here
- // wouldn't work without additional code to position the node within
- // ISel's topological ordering in a place where ISel will process it
- // normally. Instead, just explicitly issue a tMOVri8 node!
- Offset = SDValue(CurDAG->getTargetNode(ARM::tMOVi8, dl, MVT::i32,
- CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+ if (!NC || NC->getZExtValue() != 0)
+ return false;
+
+ Base = Offset = N;
return true;
}
@@ -567,7 +624,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue Op, SDValue N,
}
}
}
-
+
return false;
}
@@ -594,41 +651,70 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue Op, SDValue N,
bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue Op, SDValue N,
SDValue &Base, SDValue &OffImm) {
// Match simple R + imm12 operands.
- if (N.getOpcode() != ISD::ADD)
- return false;
+
+ // Base only.
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+ if (N.getOpcode() == ISD::FrameIndex) {
+ // Match frame index...
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ } else if (N.getOpcode() == ARMISD::Wrapper) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::TargetConstantPool)
+ return false; // We want to select t2LDRpci instead.
+ } else
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ if (SelectT2AddrModeImm8(Op, N, Base, OffImm))
+ // Let t2LDRi8 handle (R - imm8).
+ return false;
+
int RHSC = (int)RHS->getZExtValue();
- if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits.
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+
+ if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
}
}
- return false;
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
}
bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue Op, SDValue N,
SDValue &Base, SDValue &OffImm) {
- if (N.getOpcode() == ISD::ADD) {
+ // Match simple R - imm8 operands.
+ if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
- Base = N.getOperand(0);
+ int RHSC = (int)RHS->getSExtValue();
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+
+ if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ }
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
}
}
- } else if (N.getOpcode() == ISD::SUB) {
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
- Base = N.getOperand(0);
- OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32);
- return true;
- }
- }
}
return false;
@@ -643,7 +729,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDValue Op, SDValue N,
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) {
int RHSC = (int)RHS->getZExtValue();
if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
- OffImm = (AM == ISD::PRE_INC)
+ OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
? CurDAG->getTargetConstant(RHSC, MVT::i32)
: CurDAG->getTargetConstant(-RHSC, MVT::i32);
return true;
@@ -658,7 +744,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N,
if (N.getOpcode() == ISD::ADD) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int RHSC = (int)RHS->getZExtValue();
- if (((RHSC & 0x3) == 0) && (RHSC < 0 && RHSC > -0x400)) { // 8 bits.
+ if (((RHSC & 0x3) == 0) &&
+ ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits.
Base = N.getOperand(0);
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -681,20 +768,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDValue Op, SDValue N,
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
- // Base only.
- if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
- Base = N;
- if (N.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
- } else if (N.getOpcode() == ARMISD::Wrapper) {
- Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::TargetConstantPool)
- return false; // We want to select t2LDRpci instead.
- }
- OffReg = CurDAG->getRegister(0, MVT::i32);
- ShImm = CurDAG->getTargetConstant(0, MVT::i32);
- return true;
+ // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+ return false;
+ else if (RHSC < 0 && RHSC >= -255) // 8 bits
+ return false;
}
// Look for (R + R) or (R + (R << [1,2,3])).
@@ -708,8 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
ShOpcVal = ARM_AM::getShiftOpcForNode(Base);
if (ShOpcVal == ARM_AM::lsl)
std::swap(Base, OffReg);
- }
-
+ }
+
if (ShOpcVal == ARM_AM::lsl) {
// Check to see if the RHS of the shift is a constant, if not, we can't fold
// it.
@@ -723,11 +807,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue Op, SDValue N,
} else {
ShOpcVal = ARM_AM::no_shift;
}
- } else if (SelectT2AddrModeImm12(Op, N, Base, ShImm) ||
- SelectT2AddrModeImm8 (Op, N, Base, ShImm))
- // Don't match if it's possible to match to one of the r +/- imm cases.
- return false;
-
+ }
+
ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32);
return true;
@@ -746,7 +827,7 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) {
if (AM == ISD::UNINDEXED)
return NULL;
- MVT LoadedVT = LD->getMemoryVT();
+ EVT LoadedVT = LD->getMemoryVT();
SDValue Offset, AMOpc;
bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
unsigned Opcode = 0;
@@ -780,8 +861,8 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDValue Op) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
- MVT::Other, Ops, 6);
+ return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
+ MVT::Other, Ops, 6);
}
return NULL;
@@ -793,14 +874,14 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) {
if (AM == ISD::UNINDEXED)
return NULL;
- MVT LoadedVT = LD->getMemoryVT();
+ EVT LoadedVT = LD->getMemoryVT();
bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
SDValue Offset;
bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
unsigned Opcode = 0;
bool Match = false;
if (SelectT2AddrModeImm8Offset(Op, LD->getOffset(), Offset)) {
- switch (LoadedVT.getSimpleVT()) {
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
case MVT::i32:
Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
break;
@@ -828,13 +909,300 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDValue Op) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getTargetNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
- MVT::Other, Ops, 5);
+ return CurDAG->getMachineNode(Opcode, Op.getDebugLoc(), MVT::i32, MVT::i32,
+ MVT::Other, Ops, 5);
+ }
+
+ return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectDYN_ALLOC(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = Op.getValueType();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDValue Align = Op.getOperand(2);
+ SDValue SP = CurDAG->getRegister(ARM::SP, MVT::i32);
+ int32_t AlignVal = cast<ConstantSDNode>(Align)->getSExtValue();
+ if (AlignVal < 0)
+ // We need to align the stack. Use the Thumb1 tAND, which is the only
+ // Thumb instruction that can read and write SP. This matches a pseudo
+ // instruction that has a chain to ensure the result is written back to
+ // the stack pointer.
+ SP = SDValue(CurDAG->getMachineNode(ARM::tANDsp, dl, VT, SP, Align), 0);
+
+ bool isC = isa<ConstantSDNode>(Size);
+ uint32_t C = isC ? cast<ConstantSDNode>(Size)->getZExtValue() : ~0UL;
+ // Handle the most common case for both Thumb1 and Thumb2:
+ // tSUBspi - immediate is between 0 ... 508 inclusive.
+ if (C <= 508 && ((C & 3) == 0))
+ // FIXME: tSUBspi encodes scale 4 implicitly.
+ return CurDAG->SelectNodeTo(N, ARM::tSUBspi_, VT, MVT::Other, SP,
+ CurDAG->getTargetConstant(C/4, MVT::i32),
+ Chain);
+
+ if (Subtarget->isThumb1Only()) {
+ // Use tADDspr since Thumb1 does not have a sub r, sp, r. ARMISelLowering
+ // should have negated the size operand already. FIXME: We can't insert
+ // a new target-independent node at this stage, so we are forced to negate
+ // it earlier. Is there a better solution?
+ return CurDAG->SelectNodeTo(N, ARM::tADDspr_, VT, MVT::Other, SP, Size,
+ Chain);
+ } else if (Subtarget->isThumb2()) {
+ if (isC && Predicate_t2_so_imm(Size.getNode())) {
+ // t2SUBrSPi
+ SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi_, VT, MVT::Other, Ops, 3);
+ } else if (isC && Predicate_imm0_4095(Size.getNode())) {
+ // t2SUBrSPi12
+ SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi12_, VT, MVT::Other, Ops, 3);
+ } else {
+ // t2SUBrSPs
+ SDValue Ops[] = { SP, Size,
+ getI32Imm(ARM_AM::getSORegOpc(ARM_AM::lsl,0)), Chain };
+ return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPs_, VT, MVT::Other, Ops, 4);
+ }
+ }
+
+ // FIXME: Add ADD / SUB sp instructions for ARM.
+ return 0;
+}
+
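The fast path above only fires when the allocation size fits tSUBspi's immediate, which is scaled by 4: multiples of 4 in [0, 508], encoded as C/4. The check in isolation:

    // Sketch: can C be encoded in tSUBspi's scaled immediate field?
    static bool encodeTSUBspi(unsigned C, unsigned &Enc) {
      if (C <= 508 && (C & 3) == 0) { Enc = C / 4; return true; }
      return false;
    }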
+/// PairDRegs - Insert a pair of double registers into an implicit def to
+/// form a quad register.
+SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) {
+ DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDValue Undef =
+ SDValue(CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT), 0);
+ SDValue SubReg0 = CurDAG->getTargetConstant(ARM::DSUBREG_0, MVT::i32);
+ SDValue SubReg1 = CurDAG->getTargetConstant(ARM::DSUBREG_1, MVT::i32);
+ SDNode *Pair = CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl,
+ VT, Undef, V0, SubReg0);
+ return CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl,
+ VT, SDValue(Pair, 0), V1, SubReg1);
+}
+
+/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type
+/// for a 64-bit subregister of the vector.
+static EVT GetNEONSubregVT(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled NEON type");
+ case MVT::v16i8: return MVT::v8i8;
+ case MVT::v8i16: return MVT::v4i16;
+ case MVT::v4f32: return MVT::v2f32;
+ case MVT::v4i32: return MVT::v2i32;
+ case MVT::v2i64: return MVT::v1i64;
+ }
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLD(SDValue Op, unsigned NumVecs,
+ unsigned *DOpcodes, unsigned *QOpcodes0,
+ unsigned *QOpcodes1) {
+ assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ bool is64BitVector = VT.is64BitVector();
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ case MVT::v1i64: OpcodeIndex = 3; break;
+ // Quad-register operations:
+ case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8i16: OpcodeIndex = 1; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 2; break;
+ }
+
+ if (is64BitVector) {
+ unsigned Opc = DOpcodes[OpcodeIndex];
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ std::vector<EVT> ResTys(NumVecs, VT);
+ ResTys.push_back(MVT::Other);
+ return CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4);
+ }
+
+ EVT RegVT = GetNEONSubregVT(VT);
+ if (NumVecs == 2) {
+ // Quad registers are directly supported for VLD2,
+ // loading 2 pairs of D regs.
+ unsigned Opc = QOpcodes0[OpcodeIndex];
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ std::vector<EVT> ResTys(4, VT);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 4);
+ Chain = SDValue(VLd, 4);
+
+ // Combine the even and odd subregs to produce the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1));
+ ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+ }
+ } else {
+ // Otherwise, quad registers are loaded with two separate instructions,
+ // where one loads the even registers and the other loads the odd registers.
+
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ std::vector<EVT> ResTys(NumVecs, RegVT);
+ ResTys.push_back(MemAddr.getValueType());
+ ResTys.push_back(MVT::Other);
+
+ // Load the even subreg.
+ unsigned Opc = QOpcodes0[OpcodeIndex];
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, Chain };
+ SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 4);
+ Chain = SDValue(VLdA, NumVecs+1);
+
+ // Load the odd subreg.
+ Opc = QOpcodes1[OpcodeIndex];
+ const SDValue OpsB[] = { SDValue(VLdA, NumVecs), MemUpdate, MemOpc, Chain };
+ SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 4);
+ Chain = SDValue(VLdB, NumVecs+1);
+
+ // Combine the even and odd subregs to produce the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec));
+ ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+ }
+ }
+ ReplaceUses(SDValue(N, NumVecs), Chain);
+ return NULL;
+}
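A worked sketch of the writeback chaining above, assuming the v4i32 vld3
case: the even-half instruction loads three D registers (24 bytes) and
writes the incremented address back, which the odd-half instruction then
consumes.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Addr = 0x1000;
  uint64_t AfterEven = Addr + 3 * 8;  // VLD3q32a loads 3 D regs, writes back
  assert(AfterEven == 0x1018);        // VLD3q32b loads from here
  return 0;
}
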
+
+SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDValue Op, bool IsLoad,
+ unsigned NumVecs, unsigned *DOpcodes,
+ unsigned *QOpcodes0,
+ unsigned *QOpcodes1) {
+ assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+
+ SDValue Chain = N->getOperand(0);
+ unsigned Lane =
+ cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
+ EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
+ bool is64BitVector = VT.is64BitVector();
+
+ // Quad registers are handled by load/store of subregs. Find the subreg info.
+ unsigned NumElts = 0;
+ int SubregIdx = 0;
+ EVT RegVT = VT;
+ if (!is64BitVector) {
+ RegVT = GetNEONSubregVT(VT);
+ NumElts = RegVT.getVectorNumElements();
+ SubregIdx = (Lane < NumElts) ? ARM::DSUBREG_0 : ARM::DSUBREG_1;
+ }
+
+ unsigned OpcodeIndex;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vld/vst lane type");
+ // Double-register operations:
+ case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4i16: OpcodeIndex = 1; break;
+ case MVT::v2f32:
+ case MVT::v2i32: OpcodeIndex = 2; break;
+ // Quad-register operations:
+ case MVT::v8i16: OpcodeIndex = 0; break;
+ case MVT::v4f32:
+ case MVT::v4i32: OpcodeIndex = 1; break;
+ }
+
+ SmallVector<SDValue, 9> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(MemUpdate);
+ Ops.push_back(MemOpc);
+
+ unsigned Opc = 0;
+ if (is64BitVector) {
+ Opc = DOpcodes[OpcodeIndex];
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ Ops.push_back(N->getOperand(Vec+3));
+ } else {
+ // Check if this accesses the even or odd subreg of a Q register.
+ if (Lane < NumElts) {
+ Opc = QOpcodes0[OpcodeIndex];
+ } else {
+ Lane -= NumElts;
+ Opc = QOpcodes1[OpcodeIndex];
+ }
+ // Extract the subregs of the input vector.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT,
+ N->getOperand(Vec+3)));
+ }
+ Ops.push_back(getI32Imm(Lane));
+ Ops.push_back(Chain);
+
+ if (!IsLoad)
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5);
+
+ std::vector<EVT> ResTys(NumVecs, RegVT);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLdLn =
+ CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), NumVecs+5);
+ // For a 64-bit vector load to D registers, nothing more needs to be done.
+ if (is64BitVector)
+ return VLdLn;
+
+ // For 128-bit vectors, take the 64-bit results of the load and insert them
+ // as subregs into the result.
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT,
+ N->getOperand(Vec+3),
+ SDValue(VLdLn, Vec));
+ ReplaceUses(SDValue(N, Vec), QuadVec);
}
+ Chain = SDValue(VLdLn, NumVecs);
+ ReplaceUses(SDValue(N, NumVecs), Chain);
return NULL;
}
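A worked instance of the lane renumbering above for a v4i32 operand: each
64-bit half of the Q register holds NumElts = 2 elements, so lane 3 lives
in the odd D subreg (DSUBREG_1) at local lane 1.

#include <cassert>

int main() {
  unsigned NumElts = 2;
  unsigned Lane = 3;
  bool OddHalf = Lane >= NumElts;  // selects QOpcodes1 / DSUBREG_1
  if (OddHalf)
    Lane -= NumElts;
  assert(OddHalf && Lane == 1);
  return 0;
}
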
+SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDValue Op,
+ unsigned Opc) {
+ if (!Subtarget->hasV6T2Ops())
+ return NULL;
+
+ unsigned Shl_imm = 0;
+ if (isOpcWithIntImmediate(Op.getOperand(0).getNode(), ISD::SHL, Shl_imm)){
+ assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+ unsigned Srl_imm = 0;
+ if (isInt32Immediate(Op.getOperand(1), Srl_imm)) {
+ assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ unsigned Width = 32 - Srl_imm;
+ int LSB = Srl_imm - Shl_imm;
+ if ((LSB + Width) > 32)
+ return NULL;
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Op.getOperand(0).getOperand(0),
+ CurDAG->getTargetConstant(LSB, MVT::i32),
+ CurDAG->getTargetConstant(Width, MVT::i32),
+ getAL(CurDAG), Reg0 };
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32, Ops, 5);
+ }
+ }
+ return NULL;
+}
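A worked check of the Width/LSB arithmetic above: for (x << 8) >> 24 we
have Shl_imm = 8 and Srl_imm = 24, so Width = 32 - 24 = 8 and
LSB = 24 - 8 = 16, i.e. UBFX extracts bits [23:16]. The ubfx helper below
is a model of the instruction, not an LLVM API.

#include <cassert>
#include <cstdint>

static uint32_t ubfx(uint32_t x, unsigned lsb, unsigned width) {
  return (x >> lsb) & ((1u << width) - 1);  // zero-extended bitfield
}

int main() {
  uint32_t x = 0xAABBCCDD;
  assert(((x << 8) >> 24) == ubfx(x, 16, 8));  // both yield 0xBB
  return 0;
}
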
SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
SDNode *N = Op.getNode();
@@ -848,44 +1216,50 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
- if (Subtarget->isThumb()) {
- if (Subtarget->hasThumb2())
- // Thumb2 has the MOVT instruction, so all immediates can
- // be done with MOV + MOVT, at worst.
- UseCP = 0;
- else
+ if (Subtarget->hasThumb2())
+ // Thumb2-aware targets have the MOVT instruction, so all immediates can
+ // be done with MOV + MOVT, at worst.
+ UseCP = false;
+ else {
+ if (Subtarget->isThumb()) {
UseCP = (Val > 255 && // MOV
~Val > 255 && // MOV + MVN
!ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
- } else
- UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
- ARM_AM::getSOImmVal(~Val) == -1 && // MVN
- !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ } else
+ UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
+ ARM_AM::getSOImmVal(~Val) == -1 && // MVN
+ !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ }
+
if (UseCP) {
SDValue CPIdx =
- CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+ CurDAG->getTargetConstantPool(ConstantInt::get(
+ Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI.getPointerTy());
SDNode *ResNode;
- if (Subtarget->isThumb())
- ResNode = CurDAG->getTargetNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
- CPIdx, CurDAG->getEntryNode());
- else {
+ if (Subtarget->isThumb1Only()) {
+ SDValue Pred = CurDAG->getTargetConstant(0xEULL, MVT::i32);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+ ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
+ Ops, 4);
+ } else {
SDValue Ops[] = {
- CPIdx,
+ CPIdx,
CurDAG->getRegister(0, MVT::i32),
CurDAG->getTargetConstant(0, MVT::i32),
getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32),
CurDAG->getEntryNode()
};
- ResNode=CurDAG->getTargetNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
- Ops, 6);
+ ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+ Ops, 6);
}
ReplaceUses(Op, SDValue(ResNode, 0));
return NULL;
}
-
+
// Other cases are autogenerated.
break;
}
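A sketch of the so_imm test the ARM branch of UseCP relies on; this is an
independent re-statement of the encoding rule, not the ARM_AM helper: a
value is a single-instruction immediate when some even rotation of an
8-bit constant produces it.

#include <cassert>
#include <cstdint>

static bool isSOImm(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    uint32_t R = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
    if (R <= 0xFF)
      return true;  // fits an 8-bit constant rotated right by Rot
  }
  return false;
}

int main() {
  assert(isSOImm(0xFF000000));   // 0xFF ror 8: a single MOV
  assert(!isSOImm(0x12345678));  // needs a constant-pool load
  return 0;
}
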
@@ -893,80 +1267,106 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
// Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
CurDAG->getTargetConstant(0, MVT::i32));
} else {
+ unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+ ARM::t2ADDri : ARM::ADDri);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5);
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
}
}
- case ISD::ADD: {
- if (!Subtarget->isThumb())
- break;
- // Select add sp, c to tADDhirr.
- SDValue N0 = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0));
- RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1));
- if (LHSR && LHSR->getReg() == ARM::SP) {
- std::swap(N0, N1);
- std::swap(LHSR, RHSR);
- }
- if (RHSR && RHSR->getReg() == ARM::SP) {
- SDValue Val = SDValue(CurDAG->getTargetNode(ARM::tMOVlor2hir, dl,
- Op.getValueType(), N0, N0), 0);
- return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), Val, N1);
- }
+ case ARMISD::DYN_ALLOC:
+ return SelectDYN_ALLOC(Op);
+ case ISD::SRL:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(Op,
+ Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX))
+ return I;
+ break;
+ case ISD::SRA:
+ if (SDNode *I = SelectV6T2BitfieldExtractOp(Op,
+ Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX))
+ return I;
break;
- }
case ISD::MUL:
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
break;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned RHSV = C->getZExtValue();
if (!RHSV) break;
if (isPowerOf2_32(RHSV-1)) { // 2^n+1?
+ unsigned ShImm = Log2_32(RHSV-1);
+ if (ShImm >= 32)
+ break;
SDValue V = Op.getOperand(0);
- unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1));
- SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
- CurDAG->getTargetConstant(ShImm, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ }
}
if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
+ unsigned ShImm = Log2_32(RHSV+1);
+ if (ShImm >= 32)
+ break;
SDValue V = Op.getOperand(0);
- unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1));
- SDValue Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
- CurDAG->getTargetConstant(ShImm, MVT::i32),
- getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+ SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5);
+ } else {
+ SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+ return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ }
}
}
break;
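A worked check of the strength reduction above: multiplying by 2^n+1
selects an add with an LSL shifter operand (ADDrs / t2ADDrs), and
multiplying by 2^n-1 selects a reverse subtract (RSBrs / t2RSBrs).

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 12345;
  assert(x * 9 == x + (x << 3));  // 9 = 2^3 + 1
  assert(x * 7 == (x << 3) - x);  // 7 = 2^3 - 1
  return 0;
}
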
case ARMISD::FMRRD:
- return CurDAG->getTargetNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
- Op.getOperand(0), getAL(CurDAG),
- CurDAG->getRegister(0, MVT::i32));
+ return CurDAG->getMachineNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
+ Op.getOperand(0), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32));
case ISD::UMUL_LOHI: {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4);
+ } else {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getTargetNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
}
case ISD::SMUL_LOHI: {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ if (Subtarget->isThumb1Only())
+ break;
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4);
+ } else {
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
CurDAG->getRegister(0, MVT::i32) };
- return CurDAG->getTargetNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+ }
}
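A worked model of what a single UMULL (and analogously SMULL) computes for
the UMUL_LOHI / SMUL_LOHI cases above: the two results are the low and
high 32 bits of the full 64-bit product.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xDEADBEEF, B = 0x12345678;
  uint64_t P = (uint64_t)A * B;
  uint32_t Lo = (uint32_t)P;
  uint32_t Hi = (uint32_t)(P >> 32);
  assert((((uint64_t)Hi << 32) | Lo) == P);
  return 0;
}
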
case ISD::LOAD: {
SDNode *ResNode = 0;
- if (Subtarget->isThumb2())
+ if (Subtarget->isThumb() && Subtarget->hasThumb2())
ResNode = SelectT2IndexedLoad(Op);
else
ResNode = SelectARMIndexedLoad(Op);
@@ -988,7 +1388,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
// Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
// Pattern complexity = 6 cost = 1 size = 0
- unsigned Opc = Subtarget->isThumb() ?
+ unsigned Opc = Subtarget->isThumb() ?
((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
SDValue Chain = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
@@ -1003,8 +1403,8 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
cast<ConstantSDNode>(N2)->getZExtValue()),
MVT::i32);
SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
- SDNode *ResNode = CurDAG->getTargetNode(Opc, dl, MVT::Other,
- MVT::Flag, Ops, 5);
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ MVT::Flag, Ops, 5);
Chain = SDValue(ResNode, 0);
if (Op.getNode()->getNumValues() == 2) {
InFlag = SDValue(ResNode, 1);
@@ -1014,8 +1414,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return NULL;
}
case ARMISD::CMOV: {
- bool isThumb = Subtarget->isThumb();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
@@ -1024,39 +1423,79 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
assert(N2.getOpcode() == ISD::Constant);
assert(N3.getOpcode() == ISD::Register);
- // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Pattern complexity = 18 cost = 1 size = 0
- SDValue CPTmp0;
- SDValue CPTmp1;
- SDValue CPTmp2;
- if (!isThumb && VT == MVT::i32 &&
- SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
- SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N2)->getZExtValue()),
- MVT::i32);
- SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
- return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCs, MVT::i32, Ops, 7);
- }
+ if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
+ // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+ // Pattern complexity = 18 cost = 1 size = 0
+ SDValue CPTmp0;
+ SDValue CPTmp1;
+ SDValue CPTmp2;
+ if (Subtarget->isThumb()) {
+ if (SelectT2ShifterOperandReg(Op, N1, CPTmp0, CPTmp1)) {
+ unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
+ unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
+ unsigned Opc = 0;
+ switch (SOShOp) {
+ case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
+ case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
+ case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
+ case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
+ default:
+ llvm_unreachable("Unknown so_reg opcode!");
+ break;
+ }
+ SDValue SOShImm =
+ CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, CPTmp0, SOShImm, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(), Opc, MVT::i32,Ops, 6);
+ }
+ } else {
+ if (SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::MOVCCs, MVT::i32, Ops, 7);
+ }
+ }
- // Pattern: (ARMcmov:i32 GPR:i32:$false,
- // (imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true,
- // (imm:i32):$cc)
- // Emits: (MOVCCi:i32 GPR:i32:$false,
- // (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc)
- // Pattern complexity = 10 cost = 1 size = 0
- if (VT == MVT::i32 &&
- N3.getOpcode() == ISD::Constant &&
- Predicate_so_imm(N3.getNode())) {
- SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N1)->getZExtValue()),
- MVT::i32);
- Tmp1 = Transform_so_imm_XFORM(Tmp1.getNode());
- SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
- cast<ConstantSDNode>(N2)->getZExtValue()),
- MVT::i32);
- SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
- return CurDAG->SelectNodeTo(Op.getNode(), ARM::MOVCCi, MVT::i32, Ops, 5);
+ // Pattern: (ARMcmov:i32 GPR:i32:$false,
+ // (imm:i32)<<P:Predicate_so_imm>>:$true,
+ // (imm:i32):$cc)
+ // Emits: (MOVCCi:i32 GPR:i32:$false,
+ // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
+ // Pattern complexity = 10 cost = 1 size = 0
+ if (N3.getOpcode() == ISD::Constant) {
+ if (Subtarget->isThumb()) {
+ if (Predicate_t2_so_imm(N3.getNode())) {
+ SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N1)->getZExtValue()),
+ MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::t2MOVCCi, MVT::i32, Ops, 5);
+ }
+ } else {
+ if (Predicate_so_imm(N3.getNode())) {
+ SDValue Tmp1 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N1)->getZExtValue()),
+ MVT::i32);
+ SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+ cast<ConstantSDNode>(N2)->getZExtValue()),
+ MVT::i32);
+ SDValue Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+ return CurDAG->SelectNodeTo(Op.getNode(),
+ ARM::MOVCCi, MVT::i32, Ops, 5);
+ }
+ }
+ }
}
// Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
@@ -1073,23 +1512,25 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
MVT::i32);
SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Illegal conditional move type!");
break;
case MVT::i32:
- Opc = isThumb ? ARM::tMOVCCr : ARM::MOVCCr;
+ Opc = Subtarget->isThumb()
+ ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
+ : ARM::MOVCCr;
break;
case MVT::f32:
Opc = ARM::FCPYScc;
break;
case MVT::f64:
Opc = ARM::FCPYDcc;
- break;
+ break;
}
return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
}
case ARMISD::CNEG: {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
@@ -1103,7 +1544,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
MVT::i32);
SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Illegal conditional move type!");
break;
case MVT::f32:
@@ -1116,104 +1557,308 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->SelectNodeTo(Op.getNode(), Opc, VT, Ops, 5);
}
- case ISD::DECLARE: {
- SDValue Chain = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- SDValue N2 = Op.getOperand(2);
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
- // FIXME: handle VLAs.
- if (!FINode) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ case ARMISD::VZIP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VZIPd8; break;
+ case MVT::v4i16: Opc = ARM::VZIPd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VZIPd32; break;
+ case MVT::v16i8: Opc = ARM::VZIPq8; break;
+ case MVT::v8i16: Opc = ARM::VZIPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VZIPq32; break;
}
- if (N2.getOpcode() == ARMISD::PIC_ADD && isa<LoadSDNode>(N2.getOperand(0)))
- N2 = N2.getOperand(0);
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(N2);
- if (!Ld) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
+ }
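A conceptual model of the two-result VZIP matched above, shown for the
v8i8 case; this is an assumption-level re-statement of the NEON semantics,
not LLVM code: both registers are rewritten in place, the first taking the
interleaved low halves and the second the interleaved high halves.

#include <array>
#include <cassert>
#include <cstdint>

static void vzip8(std::array<uint8_t, 8> &A, std::array<uint8_t, 8> &B) {
  std::array<uint8_t, 8> Lo, Hi;
  for (int i = 0; i < 4; ++i) {
    Lo[2 * i] = A[i];      Lo[2 * i + 1] = B[i];
    Hi[2 * i] = A[4 + i];  Hi[2 * i + 1] = B[4 + i];
  }
  A = Lo;
  B = Hi;
}

int main() {
  std::array<uint8_t, 8> A = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<uint8_t, 8> B = {10, 11, 12, 13, 14, 15, 16, 17};
  vzip8(A, B);
  assert(A[0] == 0 && A[1] == 10 && A[6] == 3 && A[7] == 13);
  return 0;
}
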
+ case ARMISD::VUZP: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VUZPd8; break;
+ case MVT::v4i16: Opc = ARM::VUZPd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VUZPd32; break;
+ case MVT::v16i8: Opc = ARM::VUZPq8; break;
+ case MVT::v8i16: Opc = ARM::VUZPq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VUZPq32; break;
}
- SDValue BasePtr = Ld->getBasePtr();
- assert(BasePtr.getOpcode() == ARMISD::Wrapper &&
- isa<ConstantPoolSDNode>(BasePtr.getOperand(0)) &&
- "llvm.dbg.variable should be a constantpool node");
- ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(BasePtr.getOperand(0));
- GlobalValue *GV = 0;
- if (CP->isMachineConstantPoolEntry()) {
- ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)CP->getMachineCPVal();
- GV = ACPV->getGV();
- } else
- GV = dyn_cast<GlobalValue>(CP->getConstVal());
- if (!GV) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
+ }
+ case ARMISD::VTRN: {
+ unsigned Opc = 0;
+ EVT VT = N->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return NULL;
+ case MVT::v8i8: Opc = ARM::VTRNd8; break;
+ case MVT::v4i16: Opc = ARM::VTRNd16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VTRNd32; break;
+ case MVT::v16i8: Opc = ARM::VTRNq8; break;
+ case MVT::v8i16: Opc = ARM::VTRNq16; break;
+ case MVT::v4f32:
+ case MVT::v4i32: Opc = ARM::VTRNq32; break;
}
-
- SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
- TLI.getPointerTy());
- SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy());
- SDValue Ops[] = { Tmp1, Tmp2, Chain };
- return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
- MVT::Other, Ops, 3);
+ return CurDAG->getMachineNode(Opc, dl, VT, VT,
+ N->getOperand(0), N->getOperand(1));
}
- case ISD::CONCAT_VECTORS: {
- MVT VT = Op.getValueType();
- assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
- "unexpected CONCAT_VECTORS");
- SDValue N0 = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
- SDNode *Result =
- CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
- if (N0.getOpcode() != ISD::UNDEF)
- Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
- SDValue(Result, 0), N0,
- CurDAG->getTargetConstant(arm_dsubreg_0,
- MVT::i32));
- if (N1.getOpcode() != ISD::UNDEF)
- Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
- SDValue(Result, 0), N1,
- CurDAG->getTargetConstant(arm_dsubreg_1,
- MVT::i32));
- return Result;
- }
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ EVT VT = N->getValueType(0);
+ unsigned Opc = 0;
+
+ switch (IntNo) {
+ default:
+ break;
+
+ case Intrinsic::arm_neon_vld2: {
+ unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+ ARM::VLD2d32, ARM::VLD2d64 };
+ unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 };
+ return SelectVLD(Op, 2, DOpcodes, QOpcodes, 0);
+ }
- case ISD::VECTOR_SHUFFLE: {
- MVT VT = Op.getValueType();
-
- // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
- // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
- // transformed first into a lane number and then to both a subregister
- // index and an adjusted lane number.) If the source operand is a
- // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- if (VT.is128BitVector() && SVOp->isSplat() &&
- Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
- Op.getOperand(1).getOpcode() == ISD::UNDEF) {
- unsigned LaneVal = SVOp->getSplatIndex();
-
- MVT HalfVT;
- unsigned Opc = 0;
- switch (VT.getVectorElementType().getSimpleVT()) {
- default: assert(false && "unhandled VDUP splat type");
- case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
- case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
- case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
- case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
+ case Intrinsic::arm_neon_vld3: {
+ unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16,
+ ARM::VLD3d32, ARM::VLD3d64 };
+ unsigned QOpcodes0[] = { ARM::VLD3q8a, ARM::VLD3q16a, ARM::VLD3q32a };
+ unsigned QOpcodes1[] = { ARM::VLD3q8b, ARM::VLD3q16b, ARM::VLD3q32b };
+ return SelectVLD(Op, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld4: {
+ unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16,
+ ARM::VLD4d32, ARM::VLD4d64 };
+ unsigned QOpcodes0[] = { ARM::VLD4q8a, ARM::VLD4q16a, ARM::VLD4q32a };
+ unsigned QOpcodes1[] = { ARM::VLD4q8b, ARM::VLD4q16b, ARM::VLD4q32b };
+ return SelectVLD(Op, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld2lane: {
+ unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD2LNq16a, ARM::VLD2LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD2LNq16b, ARM::VLD2LNq32b };
+ return SelectVLDSTLane(Op, true, 2, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld3lane: {
+ unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD3LNq16a, ARM::VLD3LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD3LNq16b, ARM::VLD3LNq32b };
+ return SelectVLDSTLane(Op, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vld4lane: {
+ unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 };
+ unsigned QOpcodes0[] = { ARM::VLD4LNq16a, ARM::VLD4LNq32a };
+ unsigned QOpcodes1[] = { ARM::VLD4LNq16b, ARM::VLD4LNq32b };
+ return SelectVLDSTLane(Op, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst2: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst2 type");
+ case MVT::v8i8: Opc = ARM::VST2d8; break;
+ case MVT::v4i16: Opc = ARM::VST2d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST2d32; break;
+ case MVT::v1i64: Opc = ARM::VST2d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6);
+ }
+ // Quad registers are stored as pairs of double registers.
+ EVT RegVT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst2 type");
+ case MVT::v16i8: Opc = ARM::VST2q8; RegVT = MVT::v8i8; break;
+ case MVT::v8i16: Opc = ARM::VST2q16; RegVT = MVT::v4i16; break;
+ case MVT::v4f32: Opc = ARM::VST2q32; RegVT = MVT::v2f32; break;
+ case MVT::v4i32: Opc = ARM::VST2q32; RegVT = MVT::v2i32; break;
+ }
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ D0, D1, D2, D3, Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8);
+ }
+
+ case Intrinsic::arm_neon_vst3: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst3 type");
+ case MVT::v8i8: Opc = ARM::VST3d8; break;
+ case MVT::v4i16: Opc = ARM::VST3d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST3d32; break;
+ case MVT::v1i64: Opc = ARM::VST3d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4),
+ N->getOperand(5), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7);
}
+ // Quad registers are stored with two separate instructions, where one
+ // stores the even registers and the other stores the odd registers.
+ EVT RegVT;
+ unsigned Opc2 = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst3 type");
+ case MVT::v16i8:
+ Opc = ARM::VST3q8a; Opc2 = ARM::VST3q8b; RegVT = MVT::v8i8; break;
+ case MVT::v8i16:
+ Opc = ARM::VST3q16a; Opc2 = ARM::VST3q16b; RegVT = MVT::v4i16; break;
+ case MVT::v4f32:
+ Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2f32; break;
+ case MVT::v4i32:
+ Opc = ARM::VST3q32a; Opc2 = ARM::VST3q32b; RegVT = MVT::v2i32; break;
+ }
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(5));
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, D0, D2, D4, Chain };
+ SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+ MVT::Other, OpsA, 7);
+ Chain = SDValue(VStA, 1);
+
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(5));
+ MemAddr = SDValue(VStA, 0);
+ const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc, D1, D3, D5, Chain };
+ SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(),
+ MVT::Other, OpsB, 7);
+ Chain = SDValue(VStB, 1);
+ ReplaceUses(SDValue(N, 0), Chain);
+ return NULL;
+ }
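A sketch of how the quad operands are split for the two vst3 instructions
above, assuming the v4i32 case; the register numbers below are
illustrative: Qn is the pair {D2n, D2n+1}, the "a" opcode stores the even
D halves with address writeback, and the "b" opcode stores the odd halves.

#include <cassert>

struct QReg { unsigned DLo, DHi; };  // Qn = {D2n, D2n+1}

static void splitForVST3(const QReg Q[3], unsigned Even[3], unsigned Odd[3]) {
  for (int i = 0; i < 3; ++i) {
    Even[i] = Q[i].DLo;  // operands of VST3q32a (writeback form)
    Odd[i]  = Q[i].DHi;  // operands of VST3q32b at the updated address
  }
}

int main() {
  QReg Q[3] = {{0, 1}, {2, 3}, {4, 5}};  // Q0..Q2 = D0..D5
  unsigned Even[3], Odd[3];
  splitForVST3(Q, Even, Odd);
  assert(Even[0] == 0 && Even[1] == 2 && Even[2] == 4);
  assert(Odd[0] == 1 && Odd[1] == 3 && Odd[2] == 5);
  return 0;
}
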
- // The source operand needs to be changed to a subreg of the original
- // 128-bit operand, and the lane number needs to be adjusted accordingly.
- unsigned NumElts = VT.getVectorNumElements() / 2;
- unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
- SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
- SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
- SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
- dl, HalfVT, N->getOperand(0), SR);
- return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
+ case Intrinsic::arm_neon_vst4: {
+ SDValue MemAddr, MemUpdate, MemOpc;
+ if (!SelectAddrMode6(Op, N->getOperand(2), MemAddr, MemUpdate, MemOpc))
+ return NULL;
+ SDValue Chain = N->getOperand(0);
+ VT = N->getOperand(3).getValueType();
+ if (VT.is64BitVector()) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst4 type");
+ case MVT::v8i8: Opc = ARM::VST4d8; break;
+ case MVT::v4i16: Opc = ARM::VST4d16; break;
+ case MVT::v2f32:
+ case MVT::v2i32: Opc = ARM::VST4d32; break;
+ case MVT::v1i64: Opc = ARM::VST4d64; break;
+ }
+ const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
+ N->getOperand(3), N->getOperand(4),
+ N->getOperand(5), N->getOperand(6), Chain };
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 8);
+ }
+ // Quad registers are stored with two separate instructions, where one
+ // stores the even registers and the other stores the odd registers.
+ EVT RegVT;
+ unsigned Opc2 = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("unhandled vst4 type");
+ case MVT::v16i8:
+ Opc = ARM::VST4q8a; Opc2 = ARM::VST4q8b; RegVT = MVT::v8i8; break;
+ case MVT::v8i16:
+ Opc = ARM::VST4q16a; Opc2 = ARM::VST4q16b; RegVT = MVT::v4i16; break;
+ case MVT::v4f32:
+ Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2f32; break;
+ case MVT::v4i32:
+ Opc = ARM::VST4q32a; Opc2 = ARM::VST4q32b; RegVT = MVT::v2i32; break;
+ }
+ // Enable writeback to the address register.
+ MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+ SDValue D0 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(3));
+ SDValue D2 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(4));
+ SDValue D4 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(5));
+ SDValue D6 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+ N->getOperand(6));
+ const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc,
+ D0, D2, D4, D6, Chain };
+ SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+ MVT::Other, OpsA, 8);
+ Chain = SDValue(VStA, 1);
+
+ SDValue D1 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(3));
+ SDValue D3 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(4));
+ SDValue D5 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(5));
+ SDValue D7 = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+ N->getOperand(6));
+ MemAddr = SDValue(VStA, 0);
+ const SDValue OpsB[] = { MemAddr, MemUpdate, MemOpc,
+ D1, D3, D5, D7, Chain };
+ SDNode *VStB = CurDAG->getMachineNode(Opc2, dl, MemAddr.getValueType(),
+ MVT::Other, OpsB, 8);
+ Chain = SDValue(VStB, 1);
+ ReplaceUses(SDValue(N, 0), Chain);
+ return NULL;
}
- break;
+ case Intrinsic::arm_neon_vst2lane: {
+ unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST2LNq16a, ARM::VST2LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST2LNq16b, ARM::VST2LNq32b };
+ return SelectVLDSTLane(Op, false, 2, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst3lane: {
+ unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST3LNq16a, ARM::VST3LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST3LNq16b, ARM::VST3LNq32b };
+ return SelectVLDSTLane(Op, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case Intrinsic::arm_neon_vst4lane: {
+ unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 };
+ unsigned QOpcodes0[] = { ARM::VST4LNq16a, ARM::VST4LNq32a };
+ unsigned QOpcodes1[] = { ARM::VST4LNq16b, ARM::VST4LNq32b };
+ return SelectVLDSTLane(Op, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+ }
}
}
@@ -1224,20 +1869,17 @@ bool ARMDAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
std::vector<SDValue> &OutOps) {
assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
-
- SDValue Base, Offset, Opc;
- if (!SelectAddrMode2(Op, Op, Base, Offset, Opc))
- return true;
-
- OutOps.push_back(Base);
- OutOps.push_back(Offset);
- OutOps.push_back(Opc);
+ // Require the address to be in a register. That is safe for all ARM
+ // variants and it is hard to do anything much smarter without knowing
+ // how the operand is used.
+ OutOps.push_back(Op);
return false;
}
/// createARMISelDag - This pass converts a legalized DAG into a
/// ARM-specific DAG, ready for instruction scheduling.
///
-FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM) {
- return new ARMDAGToDAGISel(TM);
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new ARMDAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 41c9ecc..426cecb 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -17,9 +17,11 @@
#include "ARMConstantPoolValue.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
@@ -36,74 +38,101 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <sstream>
using namespace llvm;
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
- MVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
+ EVT PromotedBitwiseVT) {
if (VT != PromotedLdStVT) {
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
- setOperationAction(ISD::STORE, VT, Promote);
- AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
}
- MVT ElemTy = VT.getVectorElementType();
+ EVT ElemTy = VT.getVectorElementType();
if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
- setOperationAction(ISD::VSETCC, VT, Custom);
+ setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ if (ElemTy != MVT::i32) {
+ setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
+ }
+ setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
if (VT.isInteger()) {
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
}
// Promote all bit-wise operations.
if (VT.isInteger() && VT != PromotedBitwiseVT) {
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
- }
-}
-
-void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+ setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::AND, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ setOperationAction(ISD::OR, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::OR, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
+ AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
+ PromotedBitwiseVT.getSimpleVT());
+ }
+
+ // Neon does not support vector divide/remainder operations.
+ setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+}
+
+void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
addRegisterClass(VT, ARM::DPRRegisterClass);
addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}
-void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
addRegisterClass(VT, ARM::QPRRegisterClass);
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+ if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+ return new TargetLoweringObjectFileMachO();
+ return new ARMElfTargetObjectFile();
+}
+
ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
- : TargetLowering(TM), ARMPCLabelIndex(0) {
+ : TargetLowering(TM, createTLOF(TM)), ARMPCLabelIndex(0) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
if (Subtarget->isTargetDarwin()) {
@@ -188,11 +217,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setLibcallName(RTLIB::SRL_I128, 0);
setLibcallName(RTLIB::SRA_I128, 0);
- if (Subtarget->isThumb())
+ // Libcalls should use the AAPCS base standard ABI, even if hard float
+ // is in effect, as per the ARM RTABI specification, section 4.1.2.
+ if (Subtarget->isAAPCS_ABI()) {
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+ setLibcallCallingConv(static_cast<RTLIB::Libcall>(i),
+ CallingConv::ARM_AAPCS);
+ }
+ }
+
+ if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
else
addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
@@ -213,6 +251,39 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+ // neither Neon nor VFP supports any arithmetic operations on it.
+ setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+
+ // Neon does not support some operations on v1i64 and v2i64 types.
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
+ setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
@@ -246,7 +317,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
// i64 operation support.
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
@@ -287,7 +358,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
- setOperationAction(ISD::RET, MVT::Other, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -300,7 +370,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ // FIXME: Shouldn't need this, since no register is used, but the legalizer
+ // doesn't yet know how to avoid that for SjLj.
+ setExceptionSelectorRegister(ARM::R0);
+ if (Subtarget->isThumb())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
@@ -309,7 +386,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb())
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only())
// Turn f64->i64 into FMRRD, i64 -> f64 to FMDRR iff target supports vfp2.
setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
@@ -339,7 +416,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
}
@@ -347,7 +424,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FPOW, MVT::f32, Expand);
// int <-> fp are custom expanded into bit_convert + ARMISD ops.
- if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+ if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
@@ -361,26 +438,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setStackPointerRegisterToSaveRestore(ARM::SP);
setSchedulingPreference(SchedulingForRegPressure);
- setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
- setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);
-
- if (!Subtarget->isThumb()) {
- // Use branch latency information to determine if-conversion limits.
- // FIXME: If-converter should use instruction latency of the branch being
- // eliminated to compute the threshold. For ARMv6, the branch "latency"
- // varies depending on whether it's dynamically or statically predicted
- // and on whether the destination is in the prefetch buffer.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const InstrItineraryData &InstrItins = Subtarget->getInstrItineraryData();
- unsigned Latency= InstrItins.getLatency(TII->get(ARM::Bcc).getSchedClass());
- if (Latency > 1) {
- setIfCvtBlockSizeLimit(Latency-1);
- if (Latency > 2)
- setIfCvtDupBlockSizeLimit(Latency-2);
- } else {
- setIfCvtBlockSizeLimit(10);
- setIfCvtDupBlockSizeLimit(2);
- }
+
+ // FIXME: If-converter should use instruction latency to determine
+ // profitability rather than relying on fixed limits.
+ if (Subtarget->getCPUString() == "generic") {
+ // Generic (and overly aggressive) if-conversion limits.
+ setIfCvtBlockSizeLimit(10);
+ setIfCvtDupBlockSizeLimit(2);
+ } else if (Subtarget->hasV6Ops()) {
+ setIfCvtBlockSizeLimit(2);
+ setIfCvtDupBlockSizeLimit(1);
+ } else {
+ setIfCvtBlockSizeLimit(3);
+ setIfCvtDupBlockSizeLimit(2);
}
maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type
@@ -401,6 +471,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::tCALL: return "ARMISD::tCALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
+ case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
@@ -425,6 +496,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+ case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
+
case ARMISD::VCEQ: return "ARMISD::VCEQ";
case ARMISD::VCGE: return "ARMISD::VCGE";
case ARMISD::VCGEU: return "ARMISD::VCGEU";
@@ -453,13 +526,21 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
- case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ";
+ case ARMISD::VDUP: return "ARMISD::VDUP";
+ case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
+ case ARMISD::VEXT: return "ARMISD::VEXT";
+ case ARMISD::VREV64: return "ARMISD::VREV64";
+ case ARMISD::VREV32: return "ARMISD::VREV32";
+ case ARMISD::VREV16: return "ARMISD::VREV16";
+ case ARMISD::VZIP: return "ARMISD::VZIP";
+ case ARMISD::VUZP: return "ARMISD::VUZP";
+ case ARMISD::VTRN: return "ARMISD::VTRN";
}
}
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
- return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
+ return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1;
}
//===----------------------------------------------------------------------===//
@@ -469,7 +550,7 @@ unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code!");
+ default: llvm_unreachable("Unknown condition code!");
case ISD::SETNE: return ARMCC::NE;
case ISD::SETEQ: return ARMCC::EQ;
case ISD::SETGT: return ARMCC::GT;
@@ -483,15 +564,12 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
}
}
-/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It
-/// returns true if the operands should be inverted to form the proper
-/// comparison.
-static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
ARMCC::CondCodes &CondCode2) {
- bool Invert = false;
CondCode2 = ARMCC::AL;
switch (CC) {
- default: assert(0 && "Unknown FP condition!");
+ default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
case ISD::SETGT:
@@ -499,7 +577,7 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
- case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break;
+ case ISD::SETOLE: CondCode = ARMCC::LS; break;
case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
@@ -513,24 +591,16 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETNE:
case ISD::SETUNE: CondCode = ARMCC::NE; break;
}
- return Invert;
}
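A small model of why SETOLE can now map directly to ARMCC::LS with no
operand swap, assuming the standard VFP compare outcomes (EQ sets Z and C,
LT sets N with C clear, GT sets C, unordered sets C and V): LS tests
(C == 0 || Z == 1), which holds exactly for the ordered less-than and
equal cases.

#include <cassert>

struct Flags { bool N, Z, C, V; };

static bool condLS(Flags F) { return !F.C || F.Z; }

int main() {
  Flags EQ = {false, true,  true,  false};
  Flags LT = {true,  false, false, false};
  Flags GT = {false, false, true,  false};
  Flags UN = {false, false, true,  true};
  assert(condLS(EQ) && condLS(LT) && !condLS(GT) && !condLS(UN));
  return 0;
}
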
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
-//
-// The lower operations present on calling convention works on this order:
-// LowerCALL (virt regs --> phys regs, virt regs --> stack)
-// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
-// LowerRET (virt regs --> phys regs)
-// LowerCALL (phys regs --> virt regs)
-//
//===----------------------------------------------------------------------===//
#include "ARMGenCallingConv.inc"
// APCS f64 is in register pairs, possibly split to stack
-static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
CCState &State, bool CanFail) {
static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
@@ -560,7 +630,7 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -573,7 +643,7 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
}
// AAPCS f64 is in aligned register pairs
-static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
CCState &State, bool CanFail) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
@@ -603,7 +673,7 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -615,7 +685,7 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo, CCState &State) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
@@ -635,7 +705,7 @@ static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -646,7 +716,7 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -656,49 +726,48 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
-CCAssignFn *ARMTargetLowering::CCAssignFnForNode(unsigned CC,
- bool Return) const {
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+ bool Return,
+ bool isVarArg) const {
switch (CC) {
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
- // Use target triple & subtarget features to do actual dispatch.
- if (Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() &&
- FloatABIType == FloatABI::Hard)
- return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
- else
- return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
- } else
- return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ // Use target triple & subtarget features to do actual dispatch.
+ if (Subtarget->isAAPCS_ABI()) {
+ if (Subtarget->hasVFP2() &&
+ FloatABIType == FloatABI::Hard && !isVarArg)
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ else
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ } else
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
case CallingConv::ARM_AAPCS_VFP:
- return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+ return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
case CallingConv::ARM_AAPCS:
- return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+ return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
case CallingConv::ARM_APCS:
- return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
}
}
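
A compact model of the dispatch above; the enum is invented for illustration, but the decision logic is exactly the one in the switch. Note why isVarArg matters: the VFP registers are never used to pass variadic arguments, so vararg calls fall back to base AAPCS even under hard-float.

    enum CCFn { APCS, AAPCS, AAPCS_VFP };

    CCFn selectCC(bool isAAPCS_ABI, bool hasVFP2, bool hardFloat, bool isVarArg) {
      if (!isAAPCS_ABI)
        return APCS;
      // Hard-float VFP convention only for non-variadic calls.
      return (hasVFP2 && hardFloat && !isVarArg) ? AAPCS_VFP : AAPCS;
    }
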
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. The returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode *ARMTargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG) {
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
- DebugLoc dl = TheCall->getDebugLoc();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool isVarArg = TheCall->isVarArg();
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
- CCInfo.AnalyzeCallResult(TheCall,
- CCAssignFnForNode(CallingConv, /* Return*/ true));
-
- SmallVector<SDValue, 8> ResultVals;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins,
+ CCAssignFnForNode(CallConv, /* Return*/ true,
+ isVarArg));
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -743,20 +812,17 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
}
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val);
break;
}
- ResultVals.push_back(Val);
+ InVals.push_back(Val);
}
- // Merge everything together with a MERGE_VALUES node.
- ResultVals.push_back(Chain);
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -776,11 +842,11 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
/// LowerMemOpCallTo - Store the argument to the stack.
SDValue
-ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr,
- const CCValAssign &VA, SDValue Chain,
- SDValue Arg, ISD::ArgFlagsTy Flags) {
- DebugLoc dl = TheCall->getDebugLoc();
+ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
@@ -791,14 +857,13 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
PseudoSourceValue::getStack(), LocMemOffset);
}
-void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVector<SDValue, 8> &MemOpChains,
ISD::ArgFlagsTy Flags) {
- DebugLoc dl = TheCall->getDebugLoc();
SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
@@ -811,27 +876,31 @@ void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA,
- Chain, fmrrd.getValue(1), Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
+ dl, DAG, NextVA,
+ Flags));
}
}
-/// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
+/// LowerCall - Lower a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
-SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- MVT RetVT = TheCall->getRetValType(0);
- SDValue Chain = TheCall->getChain();
- unsigned CC = TheCall->getCallingConv();
- bool isVarArg = TheCall->isVarArg();
- SDValue Callee = TheCall->getCallee();
- DebugLoc dl = TheCall->getDebugLoc();
+SDValue
+ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC, /* Return*/ false));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs,
+ CCAssignFnForNode(CallConv, /* Return*/ false,
+ isVarArg));
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -851,12 +920,12 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = TheCall->getArg(realArgIdx);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(realArgIdx);
+ SDValue Arg = Outs[realArgIdx].Val;
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
@@ -872,7 +941,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
break;
}
- // f64 and v2f64 are passed in i32 pairs and must be split into pieces
+ // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
if (VA.needsCustom()) {
if (VA.getLocVT() == MVT::v2f64) {
SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
@@ -880,23 +949,23 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(1, MVT::i32));
- PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass,
+ PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
VA = ArgLocs[++i]; // skip ahead to next loc
if (VA.isRegLoc()) {
- PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass,
+ PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
} else {
assert(VA.isMemLoc());
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, Op1, Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+ dl, DAG, VA, Flags));
}
} else {
- PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
StackPtr, MemOpChains, Flags);
}
} else if (VA.isRegLoc()) {
@@ -906,8 +975,8 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, Arg, Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
}
}
@@ -933,17 +1002,17 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
GlobalValue *GV = G->getGlobal();
isDirect = true;
- bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
- GV->hasLinkOnceLinkage());
+ bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
getTargetMachine().getRelocationModel() != Reloc::Static;
isARMFunc = !Subtarget->isThumb() || isStub;
// ARM call to a local ARM function is predicable.
isLocalARMFunc = !Subtarget->isThumb() && !isExt;
// tBX takes a register source operand.
- if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
- ARMCP::CPStub, 4);
+ if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
+ ARMPCLabelIndex,
+ ARMCP::CPValue, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -960,9 +1029,9 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
isARMFunc = !Subtarget->isThumb() || isStub;
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
- if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex,
- ARMCP::CPStub, 4);
+ if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+ Sym, ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -977,7 +1046,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// FIXME: handle tail calls differently.
unsigned CallOpc;
if (Subtarget->isThumb()) {
- if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc))
+ if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
@@ -986,7 +1055,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
: ARMISD::CALL_NOLINK;
}
- if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) {
+ if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) {
    // implicit def LR - LR mustn't be allocated as GPR:$dst of CALL_NOLINK
Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag);
InFlag = Chain.getValue(1);
@@ -1011,30 +1080,31 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag);
- if (RetVT != MVT::Other)
+ if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
- Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
}
-SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
- // The chain is always operand #0
- SDValue Chain = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
// CCState - Info about the registers and stack slots.
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
- // Analyze return values of ISD::RET.
- CCInfo.AnalyzeReturn(Op.getNode(), CCAssignFnForNode(CC, /* Return */ true));
+ // Analyze outgoing return values.
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
+ isVarArg));
// If this is the first return lowered for this function, add
// the regs to the liveout set for the function.
@@ -1053,12 +1123,10 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums
- SDValue Arg = Op.getOperand(realRVLocIdx*2+1);
+ SDValue Arg = Outs[realRVLocIdx].Val;
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
@@ -1112,13 +1180,13 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOVi.
static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
// FIXME there is no actual debug info here
DebugLoc dl = Op.getDebugLoc();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@@ -1137,11 +1205,11 @@ SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) {
DebugLoc dl = GA->getDebugLoc();
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
- PCAdj, "tlsgd", true);
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, "tlsgd", true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, NULL, 0);
@@ -1154,12 +1222,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Argument;
- Entry.Ty = (const Type *) Type::Int32Ty;
+ Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
Args.push_back(Entry);
// FIXME: is there useful debug info available here?
std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false, false, false,
- 0, CallingConv::C, false,
+ LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
return CallResult.first;
}
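
What the general-dynamic sequence boils down to at run time, as a hedged C++ sketch; the descriptor layout and the exact __tls_get_addr contract follow the ARM ELF TLS ABI and should be treated as assumptions here:

    extern "C" void *__tls_get_addr(void *tls_descriptor);

    void *generalDynamicAddress(void *tlsgd_entry) {
      // The "tlsgd" constant-pool entry resolves (PC-relatively) to a TLS
      // descriptor; the libcall returns the variable's address in the
      // current thread's TLS block.
      return __tls_get_addr(tlsgd_entry);
    }
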
@@ -1173,16 +1242,16 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
DebugLoc dl = GA->getDebugLoc();
SDValue Offset;
SDValue Chain = DAG.getEntryNode();
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
// Get the Thread Pointer
SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
- if (GV->isDeclaration()){
+ if (GV->isDeclaration()) {
// initial exec model
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
- PCAdj, "gottpoff", true);
+ new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, "gottpoff", true);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
@@ -1194,8 +1263,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
} else {
// local exec model
- ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, ARMCP::CPValue, "tpoff");
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff");
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, NULL, 0);
@@ -1222,59 +1290,47 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
if (RelocM == Reloc::PIC_) {
bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT");
+ new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT");
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- CPAddr, NULL, 0);
+ CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
SDValue Chain = Result.getValue(1);
SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
if (!UseGOTOFF)
- Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
+ Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+ PseudoSourceValue::getGOT(), 0);
return Result;
} else {
SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
}
}
-/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol
-/// even in non-static mode.
-static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) {
- // If symbol visibility is hidden, the extra load is not needed if
- // the symbol is definitely defined in the current translation unit.
- bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
- if (GV->hasHiddenVisibility() && (!isDecl && !GV->hasCommonLinkage()))
- return false;
- return RelocM != Reloc::Static && (isDecl || GV->isWeakForLinker());
-}
-
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
- bool IsIndirect = GVIsIndirectSymbol(GV, RelocM);
SDValue CPAddr;
if (RelocM == Reloc::Static)
CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
else {
- unsigned PCAdj = (RelocM != Reloc::PIC_)
- ? 0 : (Subtarget->isThumb() ? 4 : 8);
- ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr
- : ARMCP::CPValue;
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
- Kind, PCAdj);
+ unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1286,7 +1342,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
- if (IsIndirect)
+
+ if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
Result = DAG.getLoad(PtrVT, dl, Chain, Result, NULL, 0);
return Result;
@@ -1296,32 +1353,55 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
SelectionDAG &DAG){
assert(Subtarget->isTargetELF() &&
"GLOBAL OFFSET TABLE not implemented for non-ELF targets");
- MVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_",
- ARMPCLabelIndex,
- ARMCP::CPValue, PCAdj);
+ ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+ "_GLOBAL_OFFSET_TABLE_",
+ ARMPCLabelIndex, PCAdj);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ PseudoSourceValue::getConstantPool(), 0);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}
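
The PCAdj values used throughout these lowerings (8 for ARM, 4 for Thumb) come from how the pipeline exposes PC: reading PC at the PIC_ADD yields that instruction's address plus 8 in ARM state and plus 4 in Thumb state, so the pool entry must be biased by the same amount. A minimal sketch of the arithmetic:

    // Constant-pool entry (conceptually): Target - (pic_label + bias)
    // PIC_ADD then computes: entry + PC_read_at_label == Target
    unsigned picBias(bool isThumb) { return isThumb ? 4 : 8; }
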
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
DebugLoc dl = Op.getDebugLoc();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::arm_thread_pointer:
- return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+ case Intrinsic::arm_thread_pointer: {
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+ }
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy();
+ DebugLoc dl = Op.getDebugLoc();
+ Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+ SDValue CPAddr;
+ unsigned PCAdj = (RelocM != Reloc::PIC_)
+ ? 0 : (Subtarget->isThumb() ? 4 : 8);
+ ARMConstantPoolValue *CPV =
+ new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
+ ARMCP::CPLSDA, PCAdj);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ SDValue Result =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, NULL, 0);
+ SDValue Chain = Result.getValue(1);
+
+ if (RelocM == Reloc::PIC_) {
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+ Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+ }
+ return Result;
+ }
case Intrinsic::eh_sjlj_setjmp:
- SDValue Res = DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32,
- Op.getOperand(1));
- return Res;
+ return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1));
}
}
@@ -1330,13 +1410,60 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
}
SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = Node->getValueType(0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDValue Align = Op.getOperand(2);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+ unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue();
+ unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment();
+ if (AlignVal > StackAlign)
+    // Do this now, since the selection pass cannot introduce new
+    // target-independent nodes.
+ Align = DAG.getConstant(-(uint64_t)AlignVal, VT);
+
+  // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, so we end up
+  // using an "add r, sp, r" instead. Negate the size now so we don't have to
+  // do an even more horrible hack later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (AFI->isThumb1OnlyFunction()) {
+ bool Negate = true;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size);
+ if (C) {
+ uint32_t Val = C->getZExtValue();
+ if (Val <= 508 && ((Val & 3) == 0))
+ Negate = false;
+ }
+ if (Negate)
+ Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size);
+ }
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops1[] = { Chain, Size, Align };
+ SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3);
+ Chain = Res.getValue(1);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue());
+ SDValue Ops2[] = { Res, Chain };
+ return DAG.getMergeValues(Ops2, 2, dl);
+}
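
The magic numbers in the Thumb1 branch above correspond to the "add/sub sp, #imm" encoding, which holds a 7-bit immediate scaled by 4 (so 0..508 in word steps); sizes in that range need no negation trick. A self-contained restatement of the check:

    #include <cstdint>

    // True when the allocation size fits Thumb1's SP-adjusting immediate form.
    bool fitsThumb1SPImmediate(uint32_t bytes) {
      return bytes <= 508 && (bytes & 3) == 0;   // 127 * 4 == 508, word-aligned
    }
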
+
+SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
DebugLoc dl) {
@@ -1344,7 +1471,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
TargetRegisterClass *RC;
- if (AFI->isThumbFunction())
+ if (AFI->isThumb1OnlyFunction())
RC = ARM::tGPRRegisterClass;
else
RC = ARM::GPRRegisterClass;
@@ -1371,21 +1498,25 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
}
SDValue
-ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- SDValue Root = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = MF.getFunction()->getCallingConv();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.getNode(),
- CCAssignFnForNode(CC, /* Return*/ false));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins,
+ CCAssignFnForNode(CallConv, /* Return*/ false,
+ isVarArg));
SmallVector<SDValue, 16> ArgValues;
@@ -1394,7 +1525,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Arguments stored in registers.
if (VA.isRegLoc()) {
- MVT RegVT = VA.getLocVT();
+ EVT RegVT = VA.getLocVT();
SDValue ArgValue;
if (VA.needsCustom()) {
@@ -1404,43 +1535,43 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
if (VA.getLocVT() == MVT::v2f64) {
SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
- Root, DAG, dl);
+ Chain, DAG, dl);
VA = ArgLocs[++i]; // skip ahead to next loc
SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
- Root, DAG, dl);
+ Chain, DAG, dl);
ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
} else
- ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl);
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
} else {
TargetRegisterClass *RC;
- if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32)
+
+ if (RegVT == MVT::f32)
RC = ARM::SPRRegisterClass;
- else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)
+ else if (RegVT == MVT::f64)
RC = ARM::DPRRegisterClass;
- else if (AFI->isThumbFunction())
- RC = ARM::tGPRRegisterClass;
+ else if (RegVT == MVT::v2f64)
+ RC = ARM::QPRRegisterClass;
+ else if (RegVT == MVT::i32)
+ RC = (AFI->isThumb1OnlyFunction() ?
+ ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
else
- RC = ARM::GPRRegisterClass;
-
- assert((RegVT == MVT::i32 || RegVT == MVT::f32 ||
- (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) &&
- "RegVT not supported by FORMAL_ARGUMENTS Lowering");
+ llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
@@ -1457,7 +1588,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
break;
}
- ArgValues.push_back(ArgValue);
+ InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
@@ -1470,7 +1601,7 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
}
}
@@ -1500,31 +1631,27 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
SmallVector<SDValue, 4> MemOps;
for (; NumGPRs < 4; ++NumGPRs) {
TargetRegisterClass *RC;
- if (AFI->isThumbFunction())
+ if (AFI->isThumb1OnlyFunction())
RC = ARM::tGPRRegisterClass;
else
RC = ARM::GPRRegisterClass;
unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
DAG.getConstant(4, getPointerTy()));
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
} else
// This will point to the next argument passed via stack.
VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
/// isFloatingPointZero - Return true if this is +0.0.
@@ -1543,46 +1670,46 @@ static bool isFloatingPointZero(SDValue Op) {
return false;
}
-static bool isLegalCmpImmediate(unsigned C, bool isThumb) {
- return ( isThumb && (C & ~255U) == 0) ||
- (!isThumb && ARM_AM::getSOImmVal(C) != -1);
+static bool isLegalCmpImmediate(unsigned C, bool isThumb1Only) {
+ return ( isThumb1Only && (C & ~255U) == 0) ||
+ (!isThumb1Only && ARM_AM::getSOImmVal(C) != -1);
}
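
For reference, the ARM-mode half of this predicate accepts the classic "so_imm" encoding: an 8-bit value rotated right by an even amount. A standalone sketch mirroring what ARM_AM::getSOImmVal tests:

    #include <cstdint>

    static uint32_t rotl32(uint32_t v, unsigned r) {
      return r ? ((v << r) | (v >> (32 - r))) : v;
    }

    bool isARMSOImmediate(uint32_t v) {
      // Rotating left by r undoes a rotate-right by r; even rotations only.
      for (unsigned rot = 0; rot < 32; rot += 2)
        if (rotl32(v, rot) <= 0xFF)
          return true;
      return false;
    }
    // e.g. 0xFF000000 is encodable (0xFF ror 8); 0x101 is not.
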
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
static SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMCC, SelectionDAG &DAG, bool isThumb,
+ SDValue &ARMCC, SelectionDAG &DAG, bool isThumb1Only,
DebugLoc dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalCmpImmediate(C, isThumb)) {
+ if (!isLegalCmpImmediate(C, isThumb1Only)) {
// Constant does not fit, try adjusting it by one?
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGE:
- if (isLegalCmpImmediate(C-1, isThumb)) {
+ if (isLegalCmpImmediate(C-1, isThumb1Only)) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
RHS = DAG.getConstant(C-1, MVT::i32);
}
break;
case ISD::SETULT:
case ISD::SETUGE:
- if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) {
+ if (C > 0 && isLegalCmpImmediate(C-1, isThumb1Only)) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
RHS = DAG.getConstant(C-1, MVT::i32);
}
break;
case ISD::SETLE:
case ISD::SETGT:
- if (isLegalCmpImmediate(C+1, isThumb)) {
+ if (isLegalCmpImmediate(C+1, isThumb1Only)) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
RHS = DAG.getConstant(C+1, MVT::i32);
}
break;
case ISD::SETULE:
case ISD::SETUGT:
- if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) {
+ if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb1Only)) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
RHS = DAG.getConstant(C+1, MVT::i32);
}
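
A concrete instance of the adjust-by-one trick used in this switch: on Thumb1, "cmp r0, #256" is not encodable (immediates are 0..255), but x < 256 is the same predicate as x <= 255, whose constant is encodable. A checkable model:

    #include <cassert>

    int main() {
      const unsigned vals[] = {0u, 255u, 256u, 257u};
      for (unsigned x : vals)
        assert((x < 256u) == (x <= 255u));   // SETULT 256 == SETULE 255
      return 0;
    }
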
@@ -1620,7 +1747,7 @@ static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -1631,13 +1758,12 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
if (LHS.getValueType() == MVT::i32) {
SDValue ARMCC;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
- SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl);
return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
}
ARMCC::CondCodes CondCode, CondCode2;
- if (FPCCToARMCC(CC, CondCode, CondCode2))
- std::swap(TrueVal, FalseVal);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
@@ -1666,16 +1792,14 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
if (LHS.getValueType() == MVT::i32) {
SDValue ARMCC;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
- SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb(), dl);
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb1Only(), dl);
return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
Chain, Dest, ARMCC, CCR,Cmp);
}
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
ARMCC::CondCodes CondCode, CondCode2;
- if (FPCCToARMCC(CC, CondCode, CondCode2))
- // Swap the LHS/RHS of the comparison if needed.
- std::swap(LHS, RHS);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
@@ -1697,21 +1821,32 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) {
SDValue Index = Op.getOperand(2);
DebugLoc dl = Op.getDebugLoc();
- MVT PTy = getPointerTy();
+ EVT PTy = getPointerTy();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
- SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+ SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
- bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
- Addr = DAG.getLoad(isPIC ? (MVT)MVT::i32 : PTy, dl,
- Chain, Addr, NULL, 0);
- Chain = Addr.getValue(1);
- if (isPIC)
+ if (Subtarget->isThumb2()) {
+    // Thumb2 uses a two-level jump. That is, it jumps into the jump table,
+    // which does another jump to the destination. This also makes it easier
+    // to translate it to TBB / TBH later.
+ // FIXME: This might not work if the function is extremely large.
+ return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+ Addr, Op.getOperand(2), JTI, UId);
+ }
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, NULL, 0);
+ Chain = Addr.getValue(1);
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
- return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ } else {
+ Addr = DAG.getLoad(PTy, dl, Chain, Addr, NULL, 0);
+ Chain = Addr.getValue(1);
+ return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+ }
}
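
The TBB/TBH instructions the comment alludes to branch through a table of compact deltas rather than full addresses; a model of TBB's effect (per the ARM architecture manual, offered as a sketch):

    #include <cstdint>

    // TBB: branch to PC + 2 * byte-sized table entry (TBH uses uint16_t).
    uint32_t tbbTarget(uint32_t pc, const uint8_t *table, unsigned index) {
      return pc + 2u * table[index];
    }
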
static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
@@ -1723,7 +1858,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned Opc =
Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;
@@ -1737,8 +1872,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Tmp0 = Op.getOperand(0);
SDValue Tmp1 = Op.getOperand(1);
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
- MVT SrcVT = Tmp1.getValueType();
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Tmp1.getValueType();
SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl);
SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
@@ -1749,7 +1884,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
@@ -1784,7 +1919,7 @@ ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
unsigned EmittedNumMemOps = 0;
- MVT VT = MVT::i32;
+ EVT VT = MVT::i32;
unsigned VTSize = 4;
unsigned i = 0;
const unsigned MAX_LOADS_IN_LDM = 6;
@@ -1890,45 +2025,55 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Zero vectors are used to represent vector negation and in those cases
// will be implemented with the NEON VNEG instruction. However, VNEG does
// not support i64 elements, so sometimes the zero vectors will need to be
// explicitly constructed. For those cases, and potentially other uses in
- // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted
+ // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
- if (VT.getSizeInBits() == 64)
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
- else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i8);
+ SmallVector<SDValue, 8> Ops;
+ MVT TVT;
+
+ if (VT.getSizeInBits() == 64) {
+ Ops.assign(8, Cst); TVT = MVT::v8i8;
+ } else {
+ Ops.assign(16, Cst); TVT = MVT::v16i8;
+ }
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
/// getOnesVector - Returns a vector of specified type with all bits set.
///
-static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
- // type. This ensures they get CSE'd.
+  // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their
+ // dest type. This ensures they get CSE'd.
SDValue Vec;
- SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
- if (VT.getSizeInBits() == 64)
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
- else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8);
+ SmallVector<SDValue, 8> Ops;
+ MVT TVT;
+
+ if (VT.getSizeInBits() == 64) {
+ Ops.assign(8, Cst); TVT = MVT::v8i8;
+ } else {
+ Ops.assign(16, Cst); TVT = MVT::v16i8;
+ }
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
DebugLoc dl = N->getDebugLoc();
// Lower vector shifts on NEON to use VSHL.
@@ -1947,7 +2092,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
// NEON uses the same intrinsics for both left and right shifts. For
// right shifts, the shift amounts are negative, so negate the vector of
// shift amounts.
- MVT ShiftVT = N->getOperand(1).getValueType();
+ EVT ShiftVT = N->getOperand(1).getValueType();
SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
getZeroVector(ShiftVT, DAG, dl),
N->getOperand(1));
@@ -1959,8 +2104,11 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), NegatedCount);
}
- assert(VT == MVT::i64 &&
- (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ // We can get here for a node like i32 = ISD::SHL i32, i64
+ if (VT != MVT::i64)
+ return SDValue();
+
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"Unknown shift to lower!");
// We only lower SRA, SRL of 1 here, all others use generic lowering.
@@ -1969,7 +2117,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
return SDValue();
// If we are in thumb mode, we don't have RRX.
- if (ST->isThumb()) return SDValue();
+ if (ST->isThumb1Only()) return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
@@ -1998,13 +2146,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
DebugLoc dl = Op.getDebugLoc();
if (Op.getOperand(1).getValueType().isFloatingPoint()) {
switch (SetCCOpcode) {
- default: assert(0 && "Illegal FP comparison"); break;
+ default: llvm_unreachable("Illegal FP comparison"); break;
case ISD::SETUNE:
case ISD::SETNE: Invert = true; // Fallthrough
case ISD::SETOEQ:
@@ -2043,7 +2191,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
} else {
// Integer comparisons.
switch (SetCCOpcode) {
- default: assert(0 && "Illegal integer comparison"); break;
+ default: llvm_unreachable("Illegal integer comparison"); break;
case ISD::SETNE: Invert = true;
case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
case ISD::SETLT: Swap = true;
@@ -2056,7 +2204,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
}
- // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero).
+ // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
if (Opc == ARMISD::VCEQ) {
SDValue AndOp;
@@ -2147,7 +2295,7 @@ static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
}
default:
- assert(0 && "unexpected size for isVMOVSplat");
+ llvm_unreachable("unexpected size for isVMOVSplat");
break;
}
@@ -2174,22 +2322,123 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
SplatBitSize, DAG);
}
-static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
+ bool &ReverseVEXT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ReverseVEXT = false;
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, it may still be
+ // a VEXT but the source vectors must be swapped.
+ ExpectedElt += 1;
+ if (ExpectedElt == NumElts * 2) {
+ ExpectedElt = 0;
+ ReverseVEXT = true;
+ }
+
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ // Adjust the index value if the source operands will be swapped.
+ if (ReverseVEXT)
+ Imm -= NumElts;
+
+ return true;
+}
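
The same check restated over plain ints, with worked masks: for two v4i16 inputs whose elements are numbered 0..3 (V1) and 4..7 (V2), <2,3,4,5> is a VEXT with immediate 2, while <6,7,0,1> wraps past the last element, so the inputs swap and the immediate becomes 6 - 4 = 2.

    #include <vector>

    bool isVEXT(const std::vector<int> &M, unsigned NumElts,
                bool &ReverseVEXT, unsigned &Imm) {
      ReverseVEXT = false;
      Imm = M[0];
      unsigned Expected = Imm;
      for (unsigned i = 1; i < NumElts; ++i) {
        if (++Expected == NumElts * 2) { Expected = 0; ReverseVEXT = true; }
        if ((unsigned)M[i] != Expected)
          return false;
      }
      if (ReverseVEXT)
        Imm -= NumElts;
      return true;
    }
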
+
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned BlockSize) {
+ assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+ "Only possible block sizes for VREV are: 16, 32, 64");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ unsigned BlockElts = M[0] + 1;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned) M[i] !=
+ (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+ return false;
+ }
+
+ return true;
+}
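
The expected-index formula reverses each block in place. For example, VREV64.16 on v8i16 has BlockElts = 64/16 = 4, and the formula reproduces the mask <3,2,1,0,7,6,5,4>:

    #include <cstdio>

    int main() {
      const unsigned NumElts = 8, BlockElts = 4;   // v8i16 under VREV64.16
      for (unsigned i = 0; i < NumElts; ++i)
        std::printf("%u ", (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts));
      std::printf("\n");                           // prints: 3 2 1 0 7 6 5 4
      return 0;
    }
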
+
+static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((unsigned) M[i] != i + WhichResult ||
+ (unsigned) M[i+1] != i + NumElts + WhichResult)
+ return false;
+ }
+ return true;
+}
+
+static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if ((unsigned) M[i] != 2 * i + WhichResult)
+ return false;
+ }
+
+ // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+ if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+ return false;
+
+ return true;
+}
+
+static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
+ unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((unsigned) M[i] != Idx ||
+ (unsigned) M[i+1] != Idx + NumElts)
+ return false;
+ Idx += 1;
+ }
+
+ // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+ if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+ return false;
+
+ return true;
+}
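
Concretely, for v4i32 (elements 0..3 from the first input, 4..7 from the second), the masks these three predicates accept are:

    const int VTRN0[] = {0, 4, 2, 6};   // vtrn.32, first result
    const int VTRN1[] = {1, 5, 3, 7};   // vtrn.32, second result
    const int VUZP0[] = {0, 2, 4, 6};   // vuzp.32, first result
    const int VUZP1[] = {1, 3, 5, 7};   // vuzp.32, second result
    const int VZIP0[] = {0, 4, 1, 5};   // vzip.32, first result
    const int VZIP1[] = {2, 6, 3, 7};   // vzip.32, second result
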
+
+static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
// Canonicalize all-zeros and all-ones vectors.
- ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
+ ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
if (ConstVal->isNullValue())
return getZeroVector(VT, DAG, dl);
if (ConstVal->isAllOnesValue())
return getOnesVector(VT, DAG, dl);
- MVT CanonicalVT;
+ EVT CanonicalVT;
if (VT.is64BitVector()) {
switch (Val.getValueType().getSizeInBits()) {
case 8: CanonicalVT = MVT::v8i8; break;
case 16: CanonicalVT = MVT::v4i16; break;
case 32: CanonicalVT = MVT::v2i32; break;
case 64: CanonicalVT = MVT::v1i64; break;
- default: assert(0 && "unexpected splat element type"); break;
+ default: llvm_unreachable("unexpected splat element type"); break;
}
} else {
assert(VT.is128BitVector() && "unknown splat vector size");
@@ -2198,7 +2447,7 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
case 16: CanonicalVT = MVT::v8i16; break;
case 32: CanonicalVT = MVT::v4i32; break;
case 64: CanonicalVT = MVT::v2i64; break;
- default: assert(0 && "unexpected splat element type"); break;
+ default: llvm_unreachable("unexpected splat element type"); break;
}
}
@@ -2213,69 +2462,291 @@ static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
- BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
- assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize, DAG);
- if (Val.getNode())
- return BuildSplat(Val, Op.getValueType(), DAG, dl);
+ if (SplatBitSize <= 64) {
+ SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
+ SplatUndef.getZExtValue(), SplatBitSize, DAG);
+ if (Val.getNode())
+ return BuildSplat(Val, VT, DAG, dl);
+ }
+ }
+
+ // If there are only 2 elements in a 128-bit vector, insert them into an
+ // undef vector. This handles the common case for 128-bit vector argument
+ // passing, where the insertions should be translated to subreg accesses
+ // with no real instructions.
+ if (VT.is128BitVector() && Op.getNumOperands() == 2) {
+ SDValue Val = DAG.getUNDEF(VT);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0,
+ DAG.getIntPtrConstant(0));
+ if (Op1.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1,
+ DAG.getIntPtrConstant(1));
+ return Val;
}
return SDValue();
}
-static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
- return Op;
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (M[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = M[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return true;
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm, WhichResult;
+
+ return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isVREVMask(M, VT, 64) ||
+ isVREVMask(M, VT, 32) ||
+ isVREVMask(M, VT, 16) ||
+ isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isVTRNMask(M, VT, WhichResult) ||
+ isVUZPMask(M, VT, WhichResult) ||
+ isVZIPMask(M, VT, WhichResult));
+}
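
The perfect-shuffle table is keyed by treating the four result lanes as base-9 digits (0..7 pick an input element, 8 means undef), with the cost in the entry's top two bits. Worked indices:

    unsigned pfIndex(const unsigned Idx[4]) {
      return Idx[0]*9*9*9 + Idx[1]*9*9 + Idx[2]*9 + Idx[3];
    }
    // <0,1,2,3> (identity on V1) -> 0*729 + 1*81 + 2*9 + 3 = 102
    // <4,5,6,7> (identity on V2) -> 4*729 + 5*81 + 6*9 + 7 = 3382
    // unsigned Cost = PerfectShuffleTable[index] >> 30;  // kept when <= 4
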
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ DebugLoc dl) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+
+ enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+ };
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1*9+2)*9+3) return LHS;
+ assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+ return RHS;
+ }
+
+ SDValue OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+ EVT VT = OpLHS.getValueType();
+
+ switch (OpNum) {
+ default: llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV:
+ return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3:
+ return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3:
+ return DAG.getNode(ARMISD::VEXT, dl, VT,
+ OpLHS, OpRHS,
+ DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+ case OP_VUZPL:
+ case OP_VUZPR:
+ return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+ case OP_VZIPL:
+ case OP_VZIPR:
+ return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+ case OP_VTRNL:
+ case OP_VTRNR:
+ return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+ OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+ }
}
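
The field widths implied by the shifts above give this entry layout (a sketch; the widths come straight from the code): bits [31:30] cost, [29:26] OpNum, [25:13] LHSID, [12:0] RHSID.

    struct PFEntryFields { unsigned Cost, OpNum, LHSID, RHSID; };

    PFEntryFields decodePFEntry(unsigned E) {
      return { E >> 30, (E >> 26) & 0x0F,
               (E >> 13) & ((1u << 13) - 1), E & ((1u << 13) - 1) };
    }
    // LHSID/RHSID values 102 and 3382 are the OP_COPY identities <0,1,2,3>
    // and <4,5,6,7>; any other value indexes another table entry to recurse on.
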
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
- return Op;
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ SmallVector<int, 8> ShuffleMask;
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same size so that they get CSEd properly.
+ SVN->getMask(ShuffleMask);
+
+ if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+ int Lane = SVN->getSplatIndex();
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+ }
+ return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+ DAG.getConstant(Lane, MVT::i32));
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm;
+ if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ if (ReverseVEXT)
+ std::swap(V1, V2);
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
+ if (isVREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+ if (isVREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+ // Check for Neon shuffles that modify both input vectors in place.
+ // If both results are used, i.e., if there are two shuffles with the same
+ // source operands and with masks corresponding to both results of one of
+ // these operations, DAG memoization will ensure that a single node is
+ // used for both shuffles.
+ unsigned WhichResult;
+ if (isVTRNMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+ if (isVUZPMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+ if (isVZIPMask(ShuffleMask, VT, WhichResult))
+ return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+ V1, V2).getValue(WhichResult);
+
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex =
+ PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ return SDValue();
}
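// Worked example of the table lookup above: for the 4-element mask <0,2,4,6>
// (a mask entry of 8 would mean an undef lane), the base-9 index is
//
//   PFTableIndex = 0*9*9*9 + 2*9*9 + 4*9 + 6 = 204
//
// and PerfectShuffleTable[204] >> 30 is the instruction-count cost tested
// against the Cost <= 4 threshold before GeneratePerfectShuffle is called.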
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "unexpected type for custom-lowering vector extract");
SDValue Vec = Op.getOperand(0);
SDValue Lane = Op.getOperand(1);
+
+  // FIXME: This is invalid for 8- and 16-bit elements - the information about
+  // sign / zero extension is lost!
Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+
+ if (VT.bitsLT(MVT::i32))
+ Op = DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+ else if (VT.bitsGT(MVT::i32))
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op);
+
+ return Op;
}
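// Illustration of the FIXME above with a hypothetical lane value: for an i8
// element holding 0x80, VGETLANEu produces 0x00000080 in the i32 result,
// whereas a sign-extending use needs 0xFFFFFF80. Since this path always
// zero-extends, the signedness of the original extract is not preserved.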
-static SDValue LowerCONCAT_VECTORS(SDValue Op) {
- if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2)
- return Op;
- return SDValue();
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ // The only time a CONCAT_VECTORS operation can have legal types is when
+ // two 64-bit vectors are concatenated to a 128-bit vector.
+ assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+ "unexpected CONCAT_VECTORS");
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Val = DAG.getUNDEF(MVT::v2f64);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0),
+ DAG.getIntPtrConstant(0));
+ if (Op1.getOpcode() != ISD::UNDEF)
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1),
+ DAG.getIntPtrConstant(1));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- default: assert(0 && "Don't know how to custom lower this!"); abort();
+ default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress:
return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
LowerGlobalAddressELF(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, Subtarget);
case ISD::BR_CC: return LowerBR_CC(Op, DAG, Subtarget);
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::RETURNADDR: break;
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
@@ -2287,9 +2758,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
}
return SDValue();
}
@@ -2301,7 +2771,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
SelectionDAG &DAG) {
switch (N->getOpcode()) {
default:
- assert(0 && "Don't know how to custom expand this!");
+ llvm_unreachable("Don't know how to custom expand this!");
return;
case ISD::BIT_CONVERT:
Results.push_back(ExpandBIT_CONVERT(N, DAG));
@@ -2322,12 +2792,14 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
switch (MI->getOpcode()) {
- default: assert(false && "Unexpected instr type to insert");
- case ARM::tMOVCCr: {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
@@ -2352,12 +2824,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, sinkMBB);
// Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
- e = BB->succ_end(); i != e; ++i)
- sinkMBB->addSuccessor(*i);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
// Next, remove all successors of the current block, and add the true
// and fallthrough blocks as its successors.
- while(!BB->succ_empty())
+ while (!BB->succ_empty())
BB->removeSuccessor(BB->succ_begin());
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
@@ -2381,6 +2856,78 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
return BB;
}
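// Rough shape of the control flow built above (names match the code):
//
//   thisMBB:  ...  bCC sinkMBB       ; condition true: first value flows down
//      |   \
//      |    copy0MBB                 ; fallthrough: the "false" value path
//      |   /
//   sinkMBB:  PHI merges the two incoming values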
+
+ case ARM::tANDsp:
+ case ARM::tADDspr_:
+ case ARM::tSUBspi_:
+ case ARM::t2SUBrSPi_:
+ case ARM::t2SUBrSPi12_:
+ case ARM::t2SUBrSPs_: {
+ MachineFunction *MF = BB->getParent();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool DstIsDead = MI->getOperand(0).isDead();
+ bool SrcIsKill = MI->getOperand(1).isKill();
+
+ if (SrcReg != ARM::SP) {
+      // Copy the source from its virtual register to SP.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg);
+ unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+ ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr;
+ BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+ }
+
+ unsigned OpOpc = 0;
+ bool NeedPred = false, NeedCC = false, NeedOp3 = false;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected pseudo instruction!");
+ case ARM::tANDsp:
+ OpOpc = ARM::tAND;
+ NeedPred = true;
+ break;
+ case ARM::tADDspr_:
+ OpOpc = ARM::tADDspr;
+ break;
+ case ARM::tSUBspi_:
+ OpOpc = ARM::tSUBspi;
+ break;
+ case ARM::t2SUBrSPi_:
+ OpOpc = ARM::t2SUBrSPi;
+ NeedPred = true; NeedCC = true;
+ break;
+ case ARM::t2SUBrSPi12_:
+ OpOpc = ARM::t2SUBrSPi12;
+ NeedPred = true;
+ break;
+ case ARM::t2SUBrSPs_:
+ OpOpc = ARM::t2SUBrSPs;
+ NeedPred = true; NeedCC = true; NeedOp3 = true;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP);
+ if (OpOpc == ARM::tAND)
+ AddDefaultT1CC(MIB);
+ MIB.addReg(ARM::SP);
+ MIB.addOperand(MI->getOperand(2));
+ if (NeedOp3)
+ MIB.addOperand(MI->getOperand(3));
+ if (NeedPred)
+ AddDefaultPred(MIB);
+ if (NeedCC)
+ AddDefaultCC(MIB);
+
+    // Copy the result from SP back to the virtual register.
+ const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg);
+ unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+ ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr;
+ BuildMI(BB, dl, TII->get(CopyOpc))
+ .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
+ .addReg(ARM::SP);
+ MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
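// Net effect of the expansion above, sketched for tSUBspi_ (virtual register
// names are illustrative):
//
//   mov  sp, %src        ; emitted only when %src is not already SP
//   sub  sp, sp, #imm    ; the real tSUBspi, plus pred/cc operands as needed
//   mov  %dst, sp        ; copy the updated SP back to the virtual register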
}
}
@@ -2393,7 +2940,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
@@ -2421,7 +2968,7 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
cast<ConstantSDNode>(RHS)->isNullValue()) {
std::swap(LHS, RHS);
SDValue Op0 = Slct.getOperand(0);
- MVT OpVT = isSlctCC ? Op0.getValueType() :
+ EVT OpVT = isSlctCC ? Op0.getValueType() :
Op0.getOperand(0).getValueType();
bool isInt = OpVT.isInteger();
CC = ISD::getSetCCInverse(CC, isInt);
@@ -2516,7 +3063,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// operand of a vector shift left operation. That value must be in the range:
/// 0 <= Value < ElementBits for a left shift; or
/// 0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
if (! getVShiftImm(Op, ElementBits, Cnt))
@@ -2530,7 +3077,7 @@ static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
/// absolute value must be in the range:
/// 1 <= |Value| <= ElementBits for a right shift; or
/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic,
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
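// Concrete ranges implied by the two doc comments, for ElementBits == 16
// (e.g. v8i16):
//
//   isVShiftLImm, !isLong  : 0 <= Cnt <= 15   (VSHL)
//   isVShiftLImm,  isLong  : 0 <= Cnt <= 16   (VSHLL)
//   isVShiftRImm, !isNarrow: 1 <= Cnt <= 16   (VSHR)
//   isVShiftRImm,  isNarrow: 1 <= Cnt <= 8    (VSHRN)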
@@ -2571,7 +3118,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqrshiftns:
case Intrinsic::arm_neon_vqrshiftnu:
case Intrinsic::arm_neon_vqrshiftnsu: {
- MVT VT = N->getOperand(1).getValueType();
+ EVT VT = N->getOperand(1).getValueType();
int64_t Cnt;
unsigned VShiftOpc = 0;
@@ -2593,8 +3140,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vshiftlu:
if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
break;
- assert(0 && "invalid shift count for vshll intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vshll intrinsic");
case Intrinsic::arm_neon_vrshifts:
case Intrinsic::arm_neon_vrshiftu:
@@ -2611,8 +3157,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqshiftsu:
if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
break;
- assert(0 && "invalid shift count for vqshlu intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vqshlu intrinsic");
case Intrinsic::arm_neon_vshiftn:
case Intrinsic::arm_neon_vrshiftn:
@@ -2625,11 +3170,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
// Narrowing shifts require an immediate right shift.
if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
break;
- assert(0 && "invalid shift count for narrowing vector shift intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for narrowing vector shift intrinsic");
default:
- assert(0 && "unhandled vector shift");
+ llvm_unreachable("unhandled vector shift");
}
switch (IntNo) {
@@ -2678,7 +3222,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
}
case Intrinsic::arm_neon_vshiftins: {
- MVT VT = N->getOperand(1).getValueType();
+ EVT VT = N->getOperand(1).getValueType();
int64_t Cnt;
unsigned VShiftOpc = 0;
@@ -2687,8 +3231,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
VShiftOpc = ARMISD::VSRI;
else {
- assert(0 && "invalid shift count for vsli/vsri intrinsic");
- abort();
+ llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
}
return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
@@ -2712,7 +3255,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
// Nothing to be done for scalar shifts.
if (! VT.isVector())
@@ -2722,7 +3265,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
int64_t Cnt;
switch (N->getOpcode()) {
- default: assert(0 && "unexpected shift opcode");
+ default: llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
@@ -2755,8 +3298,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue Vec = N0.getOperand(0);
SDValue Lane = N0.getOperand(1);
- MVT VT = N->getValueType(0);
- MVT EltVT = N0.getValueType();
+ EVT VT = N->getValueType(0);
+ EVT EltVT = N0.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (VT == MVT::i32 &&
@@ -2765,7 +3308,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opc = 0;
switch (N->getOpcode()) {
- default: assert(0 && "unexpected opcode");
+ default: llvm_unreachable("unexpected opcode");
case ISD::SIGN_EXTEND:
Opc = ARMISD::VGETLANEs;
break;
@@ -2802,10 +3345,88 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+  if (!Subtarget->hasV6Ops())
+    // Pre-v6 does not support unaligned mem access.
+    return false;
+  else if (!Subtarget->hasV7Ops()) {
+    // v6 implementations may or may not support unaligned mem access.
+    if (!Subtarget->isTargetDarwin())
+      return false;
+  }
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ return true;
+ // FIXME: VLD1 etc with standard alignment is legal.
+ }
+}
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+ if (V < 0)
+ return false;
+
+ unsigned Scale = 1;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ // Scale == 1;
+ break;
+ case MVT::i16:
+ // Scale == 2;
+ Scale = 2;
+ break;
+ case MVT::i32:
+ // Scale == 4;
+ Scale = 4;
+ break;
+ }
+
+ if ((V & (Scale - 1)) != 0)
+ return false;
+ V /= Scale;
+ return V == (V & ((1LL << 5) - 1));
+}
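// The check above accepts offset == imm5 * Scale with imm5 in [0, 31],
// matching the Thumb-1 immediate-offset load/store encodings:
//
//   i8: 0..31     i16: 0..62 (even)     i32: 0..124 (multiple of 4)
//
// e.g. V = 124 is legal for an i32 access (124/4 == 31); V = 126 is not.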
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+ const ARMSubtarget *Subtarget) {
+ bool isNeg = false;
+ if (V < 0) {
+ isNeg = true;
+ V = - V;
+ }
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ // + imm12 or - imm8
+ if (isNeg)
+ return V == (V & ((1LL << 8) - 1));
+ return V == (V & ((1LL << 12) - 1));
+ case MVT::f32:
+ case MVT::f64:
+ // Same as ARM mode. FIXME: NEON?
+ if (!Subtarget->hasVFP2())
+ return false;
+ if ((V & 3) != 0)
+ return false;
+ V >>= 2;
+ return V == (V & ((1LL << 8) - 1));
+ }
+}
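// Restating the ranges accepted above in Thumb-2 encoding terms:
//
//   integer types: 0..4095 (positive imm12) or -255..-1 (negative imm8)
//   f32/f64 (VFP): +/- multiples of 4 up to 1020 (imm8 scaled by 4)
//
// so +4095 and -255 are legal i32 offsets while +4096 and -256 are not.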
+
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
-static bool isLegalAddressImmediate(int64_t V, MVT VT,
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
const ARMSubtarget *Subtarget) {
if (V == 0)
return true;
@@ -2813,36 +3434,15 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
if (!VT.isSimple())
return false;
- if (Subtarget->isThumb()) {
- if (V < 0)
- return false;
-
- unsigned Scale = 1;
- switch (VT.getSimpleVT()) {
- default: return false;
- case MVT::i1:
- case MVT::i8:
- // Scale == 1;
- break;
- case MVT::i16:
- // Scale == 2;
- Scale = 2;
- break;
- case MVT::i32:
- // Scale == 4;
- Scale = 4;
- break;
- }
-
- if ((V & (Scale - 1)) != 0)
- return false;
- V /= Scale;
- return V == (V & ((1LL << 5) - 1));
- }
+ if (Subtarget->isThumb1Only())
+ return isLegalT1AddressImmediate(V, VT);
+ else if (Subtarget->isThumb2())
+ return isLegalT2AddressImmediate(V, VT, Subtarget);
+ // ARM mode.
if (V < 0)
V = - V;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
case MVT::i8:
@@ -2854,7 +3454,7 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
return V == (V & ((1LL << 8) - 1));
case MVT::f32:
case MVT::f64:
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2()) // FIXME: NEON?
return false;
if ((V & 3) != 0)
return false;
@@ -2863,11 +3463,44 @@ static bool isLegalAddressImmediate(int64_t V, MVT VT,
}
}
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+ EVT VT) const {
+ int Scale = AM.Scale;
+ if (Scale < 0)
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ if (Scale == 1)
+ return true;
+ // r + r << imm
+ Scale = Scale & ~1;
+ return Scale == 2 || Scale == 4 || Scale == 8;
+ case MVT::i64:
+ // r + r
+ if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+ return true;
+ return false;
+ case MVT::isVoid:
+ // Note, we allow "void" uses (basically, uses that aren't loads or
+    // stores), because ARM allows folding a scale into many arithmetic
+ // operations. This should be made more precise and revisited later.
+
+ // Allow r << imm, but the imm has to be a multiple of two.
+ if (Scale & 1) return false;
+ return isPowerOf2_32(Scale);
+ }
+}
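// In addressing-mode terms, the integer cases above accept:
//
//   Scale == 1          -> ldr r0, [r1, r2]              ; r + r
//   Scale == 2, 4 or 8  -> ldr r0, [r1, r2, lsl #1/2/3]  ; r + (r << imm)
//
// Clearing the low bit first also lets odd scales 3, 5 and 9 through,
// presumably so the index register can double as the base: r + (r << n).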
+
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
const Type *Ty) const {
- MVT VT = getValueType(Ty, true);
+ EVT VT = getValueType(Ty, true);
if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
return false;
@@ -2879,7 +3512,7 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
case 0: // no scale reg, must be "r+i" or "r", or "i".
break;
case 1:
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
return false;
// FALL THROUGH.
default:
@@ -2890,22 +3523,22 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
if (!VT.isSimple())
return false;
+ if (Subtarget->isThumb2())
+ return isLegalT2ScaledAddressingMode(AM, VT);
+
int Scale = AM.Scale;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
case MVT::i8:
case MVT::i32:
- case MVT::i64:
- // This assumes i64 is legalized to a pair of i32. If not (i.e.
- // ldrd / strd are used, then its address mode is same as i16.
- // r + r
if (Scale < 0) Scale = -Scale;
if (Scale == 1)
return true;
// r + r << imm
return isPowerOf2_32(Scale & ~1);
case MVT::i16:
+ case MVT::i64:
// r + r
if (((unsigned)AM.HasBaseReg + Scale) <= 2)
return true;
@@ -2917,15 +3550,15 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
// operations. This should be made more precise and revisited later.
// Allow r << imm, but the imm has to be a multiple of two.
- if (AM.Scale & 1) return false;
- return isPowerOf2_32(AM.Scale);
+ if (Scale & 1) return false;
+ return isPowerOf2_32(Scale);
}
break;
}
return true;
}
-static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT,
+static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
SelectionDAG &DAG) {
@@ -2983,7 +3616,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, MVT VT,
return false;
}
-static bool getT2IndexedAddressParts(SDNode *Ptr, MVT VT,
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
SelectionDAG &DAG) {
@@ -3019,7 +3652,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (Subtarget->isThumb1Only())
return false;
- MVT VT;
+ EVT VT;
SDValue Ptr;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
@@ -3037,7 +3670,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
Offset, isInc, DAG);
- else
+ else
isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
Offset, isInc, DAG);
if (!isLegal)
@@ -3058,7 +3691,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
if (Subtarget->isThumb1Only())
return false;
- MVT VT;
+ EVT VT;
SDValue Ptr;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
@@ -3074,7 +3707,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
- else
+ else
isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
if (!isLegal)
@@ -3128,12 +3761,12 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass*>
ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
case 'l':
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
return std::make_pair(0U, ARM::tGPRRegisterClass);
else
return std::make_pair(0U, ARM::GPRRegisterClass);
@@ -3152,7 +3785,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
std::vector<unsigned> ARMTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() != 1)
return std::vector<unsigned>();
@@ -3214,10 +3847,16 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
switch (Constraint) {
case 'I':
- if (Subtarget->isThumb()) {
- // This must be a constant between 0 and 255, for ADD immediates.
+ if (Subtarget->isThumb1Only()) {
+ // This must be a constant between 0 and 255, for ADD
+ // immediates.
if (CVal >= 0 && CVal <= 255)
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant that can be used as an immediate value in a
+ // data-processing instruction.
+ if (ARM_AM::getT2SOImmVal(CVal) != -1)
+ break;
} else {
// A constant that can be used as an immediate value in a
// data-processing instruction.
@@ -3227,7 +3866,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'J':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a constant between -255 and -1, for negated ADD
// immediates. This can be used in GCC with an "n" modifier that
// prints the negated value, for use with SUB instructions. It is
@@ -3244,13 +3883,21 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'K':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
// A 32-bit value where only one byte has a nonzero value. Exclude
// zero to match GCC. This constraint is used by GCC internally for
// constants that can be loaded with a move/shift combination.
// It is not useful otherwise but is implemented for compatibility.
if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant whose bitwise inverse can be used as an immediate
+ // value in a data-processing instruction. This can be used in GCC
+ // with a "B" modifier that prints the inverted value, for use with
+ // BIC and MVN instructions. It is not useful otherwise but is
+ // implemented for compatibility.
+ if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+ break;
} else {
// A constant whose bitwise inverse can be used as an immediate
// value in a data-processing instruction. This can be used in GCC
@@ -3263,11 +3910,19 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'L':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb1Only()) {
// This must be a constant between -7 and 7,
// for 3-operand ADD/SUB immediate instructions.
       if (CVal >= -7 && CVal <= 7)
break;
+ } else if (Subtarget->isThumb2()) {
+ // A constant whose negation can be used as an immediate value in a
+ // data-processing instruction. This can be used in GCC with an "n"
+ // modifier that prints the negated value, for use with SUB
+ // instructions. It is not useful otherwise but is implemented for
+ // compatibility.
+ if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+ break;
} else {
// A constant whose negation can be used as an immediate value in a
// data-processing instruction. This can be used in GCC with an "n"
@@ -3280,7 +3935,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'M':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a multiple of 4 between 0 and 1020, for
// ADD sp + immediate.
if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
@@ -3295,7 +3950,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'N':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a constant between 0 and 31, for shift amounts.
if (CVal >= 0 && CVal <= 31)
break;
@@ -3303,7 +3958,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'O':
- if (Subtarget->isThumb()) {
+ if (Subtarget->isThumb()) { // FIXME thumb2
// This must be a multiple of 4 between -508 and 508, for
// ADD/SUB sp = sp + immediate.
if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
@@ -3322,3 +3977,9 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
Ops, DAG);
}
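// Hypothetical GCC-style inline asm uses of the constraints handled above:
//
//   int add42(int x) {
//     int r;
//     asm("add %0, %1, %2" : "=r"(r) : "r"(x), "I"(42));  // 'I': ADD immediate
//     return r;
//   }
//   int sub5(int x) {
//     int r;
//     asm("sub %0, %1, %n2" : "=r"(r) : "r"(x), "L"(-5)); // 'L' with "n" modifier
//     return r;
//   }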
+
+bool
+ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The ARM target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 553a86d..7d85f45 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -40,6 +40,7 @@ namespace llvm {
tCALL, // Thumb function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
+ BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
RET_FLAG, // Return with a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
@@ -64,11 +65,13 @@ namespace llvm {
FMRRD, // double to two gprs.
FMDRR, // Two gprs to double.
- EH_SJLJ_SETJMP, // SjLj exception handling setjmp
- EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
THREAD_POINTER,
+ DYN_ALLOC, // Dynamic allocation on the stack.
+
VCEQ, // Vector compare equal.
VCGE, // Vector compare greater than or equal.
VCGEU, // Vector compare unsigned greater than or equal.
@@ -112,8 +115,18 @@ namespace llvm {
VGETLANEu, // zero-extend vector extract element
VGETLANEs, // sign-extend vector extract element
- // Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
- VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
+ // Vector duplicate:
+ VDUP,
+ VDUPLANE,
+
+ // Vector shuffles:
+ VEXT, // extract
+ VREV64, // reverse elements within 64-bit doublewords
+ VREV32, // reverse elements within 32-bit words
+ VREV16, // reverse elements within 16-bit halfwords
+ VZIP, // zip (interleave)
+ VUZP, // unzip (deinterleave)
+ VTRN // transpose
};
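// Lane-order illustration for the VREV nodes, shown on v8i8:
//
//   VREV64: <7,6,5,4,3,2,1,0>    reverse within each 64-bit doubleword
//   VREV32: <3,2,1,0,7,6,5,4>    reverse within each 32-bit word
//   VREV16: <1,0,3,2,5,4,7,6>    reverse within each 16-bit halfword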
}
@@ -147,11 +160,18 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*>*) const;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type.
+ /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON?
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+ bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
@@ -175,13 +195,15 @@ namespace llvm {
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth) const;
+
+
ConstraintType getConstraintType(const std::string &Constraint) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
@@ -200,21 +222,23 @@ namespace llvm {
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
private:
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
- /// ARMPCLabelIndex - Keep track the number of ARM PC labels created.
+ /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
///
unsigned ARMPCLabelIndex;
- void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
- void addDRTypeForNEON(MVT VT);
- void addQRTypeForNEON(MVT VT);
+ void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT);
+ void addDRTypeForNEON(EVT VT);
+ void addQRTypeForNEON(EVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
- void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+ void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -224,15 +248,13 @@ namespace llvm {
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
- CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
- SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr, const CCValAssign &VA,
- SDValue Chain, SDValue Arg, ISD::ArgFlagsTy Flags);
- SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags);
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG);
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
@@ -241,9 +263,9 @@ namespace llvm {
SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
SelectionDAG &DAG);
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG);
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG);
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
SDValue Chain,
@@ -252,6 +274,33 @@ namespace llvm {
bool AlwaysInline,
const Value *DstSV, uint64_t DstSVOff,
const Value *SrcSV, uint64_t SrcSVOff);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
};
}
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 301a6c1..3d19f23 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -54,9 +54,15 @@ def NEONGetLnFrm  : Format<25>;
def NEONSetLnFrm : Format<26>;
def NEONDupFrm : Format<27>;
-// Misc flag for data processing instructions that indicates whether
+// Misc flags.
+
-// the instruction has a Rn register operand.
-class UnaryDP { bit isUnaryDataProc = 1; }
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have an Rn operand.
+class UnaryDP { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
//===----------------------------------------------------------------------===//
// ARM Instruction flags. These need to match ARMInstrInfo.h.
@@ -77,7 +84,7 @@ def AddrModeT1_1 : AddrMode<7>;
def AddrModeT1_2 : AddrMode<8>;
def AddrModeT1_4 : AddrMode<9>;
def AddrModeT1_s : AddrMode<10>;
-def AddrModeT2_i12: AddrMode<12>;
+def AddrModeT2_i12: AddrMode<11>;
def AddrModeT2_i8 : AddrMode<12>;
def AddrModeT2_so : AddrMode<13>;
def AddrModeT2_pc : AddrMode<14>;
@@ -103,11 +110,33 @@ def IndexModePost : IndexMode<2>;
//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM Predicate operand. Defaults to 14 = always (AL). Second part is the CC
+// register whose default is 0 (no register).
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+ (ops (i32 14), (i32 zero_reg))> {
+ let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+ let PrintMethod = "printSBitModifierOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+ let PrintMethod = "printSBitModifierOperand";
+}
+
+//===----------------------------------------------------------------------===//
+
// ARM Instruction templates.
//
class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
- Format f, string cstr>
+ Format f, string cstr, InstrItinClass itin>
: Instruction {
field bits<32> Inst;
@@ -130,12 +159,15 @@ class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
// Attributes specific to ARM instructions...
//
bit isUnaryDataProc = 0;
+ bit canXformTo16Bit = 0;
let Constraints = cstr;
+ let Itinerary = itin;
}
-class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern>
- : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, ""> {
+class PseudoInst<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, "", itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -144,9 +176,10 @@ class PseudoInst<dag oops, dag iops, string asm, list<dag> pattern>
// Almost all ARM instructions are predicable.
class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string opc, string asm, string cstr,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -158,9 +191,10 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// an input operand since by default it's a zero register. It will
// become an implicit def once it's "flipped".
class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string opc, string asm, string cstr,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr,
list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm));
@@ -170,8 +204,9 @@ class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// Special cases
class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
- IndexMode im, Format f, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, im, f, cstr> {
+ IndexMode im, Format f, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -179,90 +214,93 @@ class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
list<Predicate> Predicates = [IsARM];
}
-class AI<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
-class AsI<dag oops, dag iops, Format f, string opc,
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
string asm, list<dag> pattern>
- : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, opc,
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern>;
-class AXI<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, asm,
- "", pattern>;
// Ctrl flow instructions
-class ABI<bits<4> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, opc,
- asm, "", pattern> {
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+ opc, asm, "", pattern> {
let Inst{27-24} = opcod;
}
-class ABXI<bits<4> opcod, dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, asm,
- "", pattern> {
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+ asm, "", pattern> {
let Inst{27-24} = opcod;
}
-class ABXIx2<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, asm,
- "", pattern>;
+class ABXIx2<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin,
+ asm, "", pattern>;
// BR_JT instructions
-class JTI<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm,
+class JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin,
asm, "", pattern>;
// addrmode1 instructions
-class AI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AsI1<bits<4> opcod, dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AXI1<bits<4> opcod, dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{24-21} = opcod;
let Inst{27-26} = {0,0};
}
-class AI1x2<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
+class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
// addrmode2 loads and stores
-class AI2<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{27-26} = {0,1};
}
// loads
-class AI2ldw<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2ldw<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
@@ -270,19 +308,19 @@ class AXI2ldw<dag oops, dag iops, Format f, string asm,
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2ldb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
@@ -292,19 +330,19 @@ class AXI2ldb<dag oops, dag iops, Format f, string asm,
}
// stores
-class AI2stw<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2stw<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
@@ -312,19 +350,19 @@ class AXI2stw<dag oops, dag iops, Format f, string asm,
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AXI2stb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f,
+class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
@@ -334,20 +372,20 @@ class AXI2stb<dag oops, dag iops, Format f, string asm,
}
// Pre-indexed loads
-class AI2ldwpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 1; // B bit
@@ -356,20 +394,20 @@ class AI2ldbpr<dag oops, dag iops, Format f, string opc,
}
// Pre-indexed stores
-class AI2stwpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{22} = 1; // B bit
@@ -378,20 +416,20 @@ class AI2stbpr<dag oops, dag iops, Format f, string opc,
}
// Post-indexed loads
-class AI2ldwpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr,pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 0; // P bit
let Inst{27-26} = {0,1};
}
-class AI2ldbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr,pattern> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
@@ -400,20 +438,20 @@ class AI2ldbpo<dag oops, dag iops, Format f, string opc,
}
// Post-indexed stores
-class AI2stwpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr,pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 0; // P bit
let Inst{27-26} = {0,1};
}
-class AI2stbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+ opc, asm, cstr,pattern> {
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 1; // B bit
@@ -422,20 +460,20 @@ class AI2stbpo<dag oops, dag iops, Format f, string opc,
}
// addrmode3 instructions
-class AI3<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern>;
-class AXI3<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, asm,
- "", pattern>;
+class AI3<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+class AXI3<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern>;
// loads
-class AI3ldh<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -443,10 +481,11 @@ class AI3ldh<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldh<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -456,10 +495,10 @@ class AXI3ldh<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldsh<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -467,10 +506,11 @@ class AI3ldsh<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldsh<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -480,10 +520,10 @@ class AXI3ldsh<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldsb<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -491,10 +531,11 @@ class AI3ldsb<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3ldsb<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
@@ -504,10 +545,10 @@ class AXI3ldsb<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3ldd<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -515,13 +556,14 @@ class AI3ldd<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// stores
-class AI3sth<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -529,10 +571,11 @@ class AI3sth<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AXI3sth<dag oops, dag iops, Format f, string asm,
- list<dag> pattern>
- : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f,
+class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
@@ -542,10 +585,10 @@ class AXI3sth<dag oops, dag iops, Format f, string asm,
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
}
-class AI3std<dag oops, dag iops, Format f, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, opc,
- asm, "", pattern> {
+class AI3std<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -553,13 +596,14 @@ class AI3std<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 0; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Pre-indexed loads
-class AI3ldhpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -567,11 +611,12 @@ class AI3ldhpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldshpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -579,11 +624,12 @@ class AI3ldshpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldsbpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -591,13 +637,14 @@ class AI3ldsbpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Pre-indexed stores
-class AI3sthpr<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, opc,
- asm, cstr, pattern> {
+class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+ opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -605,13 +652,14 @@ class AI3sthpr<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 1; // P bit
+ let Inst{27-25} = 0b000;
}
// Post-indexed loads
-class AI3ldhpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -619,11 +667,12 @@ class AI3ldhpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldshpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 1; // S bit
@@ -631,11 +680,12 @@ class AI3ldshpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
-class AI3ldsbpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 0; // H bit
let Inst{6} = 1; // S bit
@@ -643,13 +693,14 @@ class AI3ldsbpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 1; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
// Post-indexed stores
-class AI3sthpo<dag oops, dag iops, Format f, string opc,
- string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, opc,
- asm, cstr,pattern> {
+class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr, pattern> {
let Inst{4} = 1;
let Inst{5} = 1; // H bit
let Inst{6} = 0; // S bit
@@ -657,57 +708,60 @@ class AI3sthpo<dag oops, dag iops, Format f, string opc,
let Inst{20} = 0; // L bit
let Inst{21} = 1; // W bit
let Inst{24} = 0; // P bit
+ let Inst{27-25} = 0b000;
}
// addrmode4 instructions
-class AXI4ld<dag oops, dag iops, Format f, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI4ld<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{20} = 1; // L bit
let Inst{22} = 0; // S bit
let Inst{27-25} = 0b100;
}
-class AXI4st<dag oops, dag iops, Format f, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, asm,
- "", pattern> {
+class AXI4st<dag oops, dag iops, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+ asm, "", pattern> {
let Inst{20} = 0; // L bit
let Inst{22} = 0; // S bit
let Inst{27-25} = 0b100;
}
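// Illustration only, not part of this patch: with the new parameter, an
// addrmode4 load-multiple definition must now supply an itinerary as well.
// The itinerary name IIC_iLoadm and the exact operand list are assumptions
// here, not taken from this diff.
def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p,
                          reglist:$dst1, variable_ops),
                 LdStMulFrm, IIC_iLoadm,
                 "ldm${p}${addr:submode} $addr, $dst1", []>;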
// Unsigned multiply, multiply-accumulate instructions.
-class AMul1I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{20} = 0; // S bit
let Inst{27-21} = opcod;
}
-class AsMul1I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{27-21} = opcod;
}
// Most significant word multiply
-class AMul2I<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b1001;
let Inst{20} = 1;
let Inst{27-21} = opcod;
}
// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
-class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, opc,
- asm, "", pattern> {
+class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+ opc, asm, "", pattern> {
let Inst{4} = 0;
let Inst{7} = 1;
let Inst{20} = 0;
@@ -715,19 +769,19 @@ class AMulxyI<bits<7> opcod, dag oops, dag iops, string opc,
}
// Extend instructions.
-class AExtI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, opc,
- asm, "", pattern> {
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin,
+ opc, asm, "", pattern> {
let Inst{7-4} = 0b0111;
let Inst{27-20} = opcod;
}
// Misc Arithmetic instructions.
-class AMiscA1I<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, opc,
- asm, "", pattern> {
+class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin,
+ opc, asm, "", pattern> {
let Inst{27-20} = opcod;
}
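// Illustration only, not part of this patch: a multiply definition built on
// AsMul1I now passes its itinerary between the operand lists and the asm
// strings; the itinerary name IIC_iMUL32 is an assumption.
def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
                  IIC_iMUL32, "mul", " $dst, $a, $b",
                  [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;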
@@ -751,74 +805,120 @@ class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
// TI - Thumb instruction.
-class ThumbI<dag outs, dag ins, AddrMode am, SizeFlagVal sz,
- string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
- let OutOperandList = outs;
- let InOperandList = ins;
+class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb];
}
-class TI<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>;
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
-// BL, BLX(1) are translated by assembler into two instructions
-class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>;
-
-// BR_JT instructions
-class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
- : ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+// Two-address instructions
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>;
-// TPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
-class TPat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb];
-}
+// tBL, tBX instructions
+class TIx2<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
-class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb, HasV5T];
-}
+// BR_JT instructions
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
// Thumb1 only
-class Thumb1I<dag outs, dag ins, AddrMode am, SizeFlagVal sz,
- string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
- let OutOperandList = outs;
- let InOperandList = ins;
+class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb1Only];
}
-class T1I<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "", pattern>;
-class T1I1<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_1, Size2Bytes, asm, "", pattern>;
-class T1I2<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_2, Size2Bytes, asm, "", pattern>;
-class T1I4<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_4, Size2Bytes, asm, "", pattern>;
-class T1Is<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeT1_s, Size2Bytes, asm, "", pattern>;
-class T1Ix2<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size4Bytes, asm, "", pattern>;
-class T1JTI<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+class T1I<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+class T1JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
// Two-address instructions
-class T1It<dag outs, dag ins, string asm, list<dag> pattern>
- : Thumb1I<outs, ins, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>;
+class T1It<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin,
+ asm, "$lhs = $dst", pattern>;
-class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+// Thumb1 instruction that can either be predicated or set CPSR.
+class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = !con(oops, (ops s_cc_out:$s));
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsThumb1Only];
+}
+
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+ "$lhs = $dst", pattern>;
+
+// Thumb1 instruction that can be predicated.
+class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+ let Pattern = pattern;
list<Predicate> Predicates = [IsThumb1Only];
}
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+ "$lhs = $dst", pattern>;
+
+class T1pI1<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI2<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI4<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pIs<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+ : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>;
+
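// Illustration only, not part of this patch: a Thumb1 ALU definition built on
// T1sI picks up the optional CPSR def and the predicate operands from the
// template; the itinerary name IIC_iALUr is an assumption.
def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
                  "add", " $dst, $lhs, $rhs",
                  [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;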
// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -832,8 +932,9 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
// more consistent.
class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
@@ -843,8 +944,9 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
// Special cases
class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ InstrItinClass itin,
string asm, string cstr, list<dag> pattern>
- : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr> {
+ : InstARM<am, sz, IndexModeNone, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -852,31 +954,46 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
list<Predicate> Predicates = [IsThumb2];
}
-class T2I<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii12<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii8<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, opc, asm, "", pattern>;
-class T2Iso<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, opc, asm, "", pattern>;
-class T2Ipc<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, opc, asm, "", pattern>;
-class T2Ii8s4<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, opc, asm, "", pattern>;
+class T2I<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii8s4<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "", pattern>;
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
-class T2sI<dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, opc, asm, "", pattern>;
+class T2Ix2<dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>;
-class T2XI<dag oops, dag iops, string asm, list<dag> pattern>
- : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, asm, "", pattern>;
-class T2JTI<dag oops, dag iops, string asm, list<dag> pattern>
- : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, asm, "", pattern>;
// T2Iidxldst - Thumb2 indexed load / store instructions.
class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im,
+ InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
- : InstARM<am, Size4Bytes, im, ThumbFrm, cstr> {
+ : InstARM<am, Size4Bytes, im, ThumbFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = !con(iops, (ops pred:$p));
let AsmString = !strconcat(opc, !strconcat("${p}", asm));
@@ -884,6 +1001,15 @@ class T2Iidxldst<dag oops, dag iops, AddrMode am, IndexMode im,
list<Predicate> Predicates = [IsThumb2];
}
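// Illustration only, not part of this patch: a Thumb2 definition written
// against the updated templates; the itinerary name IIC_iMOVr is an
// assumption.
def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
                  "mov", ".w $dst, $src", []>;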
+// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
+class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb1Only, HasV5T];
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb1Only];
+}
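// Illustration only, not part of this patch: a pattern limited to Thumb1
// mode, assuming the tLDRB instruction and t_addrmode_s1 operand from the
// Thumb instruction definitions.
def : T1Pat<(zextloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;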
// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
@@ -896,11 +1022,41 @@ class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
// ARM VFP Instruction templates.
//
+// Almost all VFP instructions are predicable.
+class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ops pred:$p));
+ let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases
+class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+ IndexMode im, Format f, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, sz, im, f, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasVFP2];
+}
+
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+ opc, asm, "", pattern>;
+
// ARM VFP addrmode5 loads and stores
class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStFrm, opc, asm, "", pattern> {
+ : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
@@ -908,9 +1064,10 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
}
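// Illustration only, not part of this patch: a double-precision VFP load
// built on ADI5; the itinerary name IIC_fpLoad64 is an assumption.
def FLDD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
                IIC_fpLoad64, "fldd", " $dst, $addr",
                [(set DPR:$dst, (f64 (load addrmode5:$addr)))]>;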
class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+ InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : I<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStFrm, opc, asm, "", pattern> {
+ : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStFrm, itin, opc, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
@@ -918,27 +1075,28 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
}
// Load / store multiple
-class AXSI5<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStMulFrm, asm, "", pattern> {
+class AXDI5<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, itin, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-8} = 0b1011;
}
-class AXDI5<dag oops, dag iops, string asm, list<dag> pattern>
- : XI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
- VFPLdStMulFrm, asm, "", pattern> {
+class AXSI5<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+ VFPLdStMulFrm, itin, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-8} = 0b1010;
}
-
// Double precision, unary
class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
let Inst{11-8} = 0b1011;
@@ -946,17 +1104,17 @@ class ADuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
}
// Double precision, binary
-class ADbI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+class ADbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod;
let Inst{11-8} = 0b1011;
}
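// Illustration only, not part of this patch: a double-precision add built on
// ADbI; the itinerary name IIC_fpALU64 is an assumption.
def FADDD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
                 IIC_fpALU64, "faddd", " $dst, $a, $b",
                 [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;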
// Single precision, unary
class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPUnaryFrm, opc, asm, pattern> {
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
// Bits 22 (D bit) and 5 (M bit) will be changed during instruction encoding.
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
@@ -964,48 +1122,74 @@ class ASuI<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
let Inst{7-4} = opcod3;
}
+// Single precision, unary, if no NEON
+// Same as ASuI except not available if NEON is enabled.
+class ASuIn<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
// Single precision, binary
-class ASbI<bits<8> opcod, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AI<oops, iops, VFPBinaryFrm, opc, asm, pattern> {
+class ASbI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
// Bit 22 (D bit) can be changed during instruction encoding.
let Inst{27-20} = opcod;
let Inst{11-8} = 0b1010;
}
+// Single precision, binary, if no NEON
+// Same as ASbI except not available if NEON is enabled.
+class ASbIn<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : ASbI<opcod, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
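// Illustration only, not part of this patch: the "n" variants are for
// single-precision definitions that should be skipped when NEON is used for
// FP; the itinerary name IIC_fpALU32 is an assumption.
def FADDS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
                  IIC_fpALU32, "fadds", " $dst, $a, $b",
                  [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;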
// VFP conversion instructions
class AVConv1I<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : AI<oops, iops, VFPConv1Frm, opc, asm, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{19-16} = opcod2;
let Inst{11-8} = opcod3;
let Inst{6} = 1;
}
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<8> opcod1, bits<4> opcod2, bits<4> opcod3,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AVConv1I<opcod1, opcod2, opcod3, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+
class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
- string opc, string asm, list<dag> pattern>
- : AI<oops, iops, f, opc, asm, pattern> {
+ InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{4} = 1;
}
-class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, opc, asm, pattern>;
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
-class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, opc, asm, pattern>;
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
-class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, opc, asm, pattern>;
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
-class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
- string asm, list<dag> pattern>
- : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, opc, asm, pattern>;
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
//===----------------------------------------------------------------------===//
@@ -1013,9 +1197,9 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
// ARM NEON Instruction templates.
//
-class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
- string cstr, list<dag> pattern>
- : InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : InstARM<am, Size4Bytes, im, NEONFrm, cstr, itin> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
@@ -1023,20 +1207,33 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
list<Predicate> Predicates = [HasNEON];
}
-class NI<dag oops, dag iops, string asm, list<dag> pattern>
- : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
+class NI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, "", pattern> {
+}
+
+class NI4<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+ : NeonI<oops, iops, AddrMode4, IndexModeNone, itin, asm, "", pattern> {
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, asm, cstr, pattern> {
+ let Inst{31-24} = 0b11110100;
}
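// Illustration only, not part of this patch: a NEON load built on NLdSt; the
// encoding bits and the itinerary name IIC_VLD1 are assumptions.
def VLD1d8 : NLdSt<0, 0b10, 0b0111, 0b0000, (outs DPR:$dst),
                   (ins addrmode6:$addr), IIC_VLD1,
                   "vld1.8 $dst, $addr", "", []>;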
-class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
+class NDataI<dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, asm, cstr, pattern> {
let Inst{31-25} = 0b1111001;
}
// NEON "one register and a modified immediate" format.
class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
bit op5, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{23} = op23;
let Inst{21-19} = op21_19;
let Inst{11-8} = op11_8;
@@ -1049,8 +1246,9 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
// NEON 2 vector register format.
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24-23} = op24_23;
let Inst{21-20} = op21_20;
let Inst{19-18} = op19_18;
@@ -1063,8 +1261,9 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
// NEON 2 vector register with immediate.
class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-16} = op21_16;
@@ -1076,8 +1275,9 @@ class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
// NEON 3 vector register format.
class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
- dag oops, dag iops, string asm, string cstr, list<dag> pattern>
- : NDataI<oops, iops, asm, cstr, pattern> {
+ dag oops, dag iops, InstrItinClass itin,
+ string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, itin, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-20} = op21_20;
@@ -1088,9 +1288,9 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
// NEON VMOVs between scalar and core registers.
class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, Format f, string opc, string asm,
- list<dag> pattern>
- : AI<oops, iops, f, opc, asm, pattern> {
+ dag oops, dag iops, Format f, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AI<oops, iops, f, itin, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{6-5} = opcod3;
@@ -1098,13 +1298,23 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
list<Predicate> Predicates = [HasNEON];
}
class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
- pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin,
+ opc, asm, pattern>;
class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
- pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin,
+ opc, asm, pattern>;
class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
- dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin,
+ opc, asm, pattern>;
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON, UseNEONForFP];
+}
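// Illustration only, not part of this patch: such a pattern routes
// single-precision FP through a NEON instruction when UseNEONForFP is in
// effect; VADDfd_sfp is an assumed name from the NEON instruction file.
def : NEONFPPat<(f32 (fadd SPR:$a, SPR:$b)), (VADDfd_sfp SPR:$a, SPR:$b)>;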
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 443fdc7..4c92891 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -21,52 +21,15 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
-static cl::opt<bool>
-EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
- cl::desc("Enable ARM 2-addr to 3-addr conv"));
-
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
-}
-
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
-}
-
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
- : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)) {
-}
-
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+ : RI(*this, STI), Subtarget(STI) {
}
-void ARMInstrInfo::reMaterialize(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg,
- const MachineInstr *Orig) const {
- DebugLoc dl = Orig->getDebugLoc();
- if (Orig->getOpcode() == ARM::MOVi2pieces) {
- RI.emitLoadConstPool(MBB, I, this, dl,
- DestReg,
- Orig->getOperand(1).getImm(),
- (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
- Orig->getOperand(3).getReg());
- return;
- }
-
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
- MI->getOperand(0).setReg(DestReg);
- MBB.insert(I, MI);
-}
-
-static unsigned getUnindexedOpcode(unsigned Opc) {
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
switch (Opc) {
default: break;
case ARM::LDR_PRE:
@@ -94,820 +57,45 @@ static unsigned getUnindexedOpcode(unsigned Opc) {
case ARM::STRB_POST:
return ARM::STRB;
}
- return 0;
-}
-
-MachineInstr *
-ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const {
- if (!EnableARM3Addr)
- return NULL;
-
- MachineInstr *MI = MBBI;
- MachineFunction &MF = *MI->getParent()->getParent();
- unsigned TSFlags = MI->getDesc().TSFlags;
- bool isPre = false;
- switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
- default: return NULL;
- case ARMII::IndexModePre:
- isPre = true;
- break;
- case ARMII::IndexModePost:
- break;
- }
-
- // Try splitting an indexed load/store to an un-indexed one plus an add/sub
- // operation.
- unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
- if (MemOpc == 0)
- return NULL;
-
- MachineInstr *UpdateMI = NULL;
- MachineInstr *MemMI = NULL;
- unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
- const TargetInstrDesc &TID = MI->getDesc();
- unsigned NumOps = TID.getNumOperands();
- bool isLoad = !TID.mayStore();
- const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
- const MachineOperand &Base = MI->getOperand(2);
- const MachineOperand &Offset = MI->getOperand(NumOps-3);
- unsigned WBReg = WB.getReg();
- unsigned BaseReg = Base.getReg();
- unsigned OffReg = Offset.getReg();
- unsigned OffImm = MI->getOperand(NumOps-2).getImm();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
- switch (AddrMode) {
- default:
- assert(false && "Unknown indexed op!");
- return NULL;
- case ARMII::AddrMode2: {
- bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
- unsigned Amt = ARM_AM::getAM2Offset(OffImm);
- if (OffReg == 0) {
- int SOImmVal = ARM_AM::getSOImmVal(Amt);
- if (SOImmVal == -1)
- // Can't encode it in a so_imm operand. This transformation will
- // add more than 1 instruction. Abandon!
- return NULL;
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(SOImmVal)
- .addImm(Pred).addReg(0).addReg(0);
- } else if (Amt != 0) {
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
- unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
- .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
- .addImm(Pred).addReg(0).addReg(0);
- } else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
- break;
- }
- case ARMII::AddrMode3 : {
- bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
- unsigned Amt = ARM_AM::getAM3Offset(OffImm);
- if (OffReg == 0)
- // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
- .addReg(BaseReg).addImm(Amt)
- .addImm(Pred).addReg(0).addReg(0);
- else
- UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
- .addReg(BaseReg).addReg(OffReg)
- .addImm(Pred).addReg(0).addReg(0);
- break;
- }
- }
-
- std::vector<MachineInstr*> NewMIs;
- if (isPre) {
- if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
- else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
- NewMIs.push_back(MemMI);
- NewMIs.push_back(UpdateMI);
- } else {
- if (isLoad)
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc), MI->getOperand(0).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
- else
- MemMI = BuildMI(MF, MI->getDebugLoc(),
- get(MemOpc)).addReg(MI->getOperand(1).getReg())
- .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
- if (WB.isDead())
- UpdateMI->getOperand(0).setIsDead();
- NewMIs.push_back(UpdateMI);
- NewMIs.push_back(MemMI);
- }
-
- // Transfer LiveVariables states, kill / dead info.
- if (LV) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.getReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- unsigned Reg = MO.getReg();
-
- LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
- if (MO.isDef()) {
- MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
- if (MO.isDead())
- LV->addVirtualRegisterDead(Reg, NewMI);
- }
- if (MO.isUse() && MO.isKill()) {
- for (unsigned j = 0; j < 2; ++j) {
- // Look at the two new MI's in reverse order.
- MachineInstr *NewMI = NewMIs[j];
- if (!NewMI->readsRegister(Reg))
- continue;
- LV->addVirtualRegisterKilled(Reg, NewMI);
- if (VI.removeKill(MI))
- VI.Kills.push_back(NewMI);
- break;
- }
- }
- }
- }
- }
-
- MFI->insert(MBBI, NewMIs[1]);
- MFI->insert(MBBI, NewMIs[0]);
- return NewMIs[0];
-}
-
-// Branch analysis.
-bool
-ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
- // If the block has no terminators, it just falls into the block after it.
- MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
- return false;
-
- // Get the last instruction in the block.
- MachineInstr *LastInst = I;
-
- // If there is only one terminator instruction, process it.
- unsigned LastOpc = LastInst->getOpcode();
- if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
- if (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B) {
- TBB = LastInst->getOperand(0).getMBB();
- return false;
- }
- if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc || LastOpc == ARM::t2Bcc) {
- // Block ends with fall-through condbranch.
- TBB = LastInst->getOperand(0).getMBB();
- Cond.push_back(LastInst->getOperand(1));
- Cond.push_back(LastInst->getOperand(2));
- return false;
- }
- return true; // Can't handle indirect branch.
- }
-
- // Get the instruction before it if it is a terminator.
- MachineInstr *SecondLastInst = I;
-
- // If there are three terminators, we don't know what sort of block this is.
- if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
- return true;
-
- // If the block ends with ARM::B/ARM::tB/ARM::t2B and a
- // ARM::Bcc/ARM::tBcc/ARM::t2Bcc, handle it.
- unsigned SecondLastOpc = SecondLastInst->getOpcode();
- if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
- (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB) ||
- (SecondLastOpc == ARM::t2Bcc && LastOpc == ARM::t2B)) {
- TBB = SecondLastInst->getOperand(0).getMBB();
- Cond.push_back(SecondLastInst->getOperand(1));
- Cond.push_back(SecondLastInst->getOperand(2));
- FBB = LastInst->getOperand(0).getMBB();
- return false;
- }
-
- // If the block ends with two unconditional branches, handle it. The second
- // one is not executed, so remove it.
- if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB ||
- SecondLastOpc==ARM::t2B) &&
- (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) {
- TBB = SecondLastInst->getOperand(0).getMBB();
- I = LastInst;
- if (AllowModify)
- I->eraseFromParent();
- return false;
- }
-
- // ...likewise if it ends with a branch table followed by an unconditional
- // branch. The branch folder can create these, and we must get rid of them for
- // correctness of Thumb constant islands.
- if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm ||
- SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr ||
- SecondLastOpc == ARM::t2BR_JTr || SecondLastOpc==ARM::t2BR_JTm ||
- SecondLastOpc == ARM::t2BR_JTadd) &&
- (LastOpc == ARM::B || LastOpc == ARM::tB || LastOpc == ARM::t2B)) {
- I = LastInst;
- if (AllowModify)
- I->eraseFromParent();
- return true;
- }
-
- // Otherwise, can't handle this.
- return true;
-}
-
-
-unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- int BOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2B : ARM::tB) : ARM::B;
- int BccOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
-
- MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin()) return 0;
- --I;
- if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
- return 0;
-
- // Remove the branch.
- I->eraseFromParent();
-
- I = MBB.end();
-
- if (I == MBB.begin()) return 1;
- --I;
- if (I->getOpcode() != BccOpc)
- return 1;
-
- // Remove the branch.
- I->eraseFromParent();
- return 2;
-}
-
-unsigned
-ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond) const {
- // FIXME this should probably have a DebugLoc argument
- DebugLoc dl = DebugLoc::getUnknownLoc();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- int BOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2B : ARM::tB) : ARM::B;
- int BccOpc = AFI->isThumbFunction() ?
- (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
-
- // Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
- assert((Cond.size() == 2 || Cond.size() == 0) &&
- "ARM branch conditions have two components!");
- if (FBB == 0) {
- if (Cond.empty()) // Unconditional branch?
- BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
- else
- BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
- return 1;
- }
-
- // Two-way conditional branch.
- BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
- BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
- return 2;
+ return 0;
}
-bool
-ARMBaseInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+bool ARMInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
if (MBB.empty()) return false;
switch (MBB.back().getOpcode()) {
case ARM::BX_RET: // Return.
case ARM::LDM_RET:
- case ARM::tBX_RET:
- case ARM::tBX_RET_vararg:
- case ARM::tPOP_RET:
case ARM::B:
- case ARM::tB:
- case ARM::t2B: // Uncond branch.
- case ARM::tBR_JTr:
- case ARM::t2BR_JTr:
case ARM::BR_JTr: // Jumptable branch.
- case ARM::t2BR_JTm:
case ARM::BR_JTm: // Jumptable branch through mem.
- case ARM::t2BR_JTadd:
case ARM::BR_JTadd: // Jumptable branch add to pc.
return true;
- default: return false;
- }
-}
-
-bool ARMBaseInstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
- ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
- Cond[0].setImm(ARMCC::getOppositeCondition(CC));
- return false;
-}
-
-bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
-}
-
-bool ARMBaseInstrInfo::
-PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const {
- unsigned Opc = MI->getOpcode();
- if (Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B) {
- MI->setDesc(get((Opc == ARM::B) ? ARM::Bcc :
- ((Opc == ARM::tB) ? ARM::tBcc : ARM::t2Bcc)));
- MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
- MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
- return true;
- }
-
- int PIdx = MI->findFirstPredOperandIdx();
- if (PIdx != -1) {
- MachineOperand &PMO = MI->getOperand(PIdx);
- PMO.setImm(Pred[0].getImm());
- MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
- return true;
- }
- return false;
-}
-
-bool ARMBaseInstrInfo::
-SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const {
- if (Pred1.size() > 2 || Pred2.size() > 2)
- return false;
-
- ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
- ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
- if (CC1 == CC2)
- return true;
-
- switch (CC1) {
- default:
- return false;
- case ARMCC::AL:
- return true;
- case ARMCC::HS:
- return CC2 == ARMCC::HI;
- case ARMCC::LS:
- return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
- case ARMCC::GE:
- return CC2 == ARMCC::GT;
- case ARMCC::LE:
- return CC2 == ARMCC::LT;
- }
-}
-
-bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- const TargetInstrDesc &TID = MI->getDesc();
- if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
- return false;
-
- bool Found = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.getReg() == ARM::CPSR) {
- Pred.push_back(MO);
- Found = true;
- }
- }
-
- return Found;
-}
-
-
-/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
- unsigned JTI) DISABLE_INLINE;
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
- unsigned JTI) {
- return JT[JTI].MBBs.size();
-}
-
-/// GetInstSize - Return the size of the specified MachineInstr.
-///
-unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MachineBasicBlock &MBB = *MI->getParent();
- const MachineFunction *MF = MBB.getParent();
- const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo();
-
- // Basic size info comes from the TSFlags field.
- const TargetInstrDesc &TID = MI->getDesc();
- unsigned TSFlags = TID.TSFlags;
-
- switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
- default: {
- // If this machine instr is an inline asm, measure it.
- if (MI->getOpcode() == ARM::INLINEASM)
- return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName());
- if (MI->isLabel())
- return 0;
- switch (MI->getOpcode()) {
- default:
- assert(0 && "Unknown or unset size field for instr!");
- break;
- case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
- case TargetInstrInfo::DBG_LABEL:
- case TargetInstrInfo::EH_LABEL:
- return 0;
- }
- break;
- }
- case ARMII::Size8Bytes: return 8; // Arm instruction x 2.
- case ARMII::Size4Bytes: return 4; // Arm instruction.
- case ARMII::Size2Bytes: return 2; // Thumb instruction.
- case ARMII::SizeSpecial: {
- switch (MI->getOpcode()) {
- case ARM::CONSTPOOL_ENTRY:
- // If this machine instr is a constant pool entry, its size is recorded as
- // operand #2.
- return MI->getOperand(2).getImm();
- case ARM::Int_eh_sjlj_setjmp: return 12;
- case ARM::BR_JTr:
- case ARM::BR_JTm:
- case ARM::BR_JTadd:
- case ARM::t2BR_JTr:
- case ARM::t2BR_JTm:
- case ARM::t2BR_JTadd:
- case ARM::tBR_JTr: {
- // These are jumptable branches, i.e. a branch followed by an inlined
- // jumptable. The size is 4 + 4 * number of entries.
- unsigned NumOps = TID.getNumOperands();
- MachineOperand JTOP =
- MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
- unsigned JTI = JTOP.getIndex();
- const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
- const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
- assert(JTI < JT.size());
- // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
-      // aligned. The assembler / linker may add 2 byte padding just before
- // the JT entries. The size does not include this padding; the
- // constant islands pass does separate bookkeeping for it.
- // FIXME: If we know the size of the function is less than (1 << 16) *2
- // bytes, we can use 16-bit entries instead. Then there won't be an
- // alignment issue.
- return getNumJTEntries(JT, JTI) * 4 +
- ((MI->getOpcode()==ARM::tBR_JTr) ? 2 : 4);
- }
- default:
- // Otherwise, pseudo-instruction sizes are zero.
- return 0;
- }
- }
- }
- return 0; // Not reached
-}
-
-/// Return true if the instruction is a register to register move and
-/// leave the source and dest operands in the passed parameters.
-///
-bool
-ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
default:
- return false;
- case ARM::FCPYS:
- case ARM::FCPYD:
- case ARM::VMOVD:
- case ARM::VMOVQ:
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- case ARM::MOVr:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid ARM MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned
-ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::LDR:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- case ARM::FLDD:
- case ARM::FLDS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
- return 0;
-}
-
-unsigned
-ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::STR:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isReg() &&
- MI->getOperand(3).isImm() &&
- MI->getOperand(2).getReg() == 0 &&
- MI->getOperand(3).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- case ARM::FSTD:
- case ARM::FSTS:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
break;
}
- return 0;
-}
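Both stack-slot recognizers (this one and isLoadFromStackSlot above) test the same addrmode2 operand shape: a frame index with a null offset register and a zero immediate. Factored out as a sketch, with the helper name purely illustrative:

    // True if operands 1..3 of an ARM::LDR/ARM::STR describe a bare
    // frame-index access: [fi, #0], no offset register.
    static bool isBareFrameIndexAccess(const MachineInstr *MI) {
      return MI->getOperand(1).isFI() &&
             MI->getOperand(2).isReg() && MI->getOperand(2).getReg() == 0 &&
             MI->getOperand(3).isImm() && MI->getOperand(3).getImm() == 0;
    }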
-
-bool
-ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (DestRC != SrcRC) {
- // Not yet supported!
- return false;
- }
-
- if (DestRC == ARM::GPRRegisterClass)
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
- .addReg(SrcReg)));
- else if (DestRC == ARM::SPRRegisterClass)
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYS), DestReg)
- .addReg(SrcReg));
- else if (DestRC == ARM::DPRRegisterClass)
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
- .addReg(SrcReg));
- else if (DestRC == ARM::QPRRegisterClass)
- BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
- else
- return false;
-
- return true;
-}
-
-void ARMBaseInstrInfo::
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
- const TargetRegisterClass *RC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (RC == ARM::GPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addReg(0).addImm(0));
- } else if (RC == ARM::DPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0));
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTS))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0));
- }
-}
-
-void
-ARMBaseInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- if (RC == ARM::GPRRegisterClass) {
- Opc = ARM::STR;
- } else if (RC == ARM::DPRRegisterClass) {
- Opc = ARM::FSTD;
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- Opc = ARM::FSTS;
- }
-
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- AddDefaultPred(MIB);
- NewMIs.push_back(MIB);
- return;
-}
-
-void ARMBaseInstrInfo::
-loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
- const TargetRegisterClass *RC) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (I != MBB.end()) DL = I->getDebugLoc();
-
- if (RC == ARM::GPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
- .addFrameIndex(FI).addReg(0).addImm(0));
- } else if (RC == ARM::DPRRegisterClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg)
- .addFrameIndex(FI).addImm(0));
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDS), DestReg)
- .addFrameIndex(FI).addImm(0));
- }
-}
-
-void ARMBaseInstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- if (RC == ARM::GPRRegisterClass) {
- Opc = ARM::LDR;
- } else if (RC == ARM::DPRRegisterClass) {
- Opc = ARM::FLDD;
- } else {
- assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
- Opc = ARM::FLDS;
- }
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- AddDefaultPred(MIB);
- NewMIs.push_back(MIB);
- return;
+ return false;
}
-MachineInstr *ARMBaseInstrInfo::
-foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops, int FI) const {
- if (Ops.size() != 1) return NULL;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- MachineInstr *NewMI = NULL;
- switch (Opc) {
- default: break;
- case ARM::MOVr: {
- if (MI->getOperand(4).getReg() == ARM::CPSR)
- // If it is updating CPSR, then it cannot be folded.
- break;
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
- case ARM::FCPYS: {
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTS))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI)
- .addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDS))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
- case ARM::FCPYD: {
- unsigned Pred = MI->getOperand(2).getImm();
- unsigned PredReg = MI->getOperand(3).getReg();
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- bool isUndef = MI->getOperand(1).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FSTD))
- .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isUndef = MI->getOperand(0).isUndef();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::FLDD))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(isDead) |
- getUndefRegState(isUndef))
- .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
- }
- break;
- }
+void ARMInstrInfo::
+reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig) const {
+ DebugLoc dl = Orig->getDebugLoc();
+ if (Orig->getOpcode() == ARM::MOVi2pieces) {
+ RI.emitLoadConstPool(MBB, I, dl,
+ DestReg, SubIdx,
+ Orig->getOperand(1).getImm(),
+ (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
+ Orig->getOperand(3).getReg());
+ return;
}
- return NewMI;
-}
-
-MachineInstr*
-ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const {
- return 0;
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
}
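The updated hook special-cases ARM::MOVi2pieces, which must be rebuilt via the constant pool, and clones any other rematerializable def. A hedged usage sketch from a hypothetical caller (TII, MBB, InsertPt, NewReg and DefMI are placeholders):

    // Re-emit DefMI's computation at InsertPt instead of copying its
    // result; SubIdx 0 asks for the full register.
    TII->reMaterialize(MBB, InsertPt, NewReg, /*SubIdx=*/0, DefMI);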
-bool
-ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const {
- if (Ops.size() != 1) return false;
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default: break;
- case ARM::MOVr:
- // If it is updating CPSR, then it cannot be folded.
- return MI->getOperand(4).getReg() != ARM::CPSR;
- case ARM::FCPYS:
- case ARM::FCPYD:
- return true;
-
- case ARM::VMOVD:
- case ARM::VMOVQ:
- return false; // FIXME
- }
-
- return false;
-}
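canFoldMemoryOperand is the cheap pre-check for foldMemoryOperandImpl above: MOVr folds unless it updates CPSR, FCPYS/FCPYD always fold, and the NEON moves are deferred. A sketch of the intended call pattern (MF, MI and FrameIdx are placeholders; real clients go through the target-independent foldMemoryOperand wrapper):

    SmallVector<unsigned, 1> Ops;
    Ops.push_back(0); // fold the def: the move becomes a store to the slot
    if (TII->canFoldMemoryOperand(MI, Ops))
      if (MachineInstr *NewMI =
            TII->foldMemoryOperandImpl(MF, MI, Ops, FrameIdx)) {
        // Replace MI with NewMI in its parent block.
      }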
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 8c8f788..c616949 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -15,247 +15,27 @@
#define ARMINSTRUCTIONINFO_H
#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
#include "ARM.h"
namespace llvm {
class ARMSubtarget;
-/// ARMII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace ARMII {
- enum {
- //===------------------------------------------------------------------===//
- // Instruction Flags.
-
- //===------------------------------------------------------------------===//
- // This four-bit field describes the addressing mode used.
-
- AddrModeMask = 0xf,
- AddrModeNone = 0,
- AddrMode1 = 1,
- AddrMode2 = 2,
- AddrMode3 = 3,
- AddrMode4 = 4,
- AddrMode5 = 5,
- AddrMode6 = 6,
- AddrModeT1_1 = 7,
- AddrModeT1_2 = 8,
- AddrModeT1_4 = 9,
- AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data
- AddrModeT2_i12 = 11,
- AddrModeT2_i8 = 12,
- AddrModeT2_so = 13,
- AddrModeT2_pc = 14, // +/- i12 for pc relative data
- AddrModeT2_i8s4 = 15, // i8 * 4
-
- // Size* - Flags to keep track of the size of an instruction.
- SizeShift = 4,
- SizeMask = 7 << SizeShift,
- SizeSpecial = 1, // 0 byte pseudo or special case.
- Size8Bytes = 2,
- Size4Bytes = 3,
- Size2Bytes = 4,
-
- // IndexMode - Unindexed, pre-indexed, or post-indexed. Only valid for load
- // and store ops.
- IndexModeShift = 7,
- IndexModeMask = 3 << IndexModeShift,
- IndexModePre = 1,
- IndexModePost = 2,
-
- //===------------------------------------------------------------------===//
- // Misc flags.
-
- // UnaryDP - Indicates this is a unary data processing instruction, i.e.,
- // it doesn't have an Rn operand.
- UnaryDP = 1 << 9,
-
- //===------------------------------------------------------------------===//
- // Instruction encoding formats.
- //
- FormShift = 10,
- FormMask = 0x1f << FormShift,
-
- // Pseudo instructions
- Pseudo = 0 << FormShift,
-
- // Multiply instructions
- MulFrm = 1 << FormShift,
-
- // Branch instructions
- BrFrm = 2 << FormShift,
- BrMiscFrm = 3 << FormShift,
-
- // Data Processing instructions
- DPFrm = 4 << FormShift,
- DPSoRegFrm = 5 << FormShift,
-
- // Load and Store
- LdFrm = 6 << FormShift,
- StFrm = 7 << FormShift,
- LdMiscFrm = 8 << FormShift,
- StMiscFrm = 9 << FormShift,
- LdStMulFrm = 10 << FormShift,
-
- // Miscellaneous arithmetic instructions
- ArithMiscFrm = 11 << FormShift,
-
- // Extend instructions
- ExtFrm = 12 << FormShift,
-
- // VFP formats
- VFPUnaryFrm = 13 << FormShift,
- VFPBinaryFrm = 14 << FormShift,
- VFPConv1Frm = 15 << FormShift,
- VFPConv2Frm = 16 << FormShift,
- VFPConv3Frm = 17 << FormShift,
- VFPConv4Frm = 18 << FormShift,
- VFPConv5Frm = 19 << FormShift,
- VFPLdStFrm = 20 << FormShift,
- VFPLdStMulFrm = 21 << FormShift,
- VFPMiscFrm = 22 << FormShift,
-
- // Thumb format
- ThumbFrm = 23 << FormShift,
-
- // NEON format
- NEONFrm = 24 << FormShift,
- NEONGetLnFrm = 25 << FormShift,
- NEONSetLnFrm = 26 << FormShift,
- NEONDupFrm = 27 << FormShift,
-
- //===------------------------------------------------------------------===//
- // Field shifts - such shifts are used to set fields while generating
- // machine instructions.
- M_BitShift = 5,
- ShiftImmShift = 5,
- ShiftShift = 7,
- N_BitShift = 7,
- ImmHiShift = 8,
- SoRotImmShift = 8,
- RegRsShift = 8,
- ExtRotImmShift = 10,
- RegRdLoShift = 12,
- RegRdShift = 12,
- RegRdHiShift = 16,
- RegRnShift = 16,
- S_BitShift = 20,
- W_BitShift = 21,
- AM3_I_BitShift = 22,
- D_BitShift = 22,
- U_BitShift = 23,
- P_BitShift = 24,
- I_BitShift = 25,
- CondShift = 28
- };
-}
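These enumerators carve one TSFlags word into disjoint bitfields, so a descriptor's properties are recovered with masks and shifts. An illustrative C++ fragment (the packed value is made up for the example):

    // A hypothetical addrmode2 load, 4 bytes wide, Ld format.
    unsigned Flags = ARMII::AddrMode2
                   | (ARMII::Size4Bytes << ARMII::SizeShift)
                   | ARMII::LdFrm;
    unsigned AM   = Flags & ARMII::AddrModeMask;                   // AddrMode2
    unsigned Size = (Flags & ARMII::SizeMask) >> ARMII::SizeShift; // Size4Bytes
    unsigned Form = Flags & ARMII::FormMask;                       // LdFrm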
-
-class ARMBaseInstrInfo : public TargetInstrInfoImpl {
-protected:
- // Can be only subclassed.
- explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
-public:
- virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
-
- virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
-
- // Branch analysis.
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond) const;
-
- virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
- virtual
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
- // Predication support.
- virtual bool isPredicated(const MachineInstr *MI) const;
-
- ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
- int PIdx = MI->findFirstPredOperandIdx();
- return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
- : ARMCC::AL;
- }
-
- virtual
- bool PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const;
-
- virtual
- bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
-
- virtual bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
-
- /// GetInstSize - Returns the size of the specified MachineInstr.
- ///
- virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
-
- /// Return true if the instruction is a register to register move and return
- /// the source and dest operands and their sub-register indices by reference.
- virtual bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
-
- virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
-
- virtual bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC) const;
-
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
-
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
-
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const;
-};
-
class ARMInstrInfo : public ARMBaseInstrInfo {
ARMRegisterInfo RI;
+ const ARMSubtarget &Subtarget;
public:
explicit ARMInstrInfo(const ARMSubtarget &STI);
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
+
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
@@ -263,7 +43,8 @@ public:
const ARMRegisterInfo &getRegisterInfo() const { return RI; }
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, const MachineInstr *Orig) const;
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig) const;
};
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 408f47a..8adfac3 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -34,6 +34,10 @@ def SDT_ARMBrJT : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
+def SDT_ARMBr2JT : SDTypeProfile<0, 4,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
@@ -71,6 +75,8 @@ def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
[SDNPHasChain]>;
+def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+ [SDNPHasChain]>;
def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
[SDNPOutFlag]>;
@@ -93,10 +99,14 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
+def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">;
+def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">;
+def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">;
@@ -117,25 +127,16 @@ class RegConstraint<string C> {
// ARM specific transformation functions and pattern fragments.
//
-// so_imm_XFORM - Return a so_imm value packed into the format described for
-// so_imm def below.
-def so_imm_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getZExtValue()),
- MVT::i32);
-}]>;
-
// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
// so_imm_neg def below.
def so_imm_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getZExtValue()),
- MVT::i32);
+ return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
}]>;
// so_imm_not_XFORM - Return a so_imm value packed into the format described for
// so_imm_not def below.
def so_imm_not_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getZExtValue()),
- MVT::i32);
+ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
}]>;
// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
@@ -169,6 +170,48 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{
return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
}]>;
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def bf_inv_mask_imm : Operand<i32>,
+ PatLeaf<(imm), [{
+ uint32_t v = (uint32_t)N->getZExtValue();
+ if (v == 0xffffffff)
+ return 0;
+ // There can be 1's on either or both "outsides"; all the "inside"
+ // bits must be 0's.
+ unsigned int lsb = 0, msb = 31;
+ while (v & (1 << msb)) --msb;
+ while (v & (1 << lsb)) ++lsb;
+ for (unsigned int i = lsb; i <= msb; ++i) {
+ if (v & (1 << i))
+ return 0;
+ }
+ return 1;
+}]> {
+ let PrintMethod = "printBitfieldInvMaskImmOperand";
+}
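In plain terms the predicate accepts any mask whose 0 bits form a single contiguous run, with 1 bits permitted on both sides. The same check as a self-contained C++ function (a sketch equivalent to the TableGen body above):

    #include <cstdint>

    // True for AND masks like 0xf000ffff: one contiguous run of 0 bits,
    // 1's everywhere outside it. All-ones is rejected (nothing to clear).
    static bool isBitfieldInvMask(uint32_t v) {
      if (v == 0xffffffffu)
        return false;
      unsigned msb = 31, lsb = 0;
      while (v & (1u << msb)) --msb; // skip the high 1's
      while (v & (1u << lsb)) ++lsb; // skip the low 1's
      for (unsigned i = lsb; i <= msb; ++i)
        if (v & (1u << i))
          return false;              // a 1 inside the run: not a clear mask
      return true;
    }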
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def lo16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
+ MVT::i32);
+}]>;
+
+def hi16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+ // Returns true if all low 16-bits are 0.
+ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+ }], hi16>;
+
+/// imm0_65535 predicate - True if the 32-bit immediate is in the range
+/// [0,65535].
+def imm0_65535 : PatLeaf<(i32 imm), [{
+ return (uint32_t)N->getZExtValue() < 65536;
+}]>;
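lo16/hi16 split a constant at bit 16; paired with the MOVi16/MOVTi16 definitions added later in this patch, they let any 32-bit immediate be built in two instructions on v6T2 and above. The arithmetic, spelled out:

    #include <cstdint>

    // movw writes the low half and zeroes the top; movt patches the top.
    uint32_t lo16(uint32_t imm) { return imm & 0xffff; }
    uint32_t hi16(uint32_t imm) { return imm >> 16; }
    // 0x12345678  ->  movw dst, #0x5678 ; movt dst, #0x1234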
+
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
@@ -192,6 +235,9 @@ def cpinst_operand : Operand<i32> {
def jtblock_operand : Operand<i32> {
let PrintMethod = "printJTBlockOperand";
}
+def jt2block_operand : Operand<i32> {
+ let PrintMethod = "printJT2BlockOperand";
+}
// Local PC labels.
def pclabel : Operand<i32> {
@@ -212,9 +258,9 @@ def so_reg : Operand<i32>, // reg reg imm
// into so_imm instructions: the 8-bit immediate is the least significant bits
// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
def so_imm : Operand<i32>,
- PatLeaf<(imm),
- [{ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; }],
- so_imm_XFORM> {
+ PatLeaf<(imm), [{
+ return ARM_AM::getSOImmVal(N->getZExtValue()) != -1;
+ }]> {
let PrintMethod = "printSOImmOperand";
}
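so_imm now carries the plain value and uses ARM_AM::getSOImmVal purely as a legality test: an ARM operand-2 immediate must be an 8-bit value rotated right by an even amount. A self-contained sketch of that test (assuming getSOImmVal returns -1 exactly when no such encoding exists; the real routine also returns the packed encoding):

    #include <cstdint>

    static uint32_t rotl32(uint32_t v, unsigned amt) {
      amt &= 31;
      return (v << amt) | (v >> ((32 - amt) & 31));
    }

    // Encodable iff some even left-rotation of Imm fits in 8 bits,
    // i.e. Imm == rotr(byte, 2*k) for some byte and k in [0,15].
    static bool isSOImmEncodable(uint32_t Imm) {
      for (unsigned rot = 0; rot < 32; rot += 2)
        if ((rotl32(Imm, rot) & ~0xffu) == 0)
          return true;
      return false;
    }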
@@ -230,14 +276,18 @@ def so_imm2part : Operand<i32>,
def so_imm2part_1 : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+ return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
def so_imm2part_2 : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
- return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+ return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
+ return (int32_t)N->getZExtValue() < 32;
+}]>;
// Define ARM specific addressing modes.
@@ -274,7 +324,7 @@ def am3offset : Operand<i32>,
// addrmode4 := reg, <mode|W>
//
def addrmode4 : Operand<i32>,
- ComplexPattern<i32, 2, "", []> {
+ ComplexPattern<i32, 2, "SelectAddrMode4", []> {
let PrintMethod = "printAddrMode4Operand";
let MIOperandInfo = (ops GPR, i32imm);
}
@@ -303,17 +353,8 @@ def addrmodepc : Operand<i32>,
let MIOperandInfo = (ops GPR, i32imm);
}
-// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
-// register whose default is 0 (no register).
-def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
- (ops (i32 14), (i32 zero_reg))> {
- let PrintMethod = "printPredicateOperand";
-}
-
-// Conditional code result for instructions whose 's' bit is set, e.g. subs.
-//
-def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
- let PrintMethod = "printSBitModifierOperand";
+def nohash_imm : Operand<i32> {
+ let PrintMethod = "printNoHashImmediate";
}
//===----------------------------------------------------------------------===//
@@ -329,34 +370,44 @@ include "ARMInstrFormats.td"
multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- opc, " $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ IIC_iALUi, opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+ let Inst{25} = 1;
+ }
def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
- opc, " $dst, $a, $b",
+ IIC_iALUr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
+ let Inst{25} = 0;
let isCommutable = Commutable;
}
def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- opc, " $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+ IIC_iALUsr, opc, " $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+ let Inst{25} = 0;
+ }
}
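The `let Inst{25} = ...` lines added throughout pin bit 25 of the encoding, the data-processing I bit: 1 selects the rotated-immediate form of operand 2, 0 the register (possibly shifted) form. Reading it back from a fetched word is a one-line mask test:

    // Insn holds a 32-bit ARM data-processing instruction word.
    bool HasImmOperand2 = (Insn >> 25) & 1; // I bit: so_imm vs. so_reg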
/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
-/// instruction modifies the CSPR register.
+/// instruction modifies the CPSR register.
let Defs = [CPSR] in {
multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- opc, "s $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+ IIC_iALUi, opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+ let Inst{25} = 1;
+ }
def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
- opc, "s $dst, $a, $b",
+ IIC_iALUr, opc, "s $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
let isCommutable = Commutable;
+ let Inst{25} = 0;
}
def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- opc, "s $dst, $a, $b",
- [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+ IIC_iALUsr, opc, "s $dst, $a, $b",
+ [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+ let Inst{25} = 0;
+ }
}
}
@@ -366,17 +417,25 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
let Defs = [CPSR] in {
multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
- def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm,
+ def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi,
opc, " $a, $b",
- [(opnode GPR:$a, so_imm:$b)]>;
- def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm,
+ [(opnode GPR:$a, so_imm:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 1;
+ }
+ def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr,
opc, " $a, $b",
[(opnode GPR:$a, GPR:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 0;
let isCommutable = Commutable;
}
- def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+ def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr,
opc, " $a, $b",
- [(opnode GPR:$a, so_reg:$b)]>;
+ [(opnode GPR:$a, so_reg:$b)]> {
+ let Inst{20} = 1;
+ let Inst{25} = 0;
+ }
}
}
@@ -384,15 +443,15 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
/// register and one whose operand is a register rotated by 8/16/24.
/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
- def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src),
- opc, " $dst, $Src",
- [(set GPR:$dst, (opnode GPR:$Src))]>,
+ def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src),
+ IIC_iUNAr, opc, " $dst, $src",
+ [(set GPR:$dst, (opnode GPR:$src))]>,
Requires<[IsARM, HasV6]> {
let Inst{19-16} = 0b1111;
}
- def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$Src, i32imm:$rot),
- opc, " $dst, $Src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>,
+ def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot),
+ IIC_iUNAsi, opc, " $dst, $src, ror $rot",
+ [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]> {
let Inst{19-16} = 0b1111;
}
@@ -402,11 +461,11 @@ multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
/// register and one whose operand is a register rotated by 8/16/24.
multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> {
def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
- opc, " $dst, $LHS, $RHS",
+ IIC_iALUr, opc, " $dst, $LHS, $RHS",
[(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
Requires<[IsARM, HasV6]>;
def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
- opc, " $dst, $LHS, $RHS, ror $rot",
+ IIC_iALUsi, opc, " $dst, $LHS, $RHS, ror $rot",
[(set GPR:$dst, (opnode GPR:$LHS,
(rotr GPR:$RHS, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]>;
@@ -417,37 +476,45 @@ let Uses = [CPSR] in {
multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, opc, " $dst, $a, $b",
+ DPFrm, IIC_iALUi, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+ }
def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- DPFrm, opc, " $dst, $a, $b",
+ DPFrm, IIC_iALUr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
Requires<[IsARM, CarryDefIsUnused]> {
let isCommutable = Commutable;
+ let Inst{25} = 0;
}
def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, opc, " $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, opc, " $dst, $a, $b",
[(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 0;
+ }
// Carry setting variants
def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPFrm, IIC_iALUi, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 1;
}
def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- DPFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPFrm, IIC_iALUr, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 0;
}
def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, !strconcat(opc, "s $dst, $a, $b"),
+ DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "s $dst, $a, $b"),
[(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
Requires<[IsARM, CarryDefIsUsed]> {
- let Defs = [CPSR];
+ let Defs = [CPSR];
+ let Inst{25} = 0;
}
}
}
@@ -467,23 +534,23 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
let neverHasSideEffects = 1, isNotDuplicable = 1 in
def CONSTPOOL_ENTRY :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
- i32imm:$size),
+ i32imm:$size), NoItinerary,
"${instid:label} ${cpidx:cpentry}", []>;
let Defs = [SP], Uses = [SP] in {
def ADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p),
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
"@ ADJCALLSTACKUP $amt1",
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p),
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
"@ ADJCALLSTACKDOWN $amt",
[(ARMcallseq_start timm:$amt)]>;
}
def DWARF_LOC :
-PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
+PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file), NoItinerary,
".loc $file, $line, $col",
[(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>;
@@ -491,42 +558,42 @@ PseudoInst<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$file),
// Address computation and loads and stores in PIC mode.
let isNotDuplicable = 1 in {
def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
- Pseudo, "$cp:\n\tadd$p $dst, pc, $a",
+ Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p $dst, pc, $a",
[(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
let AddedComplexity = 10 in {
let canFoldAsLoad = 1 in
def PICLDR : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr$p $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p $dst, $addr",
[(set GPR:$dst, (load addrmodepc:$addr))]>;
def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}h $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}h $dst, $addr",
[(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}b $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}b $dst, $addr",
[(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}sh $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sh $dst, $addr",
[(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tldr${p}sb $dst, $addr",
+ Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr${p}sb $dst, $addr",
[(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
}
let AddedComplexity = 10 in {
def PICSTR : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr$p $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p $src, $addr",
[(store GPR:$src, addrmodepc:$addr)]>;
def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr${p}h $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}h $src, $addr",
[(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
- Pseudo, "${addr:label}:\n\tstr${p}b $src, $addr",
+ Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr${p}b $src, $addr",
[(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
}
} // isNotDuplicable = 1
@@ -534,135 +601,152 @@ def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
+def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p),
+ Pseudo, IIC_iALUi,
+ !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, ($label-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #${:private}PCRELV${:uid}")),
[]>;
def LEApcrelJT : AXI1<0x0, (outs GPR:$dst),
- (ins i32imm:$label, i32imm:$id, pred:$p),
- Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
+ (ins i32imm:$label, nohash_imm:$id, pred:$p),
+ Pseudo, IIC_iALUi,
+ !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, "
+ "(${label}_${id}-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #${:private}PCRELV${:uid}")),
+ []> {
+ let Inst{25} = 1;
+}
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
-let isReturn = 1, isTerminator = 1 in
- def BX_RET : AI<(outs), (ins), BrMiscFrm, "bx", " lr", [(ARMretflag)]> {
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+ def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+ "bx", " lr", [(ARMretflag)]> {
let Inst{7-4} = 0b0001;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
// FIXME: remove when we have a way to mark a MI with these properties.
-// FIXME: $dst1 should be a def. But the extra ops must be in the end of the
-// operand list.
// FIXME: Should pc be an implicit operand like PICADD, etc?
-let isReturn = 1, isTerminator = 1 in
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
def LDM_RET : AXI4ld<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_Br, "ldm${p}${addr:submode} $addr, $wb",
[]>;
// On non-Darwin platforms R9 is callee-saved.
-let isCall = 1, Itinerary = IIC_Br,
- Defs = [R0, R1, R2, R3, R12, LR,
- D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
+ IIC_Br, "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsARM, IsNotDarwin]>;
def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl", " ${func:call}",
- [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
+ IIC_Br, "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>,
+ Requires<[IsARM, IsNotDarwin]>;
// ARMv5T and above
def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
- "blx $func",
- [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> {
+ IIC_Br, "blx $func",
+ [(ARMcall GPR:$func)]>,
+ Requires<[IsARM, HasV5T, IsNotDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
- let Uses = [LR] in {
- // ARMv4T
- def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
- "mov lr, pc\n\tbx $func",
- [(ARMcall_nolink GPR:$func)]>, Requires<[IsNotDarwin]>;
+ // ARMv4T
+ def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ IIC_Br, "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>,
+ Requires<[IsARM, IsNotDarwin]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
}
}
// On Darwin R9 is call-clobbered.
-let isCall = 1, Itinerary = IIC_Br,
- Defs = [R0, R1, R2, R3, R9, R12, LR,
- D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R9, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+ IIC_Br, "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]>;
def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
- "bl", " ${func:call}",
- [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+ IIC_Br, "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>,
+ Requires<[IsARM, IsDarwin]>;
// ARMv5T and above
def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
- "blx $func",
+ IIC_Br, "blx $func",
[(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
- let Uses = [LR] in {
- // ARMv4T
- def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
- "mov lr, pc\n\tbx $func",
- [(ARMcall_nolink GPR:$func)]>, Requires<[IsDarwin]>;
+ // ARMv4T
+ def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ IIC_Br, "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>, Requires<[IsARM, IsDarwin]> {
+ let Inst{7-4} = 0b0001;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
}
}
-let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
+let isBranch = 1, isTerminator = 1 in {
// B is "predicable" since it can be xformed into a Bcc.
let isBarrier = 1 in {
let isPredicable = 1 in
- def B : ABXI<0b1010, (outs), (ins brtarget:$target), "b $target",
- [(br bb:$target)]>;
+ def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br,
+ "b $target", [(br bb:$target)]>;
let isNotDuplicable = 1, isIndirectBranch = 1 in {
def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
- "mov pc, $target \n$jt",
+ IIC_Br, "mov pc, $target \n$jt",
[(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
let Inst{20} = 0; // S Bit
let Inst{24-21} = 0b1101;
- let Inst{27-26} = {0,0};
+ let Inst{27-25} = 0b000;
}
def BR_JTm : JTI<(outs),
(ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id),
- "ldr pc, $target \n$jt",
- [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
- imm:$id)]> {
+ IIC_Br, "ldr pc, $target \n$jt",
+ [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+ imm:$id)]> {
let Inst{20} = 1; // L bit
let Inst{21} = 0; // W bit
let Inst{22} = 0; // B bit
let Inst{24} = 1; // P bit
- let Inst{27-26} = {0,1};
+ let Inst{27-25} = 0b011;
}
def BR_JTadd : JTI<(outs),
(ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
- "add pc, $target, $idx \n$jt",
+ IIC_Br, "add pc, $target, $idx \n$jt",
[(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
imm:$id)]> {
let Inst{20} = 0; // S bit
let Inst{24-21} = 0b0100;
- let Inst{27-26} = {0,0};
+ let Inst{27-25} = 0b000;
}
} // isNotDuplicable = 1, isIndirectBranch = 1
} // isBarrier = 1
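Each of these pseudos prints a pc-writing instruction immediately followed by the inline jump table, which is why GetInstSizeInBytes earlier charged 4 bytes of branch plus 4 bytes per entry. That accounting as a sketch, with NumEntries a placeholder:

    // 4-byte entries after a 4-byte ARM branch; the Thumb tBR_JTr variant
    // keeps 4-byte entries behind a 2-byte branch.
    unsigned jtBlockSize(unsigned NumEntries, bool IsThumb) {
      return NumEntries * 4 + (IsThumb ? 2 : 4);
    }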
@@ -670,7 +754,7 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
def Bcc : ABI<0b1010, (outs), (ins brtarget:$target),
- "b", " $target",
+ IIC_Br, "b", " $target",
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>;
}
@@ -679,133 +763,141 @@ let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
//
// Load
-let canFoldAsLoad = 1 in
-def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
"ldr", " $dst, $addr",
[(set GPR:$dst, (load addrmode2:$addr))]>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
-def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
"ldr", " $dst, $addr", []>;
// Loads with zero extension
def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "h $dst, $addr",
- [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "h $dst, $addr",
+ [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
-def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
- "ldr", "b $dst, $addr",
- [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
+ IIC_iLoadr, "ldr", "b $dst, $addr",
+ [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
// Loads with sign extension
def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "sh $dst, $addr",
- [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "sh $dst, $addr",
+ [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "sb $dst, $addr",
- [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
+ IIC_iLoadr, "ldr", "sb $dst, $addr",
+ [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm,
- "ldr", "d $dst1, $addr", []>, Requires<[IsARM, HasV5T]>;
+ IIC_iLoadr, "ldr", "d $dst1, $addr",
+ []>, Requires<[IsARM, HasV5TE]>;
// Indexed loads
def LDR_PRE : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode2:$addr), LdFrm,
+ (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
"ldr", " $dst, $addr!", "$addr.base = $base_wb", []>;
def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base, am2offset:$offset), LdFrm,
+ (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru,
"ldr", " $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRH_PRE : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRB_PRE : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode2:$addr), LdFrm,
+ (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
"ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am2offset:$offset), LdFrm,
+ (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru,
"ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>;
def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb),
- (ins addrmode3:$addr), LdMiscFrm,
+ (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
"ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>;
def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
- (ins GPR:$base,am3offset:$offset), LdMiscFrm,
+ (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
"ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>;
}
// Store
-def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
"str", " $src, $addr",
[(store GPR:$src, addrmode2:$addr)]>;
// Stores with truncate
-def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm,
+def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, IIC_iStorer,
"str", "h $src, $addr",
[(truncstorei16 GPR:$src, addrmode3:$addr)]>;
-def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm,
+def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
"str", "b $src, $addr",
[(truncstorei8 GPR:$src, addrmode2:$addr)]>;
// Store doubleword
-let mayStore = 1 in
-def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),StMiscFrm,
- "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5T]>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),
+ StMiscFrm, IIC_iStorer,
+ "str", "d $src1, $addr", []>, Requires<[IsARM, HasV5TE]>;
// Indexed stores
def STR_PRE : AI2stwpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base, am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base, am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
def STR_POST : AI2stwpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
def STRH_PRE : AI3sthpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ (ins GPR:$src, GPR:$base,am3offset:$offset),
+ StMiscFrm, IIC_iStoreru,
"str", "h $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
def STRH_POST: AI3sthpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm,
+ (ins GPR:$src, GPR:$base,am3offset:$offset),
+ StMiscFrm, IIC_iStoreru,
"str", "h $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb, (post_truncsti16 GPR:$src,
GPR:$base, am3offset:$offset))]>;
def STRB_PRE : AI2stbpr<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", "b $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
GPR:$base, am2offset:$offset))]>;
def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
- (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm,
+ (ins GPR:$src, GPR:$base,am2offset:$offset),
+ StFrm, IIC_iStoreru,
"str", "b $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb, (post_truncsti8 GPR:$src,
GPR:$base, am2offset:$offset))]>;
@@ -814,17 +906,16 @@ def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
// Load / store multiple Instructions.
//
-// FIXME: $dst1 should be a def.
-let mayLoad = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def LDM : AXI4ld<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_iLoadm, "ldm${p}${addr:submode} $addr, $wb",
[]>;
-let mayStore = 1 in
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def STM : AXI4st<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
- LdStMulFrm, "stm${p}${addr:submode} $addr, $src1",
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ LdStMulFrm, IIC_iStorem, "stm${p}${addr:submode} $addr, $wb",
[]>;
//===----------------------------------------------------------------------===//
@@ -832,16 +923,42 @@ def STM : AXI4st<(outs),
//
let neverHasSideEffects = 1 in
-def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm,
- "mov", " $dst, $src", []>, UnaryDP;
-def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
- "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP;
+def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
+ "mov", " $dst, $src", []>, UnaryDP;
+def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src),
+ DPSoRegFrm, IIC_iMOVsr,
+ "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm,
- "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP;
+def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi,
+ "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP {
+ let Inst{25} = 1;
+}
-def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src),
+ DPFrm, IIC_iMOVi,
+ "movw", " $dst, $src",
+ [(set GPR:$dst, imm0_65535:$src)]>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{20} = 0;
+ let Inst{25} = 1;
+}
+
+let Constraints = "$src = $dst" in
+def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
+ DPFrm, IIC_iMOVi,
+ "movt", " $dst, $imm",
+ [(set GPR:$dst,
+ (or (and GPR:$src, 0xffff),
+ lo16AllZero:$imm))]>, UnaryDP,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{20} = 0;
+ let Inst{25} = 1;
+}
+
+let Uses = [CPSR] in
+def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
"mov", " $dst, $src, rrx",
[(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP;
@@ -849,11 +966,11 @@ def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
// due to flag operands.
let Defs = [CPSR] in {
-def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
- "mov", "s $dst, $src, lsr #1",
+def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+ IIC_iMOVsi, "mov", "s $dst, $src, lsr #1",
[(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP;
def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
- "mov", "s $dst, $src, asr #1",
+ IIC_iMOVsi, "mov", "s $dst, $src, asr #1",
[(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP;
}
@@ -901,6 +1018,24 @@ defm UXTAH : AI_bin_rrot<0b01101111, "uxtah",
// TODO: UXT(A){B|H}16
+def SBFX : I<(outs GPR:$dst),
+ (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "sbfx", " $dst, $src, $lsb, $width", "", []>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111101;
+ let Inst{6-4} = 0b101;
+}
+
+def UBFX : I<(outs GPR:$dst),
+ (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "ubfx", " $dst, $src, $lsb, $width", "", []>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111111;
+ let Inst{6-4} = 0b101;
+}
+
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
@@ -923,30 +1058,36 @@ defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
// These don't define reg/reg forms, because they are handled above.
def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- "rsb", " $dst, $a, $b",
- [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>;
+ IIC_iALUi, "rsb", " $dst, $a, $b",
+ [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> {
+ let Inst{25} = 1;
+}
def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- "rsb", " $dst, $a, $b",
+ IIC_iALUsr, "rsb", " $dst, $a, $b",
[(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>;
// RSB with 's' bit set.
let Defs = [CPSR] in {
def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
- "rsb", "s $dst, $a, $b",
- [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>;
+ IIC_iALUi, "rsb", "s $dst, $a, $b",
+ [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> {
+ let Inst{25} = 1;
+}
def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
- "rsb", "s $dst, $a, $b",
+ IIC_iALUsr, "rsb", "s $dst, $a, $b",
[(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>;
}
let Uses = [CPSR] in {
def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, "rsc", " $dst, $a, $b",
+ DPFrm, IIC_iALUi, "rsc", " $dst, $a, $b",
[(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+}
def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, "rsc", " $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, "rsc", " $dst, $a, $b",
[(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
Requires<[IsARM, CarryDefIsUnused]>;
}
@@ -954,11 +1095,13 @@ def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
// FIXME: Allow these to be predicated.
let Defs = [CPSR], Uses = [CPSR] in {
def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
- DPFrm, "rscs $dst, $a, $b",
+ DPFrm, IIC_iALUi, "rscs $dst, $a, $b",
[(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
- Requires<[IsARM, CarryDefIsUnused]>;
+ Requires<[IsARM, CarryDefIsUnused]> {
+ let Inst{25} = 1;
+}
def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
- DPSoRegFrm, "rscs $dst, $a, $b",
+ DPSoRegFrm, IIC_iALUsr, "rscs $dst, $a, $b",
[(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
Requires<[IsARM, CarryDefIsUnused]>;
}
@@ -992,16 +1135,27 @@ defm EOR : AsI1_bin_irs<0b0001, "eor",
defm BIC : AsI1_bin_irs<0b1110, "bic",
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
-def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm,
+def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+ AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+ "bfc", " $dst, $imm", "$src = $dst",
+ [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+ Requires<[IsARM, HasV6T2]> {
+ let Inst{27-21} = 0b0111110;
+ let Inst{6-0} = 0b0011111;
+}
+
+def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
"mvn", " $dst, $src",
[(set GPR:$dst, (not GPR:$src))]>, UnaryDP;
def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
- "mvn", " $dst, $src",
+ IIC_iMOVsr, "mvn", " $dst, $src",
[(set GPR:$dst, (not so_reg:$src))]>, UnaryDP;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
- "mvn", " $dst, $imm",
- [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP;
+def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
+ IIC_iMOVi, "mvn", " $dst, $imm",
+ [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP {
+ let Inst{25} = 1;
+}
def : ARMPat<(and GPR:$src, so_imm_not:$imm),
(BICri GPR:$src, so_imm_not:$imm)>;
@@ -1012,43 +1166,48 @@ def : ARMPat<(and GPR:$src, so_imm_not:$imm),
let isCommutable = 1 in
def MUL : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- "mul", " $dst, $a, $b",
+ IIC_iMUL32, "mul", " $dst, $a, $b",
[(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "mla", " $dst, $a, $b, $c",
+ IIC_iMAC32, "mla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
+def MLS : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+ IIC_iMAC32, "mls", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>,
+ Requires<[IsARM, HasV6T2]>;
+
// Extra precision multiplies with low / high results
let neverHasSideEffects = 1 in {
let isCommutable = 1 in {
def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMUL64,
"smull", " $ldst, $hdst, $a, $b", []>;
def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMUL64,
"umull", " $ldst, $hdst, $a, $b", []>;
}
// Multiply + accumulate
def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"smlal", " $ldst, $hdst, $a, $b", []>;
def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"umlal", " $ldst, $hdst, $a, $b", []>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
- (ins GPR:$a, GPR:$b),
+ (ins GPR:$a, GPR:$b), IIC_iMAC64,
"umaal", " $ldst, $hdst, $a, $b", []>,
Requires<[IsARM, HasV6]>;
} // neverHasSideEffects
// Most significant word multiply
def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- "smmul", " $dst, $a, $b",
+ IIC_iMUL32, "smmul", " $dst, $a, $b",
[(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0001;
@@ -1056,7 +1215,7 @@ def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
}
def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "smmla", " $dst, $a, $b, $c",
+ IIC_iMAC32, "smmla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0001;
@@ -1064,7 +1223,7 @@ def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
- "smmls", " $dst, $a, $b, $c",
+ IIC_iMAC32, "smmls", " $dst, $a, $b, $c",
[(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1101;
@@ -1072,7 +1231,7 @@ def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
multiclass AI_smul<string opc, PatFrag opnode> {
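// In these forms, B and T select the bottom and top halfword of an operand
// (e.g. smulbt multiplies the low 16 bits of $a by the high 16 bits of $b),
// and the W forms multiply a full word by a halfword, keeping the upper
// 32 bits of the 48-bit product.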
def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "bb"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "bb"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1081,7 +1240,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "bt"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "bt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1090,7 +1249,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "tb"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "tb"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1099,7 +1258,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "tt"), " $dst, $a, $b",
+ IIC_iMUL32, !strconcat(opc, "tt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1108,7 +1267,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "wb"), " $dst, $a, $b",
+ IIC_iMUL16, !strconcat(opc, "wb"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1117,7 +1276,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
}
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
- !strconcat(opc, "wt"), " $dst, $a, $b",
+ IIC_iMUL16, !strconcat(opc, "wt"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sra GPR:$b, (i32 16))), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1129,7 +1288,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
multiclass AI_smla<string opc, PatFrag opnode> {
def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc,
(opnode (sext_inreg GPR:$a, i16),
(sext_inreg GPR:$b, i16))))]>,
@@ -1139,7 +1298,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1148,7 +1307,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1157,7 +1316,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1166,7 +1325,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1175,7 +1334,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
}
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
- !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+ IIC_iMAC16, !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sra GPR:$b, (i32 16))), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
@@ -1194,7 +1353,7 @@ defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
// Misc. Arithmetic Instructions.
//
-def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src),
+def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"clz", " $dst, $src",
[(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> {
let Inst{7-4} = 0b0001;
@@ -1202,7 +1361,7 @@ def CLZ : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"rev", " $dst, $src",
[(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b0011;
@@ -1210,7 +1369,7 @@ def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
+def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"rev16", " $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, (i32 8)), 0xFF),
@@ -1223,7 +1382,7 @@ def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
let Inst{19-16} = 0b1111;
}
-def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
+def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"revsh", " $dst, $src",
[(set GPR:$dst,
(sext_inreg
@@ -1237,7 +1396,7 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst),
(ins GPR:$src1, GPR:$src2, i32imm:$shamt),
- "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+ IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt",
[(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
(and (shl GPR:$src2, (i32 imm:$shamt)),
0xFFFF0000)))]>,
@@ -1254,7 +1413,7 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
(ins GPR:$src1, GPR:$src2, i32imm:$shamt),
- "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+ IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt",
[(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
(and (sra GPR:$src2, imm16_31:$shamt),
0xFFFF)))]>, Requires<[IsARM, HasV6]> {
@@ -1300,21 +1459,23 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
// FIXME: should be able to write a pattern for ARMcmov, but can't use
// a two-value operand where a dag node expects two operands. :(
def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm,
- "mov", " $dst, $true",
+ IIC_iCMOVr, "mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $dst">, UnaryDP;
def MOVCCs : AI1<0b1101, (outs GPR:$dst),
- (ins GPR:$false, so_reg:$true), DPSoRegFrm,
+ (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr,
"mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $dst">, UnaryDP;
def MOVCCi : AI1<0b1101, (outs GPR:$dst),
- (ins GPR:$false, so_imm:$true), DPFrm,
+ (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi,
"mov", " $dst, $true",
[/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $dst">, UnaryDP;
+ RegConstraint<"$false = $dst">, UnaryDP {
+ let Inst{25} = 1;
+}
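+// These conditional moves implement select: $dst is tied to $false, so the
+// predicated mov only overwrites it with $true when the condition holds.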
//===----------------------------------------------------------------------===//
@@ -1324,14 +1485,14 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
// __aeabi_read_tp preserves the registers r1-r3.
let isCall = 1,
Defs = [R0, R12, LR, CPSR] in {
- def TPsoft : ABXI<0b1011, (outs), (ins),
+ def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br,
"bl __aeabi_read_tp",
[(set R0, ARMthread_pointer)]>;
}
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
-// eh_sjlj_setjmp() is a three instruction sequence to store the return
+// eh_sjlj_setjmp() is an instruction sequence to store the return
// address and save #0 in R0 for the non-longjmp case.
// Since by its nature we may be coming from some other function to get
// here, and we're using the stack frame for the containing function to
@@ -1342,13 +1503,19 @@ let isCall = 1,
// doing so, we also cause the prologue/epilogue code to actively preserve
// all of the callee-saved registers, which is exactly what we want.
let Defs =
- [ R0, R1, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR,
- D0, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15 ] in {
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0,
+ D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+ D31 ] in {
def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src),
- AddrModeNone, SizeSpecial, IndexModeNone, Pseudo,
- "add r0, pc, #4\n\t"
- "str r0, [$src, #+4]\n\t"
- "mov r0, #0 @ eh_setjmp", "",
+ AddrModeNone, SizeSpecial, IndexModeNone,
+ Pseudo, NoItinerary,
+ "str sp, [$src, #+8] @ eh_setjmp begin\n\t"
+ "add r12, pc, #8\n\t"
+ "str r12, [$src, #+4]\n\t"
+ "mov r0, #0\n\t"
+ "add pc, pc, #0\n\t"
+ "mov r0, #1 @ eh_setjmp end", "",
[(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
}
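// In the expansion above, "add r12, pc, #8" computes the address of the
// trailing "mov r0, #1" (ARM reads pc as the current instruction plus 8),
// and "add pc, pc, #0" skips that instruction on the fall-through path, so
// setjmp returns 0 normally and 1 when re-entered via longjmp.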
@@ -1366,25 +1533,36 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
// Two-piece so_imms.
let isReMaterializable = 1 in
-def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), Pseudo,
+def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src),
+ Pseudo, IIC_iMOVi,
"mov", " $dst, $src",
- [(set GPR:$dst, so_imm2part:$src)]>;
+ [(set GPR:$dst, so_imm2part:$src)]>,
+ Requires<[IsARM, NoV6T2]>;
def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
- (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
- (so_imm2part_2 imm:$RHS))>;
+ (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
- (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
- (so_imm2part_2 imm:$RHS))>;
+ (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+ (so_imm2part_2 imm:$RHS))>;
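+// For example, 0x00FF00FF is not a valid so_imm, but it splits into the two
+// rotated-immediate parts 0x00FF0000 and 0x000000FF, so the OR pattern above
+// expands to, e.g., "orr r0, r0, #0x00FF0000" then "orr r0, r0, #0xFF".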
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable. Remove
+// when we can do generalized remat.
+let isReMaterializable = 1 in
+def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
+ "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
+ [(set GPR:$dst, (i32 imm:$src))]>,
+ Requires<[IsARM, HasV6T2]>;
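+// For example, materializing 0x12345678 expands to "movw r0, #0x5678" (which
+// zeroes the upper halfword) followed by "movt r0, #0x1234".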
// TODO: add/sub/and 3-instr forms?
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>,
- Requires<[IsNotDarwin]>;
+ Requires<[IsARM, IsNotDarwin]>;
def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
- Requires<[IsDarwin]>;
+ Requires<[IsARM, IsDarwin]>;
// zextload i1 -> zextload i8
def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a62597b..cd370aa 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -65,8 +65,28 @@ def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
-def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ",
- SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>;
+def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane : SDNode<"ARMISD::VDUPLANE",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>>;
+
+def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
//===----------------------------------------------------------------------===//
// NEON operand definitions
@@ -87,28 +107,409 @@ def addrmode_neonldstm : Operand<i32>,
//===----------------------------------------------------------------------===//
/* TODO: Take advantage of vldm.
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
def VLDMD : NI<(outs),
(ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ IIC_fpLoadm,
"vldm${addr:submode} ${addr:base}, $dst1",
- []>;
+ []> {
+ let Inst{27-25} = 0b110;
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
def VLDMS : NI<(outs),
(ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ IIC_fpLoadm,
"vldm${addr:submode} ${addr:base}, $dst1",
- []>;
+ []> {
+ let Inst{27-25} = 0b110;
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
}
*/
// Use vldmia to load a Q register as a D register pair.
-def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr),
+def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr),
+ IIC_fpLoadm,
"vldmia $addr, ${dst:dregpair}",
- [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>;
+ [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> {
+ let Inst{27-25} = 0b110;
+ let Inst{24} = 0; // P bit
+ let Inst{23} = 1; // U bit
+ let Inst{20} = 1;
+ let Inst{11-9} = 0b101;
+}
// Use vstmia to store a Q register as a D register pair.
-def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
+def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr),
+ IIC_fpStorem,
"vstmia $addr, ${src:dregpair}",
- [(store (v2f64 QPR:$src), GPR:$addr)]>;
+ [(store (v2f64 QPR:$src), addrmode4:$addr)]> {
+ let Inst{27-25} = 0b110;
+ let Inst{24} = 0; // P bit
+ let Inst{23} = 1; // U bit
+ let Inst{20} = 0;
+ let Inst{11-9} = 0b101;
+}
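+// A Q register aliases a pair of consecutive D registers (Q0 = {D0,D1}), so
+// e.g. "vldmia r0, {d0, d1}" fills Q0; the dregpair operand modifier above
+// prints that D-register pair.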
+
+// VLD1 : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+ !strconcat(OpcodeStr, "\t\\{$dst\\}, $addr"), "",
+ [(set DPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+class VLD1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+ !strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"), "",
+ [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+
+def VLD1d8 : VLD1D<0b0000, "vld1.8", v8i8, int_arm_neon_vld1>;
+def VLD1d16 : VLD1D<0b0100, "vld1.16", v4i16, int_arm_neon_vld1>;
+def VLD1d32 : VLD1D<0b1000, "vld1.32", v2i32, int_arm_neon_vld1>;
+def VLD1df : VLD1D<0b1000, "vld1.32", v2f32, int_arm_neon_vld1>;
+def VLD1d64 : VLD1D<0b1100, "vld1.64", v1i64, int_arm_neon_vld1>;
+
+def VLD1q8 : VLD1Q<0b0000, "vld1.8", v16i8, int_arm_neon_vld1>;
+def VLD1q16 : VLD1Q<0b0100, "vld1.16", v8i16, int_arm_neon_vld1>;
+def VLD1q32 : VLD1Q<0b1000, "vld1.32", v4i32, int_arm_neon_vld1>;
+def VLD1qf : VLD1Q<0b1000, "vld1.32", v4f32, int_arm_neon_vld1>;
+def VLD1q64 : VLD1Q<0b1100, "vld1.64", v2i64, int_arm_neon_vld1>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+
+// VLD2 : Vector Load (multiple 2-element structures)
+class VLD2D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b1000,op7_4, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr), IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2\\}, $addr"), "", []>;
+class VLD2Q<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0011,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "", []>;
+
+def VLD2d8 : VLD2D<0b0000, "vld2.8">;
+def VLD2d16 : VLD2D<0b0100, "vld2.16">;
+def VLD2d32 : VLD2D<0b1000, "vld2.32">;
+def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2\\}, $addr", "", []>;
+
+def VLD2q8 : VLD2Q<0b0000, "vld2.8">;
+def VLD2q16 : VLD2Q<0b0100, "vld2.16">;
+def VLD2q32 : VLD2Q<0b1000, "vld2.32">;
+
+// VLD3 : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0100,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr), IIC_VLD3,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"), "", []>;
+class VLD3WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0101,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
+ (ins addrmode6:$addr), IIC_VLD3,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VLD3d8 : VLD3D<0b0000, "vld3.8">;
+def VLD3d16 : VLD3D<0b0100, "vld3.16">;
+def VLD3d32 : VLD3D<0b1000, "vld3.32">;
+def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2,$dst3\\}, $addr", "", []>;
+
+// vld3 to double-spaced even registers.
+def VLD3q8a : VLD3WB<0b0000, "vld3.8">;
+def VLD3q16a : VLD3WB<0b0100, "vld3.16">;
+def VLD3q32a : VLD3WB<0b1000, "vld3.32">;
+
+// vld3 to double-spaced odd registers.
+def VLD3q8b : VLD3WB<0b0000, "vld3.8">;
+def VLD3q16b : VLD3WB<0b0100, "vld3.16">;
+def VLD3q32b : VLD3WB<0b1000, "vld3.32">;
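+// Together the "a" and "b" forms cover a quad-register vld3: the first loads
+// the even D registers and the second the odd ones, with the writeback result
+// ($wb, constrained to $addr.addr) carrying the updated base address between
+// the two loads.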
+
+// VLD4 : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0000,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD4,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "", []>;
+class VLD4WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b10,0b0001,op7_4,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+ (ins addrmode6:$addr), IIC_VLD4,
+ !strconcat(OpcodeStr, "\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VLD4d8 : VLD4D<0b0000, "vld4.8">;
+def VLD4d16 : VLD4D<0b0100, "vld4.16">;
+def VLD4d32 : VLD4D<0b1000, "vld4.32">;
+def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr), IIC_VLD1,
+ "vld1.64\t\\{$dst1,$dst2,$dst3,$dst4\\}, $addr", "", []>;
+
+// vld4 to double-spaced even registers.
+def VLD4q8a : VLD4WB<0b0000, "vld4.8">;
+def VLD4q16a : VLD4WB<0b0100, "vld4.16">;
+def VLD4q32a : VLD4WB<0b1000, "vld4.32">;
+
+// vld4 to double-spaced odd registers.
+def VLD4q8b : VLD4WB<0b0000, "vld4.8">;
+def VLD4q16b : VLD4WB<0b0100, "vld4.16">;
+def VLD4q32b : VLD4WB<0b1000, "vld4.32">;
+
+// VLD1LN : Vector Load (single element to one lane)
+// FIXME: Not yet implemented.
+
+// VLD2LN : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+ IIC_VLD2,
+ !strconcat(OpcodeStr, "\t\\{$dst1[$lane],$dst2[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+
+def VLD2LNd8 : VLD2LN<0b0001, "vld2.8">;
+def VLD2LNd16 : VLD2LN<0b0101, "vld2.16">;
+def VLD2LNd32 : VLD2LN<0b1001, "vld2.32">;
+
+// vld2 to double-spaced even registers.
+def VLD2LNq16a: VLD2LN<0b0101, "vld2.16">;
+def VLD2LNq32a: VLD2LN<0b1001, "vld2.32">;
+
+// vld2 to double-spaced odd registers.
+def VLD2LNq16b: VLD2LN<0b0101, "vld2.16">;
+def VLD2LNq32b: VLD2LN<0b1001, "vld2.32">;
+
+// VLD3LN : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ nohash_imm:$lane), IIC_VLD3,
+ !strconcat(OpcodeStr,
+ "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>;
+
+def VLD3LNd8 : VLD3LN<0b0010, "vld3.8">;
+def VLD3LNd16 : VLD3LN<0b0110, "vld3.16">;
+def VLD3LNd32 : VLD3LN<0b1010, "vld3.32">;
+
+// vld3 to double-spaced even registers.
+def VLD3LNq16a: VLD3LN<0b0110, "vld3.16">;
+def VLD3LNq32a: VLD3LN<0b1010, "vld3.32">;
+
+// vld3 to double-spaced odd registers.
+def VLD3LNq16b: VLD3LN<0b0110, "vld3.16">;
+def VLD3LNq32b: VLD3LN<0b1010, "vld3.32">;
+
+// VLD4LN : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b10,op11_8,0b0000,
+ (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+ nohash_imm:$lane), IIC_VLD4,
+ !strconcat(OpcodeStr,
+ "\t\\{$dst1[$lane],$dst2[$lane],$dst3[$lane],$dst4[$lane]\\}, $addr"),
+ "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>;
+
+def VLD4LNd8 : VLD4LN<0b0011, "vld4.8">;
+def VLD4LNd16 : VLD4LN<0b0111, "vld4.16">;
+def VLD4LNd32 : VLD4LN<0b1011, "vld4.32">;
+
+// vld4 to double-spaced even registers.
+def VLD4LNq16a: VLD4LN<0b0111, "vld4.16">;
+def VLD4LNq32a: VLD4LN<0b1011, "vld4.32">;
+
+// vld4 to double-spaced odd registers.
+def VLD4LNq16b: VLD4LN<0b0111, "vld4.16">;
+def VLD4LNq32b: VLD4LN<0b1011, "vld4.32">;
+
+// VLD1DUP : Vector Load (single element to all lanes)
+// VLD2DUP : Vector Load (single 2-element structure to all lanes)
+// VLD3DUP : Vector Load (single 3-element structure to all lanes)
+// VLD4DUP : Vector Load (single 4-element structure to all lanes)
+// FIXME: Not yet implemented.
+} // mayLoad = 1, hasExtraDefRegAllocReq = 1
+
+// VST1 : Vector Store (multiple single elements)
+class VST1D<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src\\}, $addr"), "",
+ [(IntOp addrmode6:$addr, (Ty DPR:$src))]>;
+class VST1Q<bits<4> op7_4, string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST,
+ !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), "",
+ [(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
+
+let hasExtraSrcRegAllocReq = 1 in {
+def VST1d8 : VST1D<0b0000, "vst1.8", v8i8, int_arm_neon_vst1>;
+def VST1d16 : VST1D<0b0100, "vst1.16", v4i16, int_arm_neon_vst1>;
+def VST1d32 : VST1D<0b1000, "vst1.32", v2i32, int_arm_neon_vst1>;
+def VST1df : VST1D<0b1000, "vst1.32", v2f32, int_arm_neon_vst1>;
+def VST1d64 : VST1D<0b1100, "vst1.64", v1i64, int_arm_neon_vst1>;
+
+def VST1q8 : VST1Q<0b0000, "vst1.8", v16i8, int_arm_neon_vst1>;
+def VST1q16 : VST1Q<0b0100, "vst1.16", v8i16, int_arm_neon_vst1>;
+def VST1q32 : VST1Q<0b1000, "vst1.32", v4i32, int_arm_neon_vst1>;
+def VST1qf : VST1Q<0b1000, "vst1.32", v4f32, int_arm_neon_vst1>;
+def VST1q64 : VST1Q<0b1100, "vst1.64", v2i64, int_arm_neon_vst1>;
+} // hasExtraSrcRegAllocReq
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+
+// VST2 : Vector Store (multiple 2-element structures)
+class VST2D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b1000,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2\\}, $addr"), "", []>;
+class VST2Q<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0011,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "", []>;
+
+def VST2d8 : VST2D<0b0000, "vst2.8">;
+def VST2d16 : VST2D<0b0100, "vst2.16">;
+def VST2d32 : VST2D<0b1000, "vst2.32">;
+def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+ "vst1.64\t\\{$src1,$src2\\}, $addr", "", []>;
+
+def VST2q8 : VST2Q<0b0000, "vst2.8">;
+def VST2q16 : VST2Q<0b0100, "vst2.16">;
+def VST2q32 : VST2Q<0b1000, "vst2.32">;
+
+// VST3 : Vector Store (multiple 3-element structures)
+class VST3D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0100,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), "", []>;
+class VST3WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0101,op7_4, (outs GPR:$wb),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VST3d8 : VST3D<0b0000, "vst3.8">;
+def VST3d16 : VST3D<0b0100, "vst3.16">;
+def VST3d32 : VST3D<0b1000, "vst3.32">;
+def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3),
+ IIC_VST,
+ "vst1.64\t\\{$src1,$src2,$src3\\}, $addr", "", []>;
+
+// vst3 to double-spaced even registers.
+def VST3q8a : VST3WB<0b0000, "vst3.8">;
+def VST3q16a : VST3WB<0b0100, "vst3.16">;
+def VST3q32a : VST3WB<0b1000, "vst3.32">;
+
+// vst3 to double-spaced odd registers.
+def VST3q8b : VST3WB<0b0000, "vst3.8">;
+def VST3q16b : VST3WB<0b0100, "vst3.16">;
+def VST3q32b : VST3WB<0b1000, "vst3.32">;
+
+// VST4 : Vector Store (multiple 4-element structures)
+class VST4D<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0000,op7_4, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "", []>;
+class VST4WB<bits<4> op7_4, string OpcodeStr>
+ : NLdSt<0,0b00,0b0001,op7_4, (outs GPR:$wb),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"),
+ "$addr.addr = $wb", []>;
+
+def VST4d8 : VST4D<0b0000, "vst4.8">;
+def VST4d16 : VST4D<0b0100, "vst4.16">;
+def VST4d32 : VST4D<0b1000, "vst4.32">;
+def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ DPR:$src4), IIC_VST,
+ "vst1.64\t\\{$src1,$src2,$src3,$src4\\}, $addr", "", []>;
+
+// vst4 to double-spaced even registers.
+def VST4q8a : VST4WB<0b0000, "vst4.8">;
+def VST4q16a : VST4WB<0b0100, "vst4.16">;
+def VST4q32a : VST4WB<0b1000, "vst4.32">;
+
+// vst4 to double-spaced odd registers.
+def VST4q8b : VST4WB<0b0000, "vst4.8">;
+def VST4q16b : VST4WB<0b0100, "vst4.16">;
+def VST4q32b : VST4WB<0b1000, "vst4.32">;
+
+// VST1LN : Vector Store (single element from one lane)
+// FIXME: Not yet implemented.
+
+// VST2LN : Vector Store (single 2-element structure from one lane)
+class VST2LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+ IIC_VST,
+ !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"),
+ "", []>;
+
+def VST2LNd8 : VST2LN<0b0000, "vst2.8">;
+def VST2LNd16 : VST2LN<0b0100, "vst2.16">;
+def VST2LNd32 : VST2LN<0b1000, "vst2.32">;
+
+// vst2 to double-spaced even registers.
+def VST2LNq16a: VST2LN<0b0100, "vst2.16">;
+def VST2LNq32a: VST2LN<0b1000, "vst2.32">;
+
+// vst2 to double-spaced odd registers.
+def VST2LNq16b: VST2LN<0b0100, "vst2.16">;
+def VST2LNq32b: VST2LN<0b1000, "vst2.32">;
+
+// VST3LN : Vector Store (single 3-element structure from one lane)
+class VST3LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+ nohash_imm:$lane), IIC_VST,
+ !strconcat(OpcodeStr,
+ "\t\\{$src1[$lane],$src2[$lane],$src3[$lane]\\}, $addr"), "", []>;
+
+def VST3LNd8 : VST3LN<0b0010, "vst3.8">;
+def VST3LNd16 : VST3LN<0b0110, "vst3.16">;
+def VST3LNd32 : VST3LN<0b1010, "vst3.32">;
+
+// vst3 to double-spaced even registers.
+def VST3LNq16a: VST3LN<0b0110, "vst3.16">;
+def VST3LNq32a: VST3LN<0b1010, "vst3.32">;
+
+// vst3 to double-spaced odd registers.
+def VST3LNq16b: VST3LN<0b0110, "vst3.16">;
+def VST3LNq32b: VST3LN<0b1010, "vst3.32">;
+
+// VST4LN : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, string OpcodeStr>
+ : NLdSt<1,0b00,op11_8,0b0000, (outs),
+ (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+ nohash_imm:$lane), IIC_VST,
+ !strconcat(OpcodeStr,
+ "\t\\{$src1[$lane],$src2[$lane],$src3[$lane],$src4[$lane]\\}, $addr"),
+ "", []>;
+
+def VST4LNd8 : VST4LN<0b0011, "vst4.8">;
+def VST4LNd16 : VST4LN<0b0111, "vst4.16">;
+def VST4LNd32 : VST4LN<0b1011, "vst4.32">;
+
+// vst4 to double-spaced even registers.
+def VST4LNq16a: VST4LN<0b0111, "vst4.16">;
+def VST4LNq32a: VST4LN<0b1011, "vst4.32">;
+
+// vst4 to double-spaced odd registers.
+def VST4LNq16b: VST4LN<0b0111, "vst4.16">;
+def VST4LNq32b: VST4LN<0b1011, "vst4.32">;
+
+} // mayStore = 1, hasExtraSrcRegAllocReq = 1
//===----------------------------------------------------------------------===//
@@ -117,18 +518,27 @@ def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
// Extract D sub-registers of Q registers.
// (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6)
-def SubReg_i8_reg : SDNodeXForm<imm, [{
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32);
}]>;
-def SubReg_i16_reg : SDNodeXForm<imm, [{
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32);
}]>;
-def SubReg_i32_reg : SDNodeXForm<imm, [{
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32);
}]>;
-def SubReg_f64_reg : SDNodeXForm<imm, [{
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32);
}]>;
+def DSubReg_f64_other_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + (1 - N->getZExtValue()), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+// (arm_ssubreg_0 is 1; arm_ssubreg_1 is 2; etc.)
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(1 + N->getZExtValue(), MVT::i32);
+}]>;
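+// For example, lane 10 of a v16i8 Q register maps through DSubReg_i8_reg to
+// subreg index 5 + 10/8 = 6, i.e. the odd D register of the pair, and
+// SSubReg_f32_reg maps S lane 2 to subreg index 1 + 2 = 3.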
// Translate lane numbers from Q registers to D subregs.
def SubReg_i8_lane : SDNodeXForm<imm, [{
@@ -150,117 +560,337 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), IIC_VUNAQ, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
+// Basic 2-register operations, scalar single-precision.
+class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src),
+ IIC_VUNAD, !strconcat(OpcodeStr, "\t$dst, $src"), "", []>;
+
+class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst>
+ : NEONFPPat<(ResTy (OpNode SPR:$a)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
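+// Patterns like N2VDsPat run a scalar f32 operation on the NEON unit: the
+// input is inserted into the low S lane of an undefined D register, the
+// vector op executes, and lane 0 of the result is extracted back out.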
+
// Basic 2-register intrinsics, both double- and quad-register.
class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
- bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
- bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+// Basic 2-register intrinsics, scalar single-precision
+class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "", []>;
+
+class N2VDIntsPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
+
// Narrow 2-register intrinsics.
class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
- string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType TyD, ValueType TyQ, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
// Long 2-register intrinsics. (This is currently only used for VMOVL and is
// derived from N2VImm instead of N2V because of the way the size is encoded.)
class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD,
- Intrinsic IntOp>
+ bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr>
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2),
+ (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+ !strconcat(OpcodeStr, "\t$dst1, $dst2"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+ InstrItinClass itin, string OpcodeStr>
+ : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2),
+ (ins QPR:$src1, QPR:$src2), itin,
+ !strconcat(OpcodeStr, "\t$dst1, $dst2"),
+ "$src1 = $dst1, $src2 = $dst2", []>;
+
// Basic 3-register operations, both double- and quad-register.
class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode OpNode, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
+ string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ IIC_VMULi16D,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+
class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
SDNode OpNode, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ IIC_VMULi16Q,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+
+// Basic 3-register operations, scalar single-precision
+class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND,
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", []> {
+ let isCommutable = Commutable;
+}
+class N3VDsPat<SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
// Basic 3-register intrinsics, both double- and quad-register.
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (IntOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, Intrinsic IntOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (Ty DPR:$dst),
+ (Ty (IntOp (Ty DPR:$src1),
+ (Ty (NEONvduplane (Ty DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+
class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]> {
+ let isCommutable = 0;
+}
// Multiply-Add/Sub operations, both double- and quad-register.
class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set DPR:$dst, (Ty (OpNode DPR:$src1,
(Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$src2,
+ (Ty (NEONvduplane (Ty DPR_VFP2:$src3),
+ imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode ShOp>
+ : N3V<0, 1, op21_20, op11_8, 1, 0,
+ (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (Ty DPR:$dst),
+ (Ty (ShOp (Ty DPR:$src1),
+ (Ty (MulOp DPR:$src2,
+ (Ty (NEONvduplane (Ty DPR_8:$src3),
+ imm:$lane)))))))]>;
+
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ InstrItinClass itin, string OpcodeStr, ValueType Ty,
+ SDNode MulOp, SDNode OpNode>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, (Ty (OpNode QPR:$src1,
(Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode MulOp, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$src2,
+ (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+ imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode MulOp, SDNode ShOp>
+ : N3V<1, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (ShOp (ResTy QPR:$src1),
+ (ResTy (MulOp QPR:$src2,
+ (ResTy (NEONvduplane (OpTy DPR_8:$src3),
+ imm:$lane)))))))]>;
+
+// Multiply-Add/Sub operations, scalar single-precision
+class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR_VFP2:$dst),
+ (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", []>;
+
+class N3VDMulOpsPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+ : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+ (EXTRACT_SUBREG
+ (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)),
+ arm_ssubreg_0)>;
// Neon 3-argument intrinsics, both double- and quad-register.
// The destination register is also used as the first source operand register.
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
(OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType ResTy, ValueType OpTy,
- Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
(OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
@@ -268,19 +898,44 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Neon Long 3-argument intrinsic. The destination register is
// a quad-register and is also used as the first source operand register.
class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+ InstrItinClass itin, string OpcodeStr,
+ ValueType TyQ, ValueType TyD, Intrinsic IntOp>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3),
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin,
!strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
[(set QPR:$dst,
(TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$src2),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+ imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst),
+ (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3[$lane]"), "$src1 = $dst",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (ResTy QPR:$src1),
+ (OpTy DPR:$src2),
+ (OpTy (NEONvduplane (OpTy DPR_8:$src3),
+ imm:$lane)))))]>;
+
// Narrowing 3-register intrinsics.
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType TyD, ValueType TyQ,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs DPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
let isCommutable = Commutable;
@@ -288,21 +943,40 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
// Long 3-register intrinsics.
class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
- string OpcodeStr, ValueType TyQ, ValueType TyD,
+ InstrItinClass itin, string OpcodeStr, ValueType TyQ, ValueType TyD,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable;
}
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+ imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, 1, op21_20, op11_8, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+ itin, !strconcat(OpcodeStr, "\t$dst, $src1, $src2[$lane]"), "",
+ [(set (ResTy QPR:$dst),
+ (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy (NEONvduplane (OpTy DPR_8:$src2),
+ imm:$lane)))))]>;
// Wide 3-register intrinsics.
class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
string OpcodeStr, ValueType TyQ, ValueType TyD,
Intrinsic IntOp, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
- (outs QPR:$dst), (ins QPR:$src1, DPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD,
!strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
[(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
let isCommutable = Commutable;
@@ -313,13 +987,13 @@ class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
- (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins DPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
- (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ (ins QPR:$src), IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
// Pairwise long 2-register accumulate intrinsics,
@@ -329,29 +1003,31 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
[(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
: N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ,
!strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
[(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
// Shift by immediate,
// both double- and quad-register.
class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType Ty, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
@@ -360,17 +1036,17 @@ class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4, string OpcodeStr, ValueType ResTy,
ValueType OpTy, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
- (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
(i32 imm:$SIMM))))]>;
// Narrow shift by immediate.
class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
- bit op6, bit op4, string OpcodeStr, ValueType ResTy,
- ValueType OpTy, SDNode OpNode>
+ bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
: N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
- (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
(i32 imm:$SIMM))))]>;
@@ -381,6 +1057,7 @@ class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
(outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set DPR:$dst, (Ty (add DPR:$src1,
(Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
@@ -388,6 +1065,7 @@ class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
(outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ IIC_VPALiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set QPR:$dst, (Ty (add QPR:$src1,
(Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
@@ -398,12 +1076,14 @@ class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
(outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ IIC_VSHLiD,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
(outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ IIC_VSHLiQ,
!strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
[(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
@@ -413,14 +1093,14 @@ class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
Intrinsic IntOp>
: N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
- (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ,
!strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
[(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
@@ -428,50 +1108,68 @@ class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
// Multiclasses
//===----------------------------------------------------------------------===//
+// Abbreviations used in multiclass suffixes:
+// Q = quarter int (8 bit) elements
+// H = half int (16 bit) elements
+// S = single int (32 bit) elements
+// D = double int (64 bit) elements
+
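+// The multiclasses below take InstrItinClass parameters (itinD16, itinD32,
+// itinQ16, itinQ32, etc.); distinct D and Q itineraries such as IIC_VBINiD
+// vs. IIC_VBINiQ let the scheduler model different timings for 64-bit and
+// 128-bit NEON operations. The IIC_* classes themselves are presumably
+// defined with the ARM schedule descriptions (the ARMSchedule*.td files).
+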
// Neon 3-register vector operations.
// First with only element sizes of 8, 16 and 32 bits:
multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, SDNode OpNode, bit Commutable = 0> {
// 64-bit vector types.
- def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i8, v8i8, OpNode, Commutable>;
- def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
- v4i16, v4i16, OpNode, Commutable>;
- def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
- v2i32, v2i32, OpNode, Commutable>;
+ def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, OpNode, Commutable>;
+ def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "16"), v4i16, v4i16, OpNode, Commutable>;
+ def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+ !strconcat(OpcodeStr, "32"), v2i32, v2i32, OpNode, Commutable>;
// 128-bit vector types.
- def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v16i8, v16i8, OpNode, Commutable>;
- def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
- v8i16, v8i16, OpNode, Commutable>;
- def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
- v4i32, v4i32, OpNode, Commutable>;
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, OpNode, Commutable>;
+ def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "16"), v8i16, v8i16, OpNode, Commutable>;
+ def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+ !strconcat(OpcodeStr, "32"), v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D,
+                     !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, !strconcat(OpcodeStr, "16"),
+                       v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q,
+                     !strconcat(OpcodeStr, "32"), v4i32, v2i32, ShOp>;
}
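
// The "SL" multiclasses (N3VSL_HS above and its relatives below) generate
// the scalar ("by lane") forms, where the second source operand is a single
// element, as in "vmul.i16 d0, d1, d2[1]" (illustrative syntax). Lane forms
// exist only for 16- and 32-bit elements, and even the Q-register variants
// read their scalar from a D register; see the EXTRACT_SUBREG patterns
// further below.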
// ....then also with element size 64 bits:
multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
string OpcodeStr, SDNode OpNode, bit Commutable = 0>
- : N3V_QHS<op24, op23, op11_8, op4, OpcodeStr, OpNode, Commutable> {
- def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
- v1i64, v1i64, OpNode, Commutable>;
- def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
- v2i64, v2i64, OpNode, Commutable>;
+ : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+ OpcodeStr, OpNode, Commutable> {
+ def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+ !strconcat(OpcodeStr, "64"), v1i64, v1i64, OpNode, Commutable>;
+ def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+ !strconcat(OpcodeStr, "64"), v2i64, v2i64, OpNode, Commutable>;
}
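
// A defm of these multiclasses concatenates record names, so, for example,
// the VADD definition further below:
//
//   defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ,
//                        "vadd.i", add, 1>;
//
// produces one instruction record per vector type (VADDv8i8, VADDv4i16,
// VADDv2i32, VADDv1i64, plus the Q-register counterparts), and instruction
// selection picks the def whose ValueType matches.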
// Neon Narrowing 2-register vector intrinsics,
// source operand element sizes of 16, 32 and 64 bits:
multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
- bits<5> op11_7, bit op6, bit op4, string OpcodeStr,
+ bits<5> op11_7, bit op6, bit op4,
+ InstrItinClass itin, string OpcodeStr,
Intrinsic IntOp> {
def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
+ itin, !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
+ itin, !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
- !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
+ itin, !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
}
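
// Note that in these narrowing multiclasses the def name gives the result
// type while the assembler suffix names the source element size: the v8i8
// def above assembles with suffix "16" because it narrows v8i16 to v8i8,
// halving the element width while keeping the element count.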
@@ -480,11 +1178,11 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
bit op4, string OpcodeStr, Intrinsic IntOp> {
def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4,
- !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+ IIC_VQUNAiD, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
@@ -492,38 +1190,56 @@ multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
// First with only element sizes of 16 and 32 bits:
multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
// 64-bit vector types.
- def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+  def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16,
+                      !strconcat(OpcodeStr,"16"),
v4i16, v4i16, IntOp, Commutable>;
- def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+  def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32,
+                      !strconcat(OpcodeStr,"32"),
v2i32, v2i32, IntOp, Commutable>;
// 128-bit vector types.
- def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+  def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16,
+                      !strconcat(OpcodeStr,"16"),
v8i16, v8i16, IntOp, Commutable>;
- def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+  def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32,
+                      !strconcat(OpcodeStr,"32"),
v4i32, v4i32, IntOp, Commutable>;
}
+multiclass N3VIntSL_HS<bits<4> op11_8,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, Intrinsic IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          !strconcat(OpcodeStr, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        !strconcat(OpcodeStr, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                          !strconcat(OpcodeStr, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        !strconcat(OpcodeStr, "32"), v4i32, v2i32, IntOp>;
+}
+
// ....then also with element size of 8 bits:
multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i8, v8i8, IntOp, Commutable>;
- def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v16i8, v16i8, IntOp, Commutable>;
+ : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, IntOp, Commutable> {
+ def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp, Commutable>;
+ def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp, Commutable>;
}
// ....then also with element size of 64 bits:
multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VInt_QHS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
- v1i64, v1i64, IntOp, Commutable>;
- def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
- v2i64, v2i64, IntOp, Commutable>;
+ : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+ OpcodeStr, IntOp, Commutable> {
+ def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32,
+ !strconcat(OpcodeStr,"64"), v1i64, v1i64, IntOp, Commutable>;
+ def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32,
+ !strconcat(OpcodeStr,"64"), v2i64, v2i64, IntOp, Commutable>;
}
@@ -544,19 +1260,29 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
// First with only element sizes of 16 and 32 bits:
multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
- def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
- v4i32, v4i16, IntOp, Commutable>;
- def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
- v2i64, v2i32, IntOp, Commutable>;
+ InstrItinClass itin, string OpcodeStr,
+ Intrinsic IntOp, bit Commutable = 0> {
+ def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin,
+ !strconcat(OpcodeStr,"16"), v4i32, v4i16, IntOp, Commutable>;
+ def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin,
+ !strconcat(OpcodeStr,"32"), v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+ InstrItinClass itin, string OpcodeStr, Intrinsic IntOp> {
+ def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
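
// Long ("L") operations widen: Q = D op D with result elements twice the
// source width. For example, the VADDL definition further below,
//
//   defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, "vaddl.s",
//                             int_arm_neon_vaddls, 1>;
//
// yields VADDLsv4i32 (a v4i32 result from two v4i16 operands) and
// VADDLsv2i64, with the QHS variant below adding VADDLsv8i16 from v8i8.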
// ....then also with element size of 8 bits:
multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
- : N3VLInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
- def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
- v8i16, v8i8, IntOp, Commutable>;
+ InstrItinClass itin, string OpcodeStr,
+ Intrinsic IntOp, bit Commutable = 0>
+ : N3VLInt_HS<op24, op23, op11_8, op4, itin, OpcodeStr, IntOp, Commutable> {
+ def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin,
+ !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp, Commutable>;
}
@@ -576,43 +1302,58 @@ multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon Multiply-Op vector operations,
// element sizes of 8, 16 and 32 bits:
multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, SDNode OpNode> {
// 64-bit vector types.
- def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4,
+ def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
!strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>;
- def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4,
+ def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
!strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>;
- def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4,
+ def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
!strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>;
// 128-bit vector types.
- def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4,
+ def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
!strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>;
- def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
!strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>;
- def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4,
+ def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
!strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
}
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, SDNode ShOp> {
+ def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+ !strconcat(OpcodeStr, "16"), v4i16, mul, ShOp>;
+ def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+ !strconcat(OpcodeStr, "32"), v2i32, mul, ShOp>;
+ def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+ !strconcat(OpcodeStr, "16"), v8i16, v4i16, mul, ShOp>;
+ def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+ !strconcat(OpcodeStr, "32"), v4i32, v2i32, mul, ShOp>;
+}
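+
+// In the MulOp multiclasses the pattern composes a multiply with OpNode/ShOp,
+// so instantiating with add gives multiply-accumulate (VMLA below) and with
+// sub gives multiply-subtract (VMLS); the lane patterns further below show
+// the resulting form, e.g. (add src1, (mul src2, dup(lane))).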
// Neon 3-argument intrinsics,
// element sizes of 8, 16 and 32 bits:
multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp> {
// 64-bit vector types.
- def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4,
+ def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
- def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4,
+ def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
- def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4,
+ def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D,
!strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
// 128-bit vector types.
- def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4,
+ def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q,
!strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
- def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q,
!strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
- def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4,
+ def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q,
!strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
}
@@ -622,17 +1363,25 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// First with only element sizes of 16 and 32 bits:
multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp> {
- def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4,
+ def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
- def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4,
+ def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
}
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+ string OpcodeStr, Intrinsic IntOp> {
+ def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+}
+
// ....then also with element size of 8 bits:
multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr, Intrinsic IntOp>
: N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> {
- def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4,
+ def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
!strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
}
@@ -640,23 +1389,24 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon 2-register vector intrinsics,
// element sizes of 8, 16 and 32 bits:
multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
- bits<5> op11_7, bit op4, string OpcodeStr,
- Intrinsic IntOp> {
+ bits<5> op11_7, bit op4,
+ InstrItinClass itinD, InstrItinClass itinQ,
+ string OpcodeStr, Intrinsic IntOp> {
// 64-bit vector types.
def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
+ itinD, !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
+ itinD, !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
+ itinD, !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
// 128-bit vector types.
def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
- !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
+ itinQ, !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
}
@@ -709,25 +1459,25 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
// Neon 2-register vector shift by immediate,
// element sizes of 8, 16, 32 and 64 bits:
multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
- string OpcodeStr, SDNode OpNode> {
+ InstrItinClass itin, string OpcodeStr, SDNode OpNode> {
// 64-bit vector types.
- def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4,
+ def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "8"), v8i8, OpNode>;
- def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4,
+ def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "16"), v4i16, OpNode>;
- def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4,
+ def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "32"), v2i32, OpNode>;
- def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4,
+ def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
!strconcat(OpcodeStr, "64"), v1i64, OpNode>;
// 128-bit vector types.
- def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4,
+ def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "8"), v16i8, OpNode>;
- def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4,
+ def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "16"), v8i16, OpNode>;
- def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4,
+ def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
!strconcat(OpcodeStr, "32"), v4i32, OpNode>;
- def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4,
+ def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
!strconcat(OpcodeStr, "64"), v2i64, OpNode>;
}
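
// All eight defs in N2VSh_QHSD share a single itinerary; the op7 bit
// (0 vs. 1 above) selects the 64-bit element encodings. A typical
// instantiation is the VSHR definition further below:
//
//   defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;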
@@ -790,24 +1540,30 @@ multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
// Vector Add Operations.
// VADD : Vector Add (integer and floating-point)
-defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>;
-def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>;
-def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>;
+defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd.i", add, 1>;
+def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd.f32", v2f32, v2f32, fadd, 1>;
+def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd.f32", v4f32, v4f32, fadd, 1>;
// VADDL : Vector Add Long (Q = D + D)
-defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>;
-defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>;
+defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, "vaddl.s", int_arm_neon_vaddls, 1>;
+defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl.u", int_arm_neon_vaddlu, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>;
defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>;
// VHADD : Vector Halving Add
-defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>;
-defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>;
+defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhadd.s", int_arm_neon_vhadds, 1>;
+defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhadd.u", int_arm_neon_vhaddu, 1>;
// VRHADD : Vector Rounding Halving Add
-defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>;
-defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
+defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vrhadd.s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
// VQADD : Vector Saturating Add
-defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>;
-defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>;
+defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqadd.s", int_arm_neon_vqadds, 1>;
+defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqadd.u", int_arm_neon_vqaddu, 1>;
// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>;
// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
@@ -816,64 +1572,208 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>;
// Vector Multiply Operations.
// VMUL : Vector Multiply (integer, polynomial and floating-point)
-defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>;
-def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8,
+defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q,
+ IIC_VMULi32Q, "vmul.i", mul, 1>;
+def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul.p8", v8i8, v8i8,
int_arm_neon_vmulp, 1>;
-def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8,
+def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul.p8", v16i8, v16i8,
int_arm_neon_vmulp, 1>;
-def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
-def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
+def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul.f32", v2f32, v2f32, fmul, 1>;
+def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul.f32", v4f32, v4f32, fmul, 1>;
+defm VMULsl : N3VSL_HS<0b1000, "vmul.i", mul>;
+def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul.f32", v2f32, fmul>;
+def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul.f32", v4f32, v2f32, fmul>;
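+// The patterns below fold a multiply by a duplicated Q-register lane into
+// the scalar ("sl") instruction: EXTRACT_SUBREG pulls out the D subregister
+// holding the lane (DSubReg_i16_reg / DSubReg_i32_reg compute which half),
+// and SubReg_i16_lane / SubReg_i32_lane renumber the lane within that
+// D register.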
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+ (v4f32 (VMULslfq (v4f32 QPR:$src1),
+ (v2f32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
-defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqdmulh.s", int_arm_neon_vqdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
-defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+ IIC_VMULi16Q, IIC_VMULi32Q,
+ "vqrdmulh.s", int_arm_neon_vqrdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+ (v4i16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+ (v2i32 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
-defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
-def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
+defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls, 1>;
+defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu, 1>;
+def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull.p8", v8i16, v8i8,
int_arm_neon_vmullp, 1>;
+defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull.s", int_arm_neon_vmulls>;
+defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull.u", int_arm_neon_vmullu>;
+
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
-defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
+defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, "vqdmull.s", int_arm_neon_vqdmull>;
// Vector Multiply-Accumulate and Multiply-Subtract Operations.
// VMLA : Vector Multiply Accumulate (integer and floating-point)
-defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
-def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
-def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
+defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>;
+def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>;
+def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla.f32", v4f32, fmul, fadd>;
+defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmla.i", add>;
+def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla.f32", v2f32, fmul, fadd>;
+def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla.f32", v4f32, v2f32, fmul, fadd>;
+
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
+ (fmul (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+ (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
+
+defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal.s", int_arm_neon_vmlals>;
+defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal.u", int_arm_neon_vmlalu>;
+
// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal.s", int_arm_neon_vqdmlal>;
+
// VMLS : Vector Multiply Subtract (integer and floating-point)
-defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
-def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
-def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
+defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>;
+def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>;
+def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls.f32", v4f32, fmul, fsub>;
+defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+ IIC_VMACi16Q, IIC_VMACi32Q, "vmls.i", sub>;
+def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls.f32", v2f32, fmul, fsub>;
+def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls.f32", v4f32, v2f32, fmul, fsub>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+ (mul (v8i16 QPR:$src2),
+ (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1),
+ (v8i16 QPR:$src2),
+ (v4i16 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+ (mul (v4i32 QPR:$src2),
+ (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1),
+ (v4i32 QPR:$src2),
+ (v2i32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
+ (fmul (v4f32 QPR:$src2),
+ (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (VMLSslfq (v4f32 QPR:$src1),
+ (v4f32 QPR:$src2),
+ (v2f32 (EXTRACT_SUBREG QPR:$src3,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
+
+defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl.s", int_arm_neon_vmlsls>;
+defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl.u", int_arm_neon_vmlslu>;
+
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
// Vector Subtract Operations.
// VSUB : Vector Subtract (integer and floating-point)
-defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>;
-def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>;
-def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>;
+defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ, "vsub.i", sub, 0>;
+def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub.f32", v2f32, v2f32, fsub, 0>;
+def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub.f32", v4f32, v4f32, fsub, 0>;
// VSUBL : Vector Subtract Long (Q = D - D)
-defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>;
-defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>;
+defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, "vsubl.s", int_arm_neon_vsubls, 1>;
+defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl.u", int_arm_neon_vsublu, 1>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>;
defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>;
// VHSUB : Vector Halving Subtract
-defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>;
-defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>;
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhsub.s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vhsub.u", int_arm_neon_vhsubu, 0>;
// VQSUB : Vector Saturating Subtract
-defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>;
-defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>;
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqsub.s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vqsub.u", int_arm_neon_vqsubu, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>;
// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
@@ -882,85 +1782,101 @@ defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>;
// Vector Comparisons.
// VCEQ : Vector Compare Equal
-defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>;
-def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
-def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
+defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vceq.i", NEONvceq, 1>;
+def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
+def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
// VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
-def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
+defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcge.s", NEONvcge, 0>;
+defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcge.u", NEONvcgeu, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
+def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
// VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>;
-def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
-def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
+defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcgt.s", NEONvcgt, 0>;
+defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vcgt.u", NEONvcgtu, 0>;
+def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
+def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
-def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32,
+def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge.f32", v2i32, v2f32,
int_arm_neon_vacged, 0>;
-def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32,
+def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge.f32", v4i32, v4f32,
int_arm_neon_vacgeq, 0>;
// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
-def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32,
+def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt.f32", v2i32, v2f32,
int_arm_neon_vacgtd, 0>;
-def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32,
+def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt.f32", v4i32, v4f32,
int_arm_neon_vacgtq, 0>;
// VTST : Vector Test Bits
-defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>;
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vtst.i", NEONvtst, 1>;
// Vector Bitwise Operations.
// VAND : Vector Bitwise AND
-def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>;
-def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>;
+def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>;
+def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", v4i32, v4i32, and, 1>;
// VEOR : Vector Bitwise Exclusive OR
-def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>;
-def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>;
+def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", v2i32, v2i32, xor, 1>;
+def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", v4i32, v4i32, xor, 1>;
// VORR : Vector Bitwise OR
-def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>;
-def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>;
+def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", v2i32, v2i32, or, 1>;
+def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", v4i32, v4i32, or, 1>;
// VBIC : Vector Bitwise Bit Clear (AND NOT)
def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "",
- [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>;
+ (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+ "vbic\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (and DPR:$src1,
+ (vnot_conv DPR:$src2))))]>;
def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "",
- [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>;
+ (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+ "vbic\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (and QPR:$src1,
+ (vnot_conv QPR:$src2))))]>;
// VORN : Vector Bitwise OR NOT
def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "",
- [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>;
+ (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+ "vorn\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (or DPR:$src1,
+ (vnot_conv DPR:$src2))))]>;
def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "",
- [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>;
+ (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+ "vorn\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (or QPR:$src1,
+ (vnot_conv QPR:$src2))))]>;
// VMVN : Vector Bitwise NOT
def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
- (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "",
+ (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD,
+ "vmvn\t$dst, $src", "",
[(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
- (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "",
+ (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD,
+ "vmvn\t$dst, $src", "",
[(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
// VBSL : Vector Bitwise Select
def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
- (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD,
"vbsl\t$dst, $src2, $src3", "$src1 = $dst",
[(set DPR:$dst,
(v2i32 (or (and DPR:$src2, DPR:$src1),
- (and DPR:$src3, (vnot DPR:$src1)))))]>;
+ (and DPR:$src3, (vnot_conv DPR:$src1)))))]>;
def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
- (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ,
"vbsl\t$dst, $src2, $src3", "$src1 = $dst",
[(set QPR:$dst,
(v4i32 (or (and QPR:$src2, QPR:$src1),
- (and QPR:$src3, (vnot QPR:$src1)))))]>;
+ (and QPR:$src3, (vnot_conv QPR:$src1)))))]>;
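+// i.e. a bitwise multiplex: dst = (src2 & mask) | (src3 & ~mask), where the
+// mask is $src1 (tied to $dst).
+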
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst",
@@ -973,16 +1889,18 @@ def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
// Vector Absolute Differences.
// VABD : Vector Absolute Difference
-defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
-defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
-def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
- int_arm_neon_vabdf, 0>;
-def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
- int_arm_neon_vabdf, 0>;
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vabd.s", int_arm_neon_vabds, 0>;
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vabd.u", int_arm_neon_vabdu, 0>;
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND, "vabd.f32", v2f32, v2f32,
+ int_arm_neon_vabds, 0>;
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vabd.f32", v4f32, v4f32,
+ int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
-defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
-defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>;
+defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q, "vabdl.s", int_arm_neon_vabdls, 0>;
+defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q, "vabdl.u", int_arm_neon_vabdlu, 0>;
// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
@@ -995,32 +1913,36 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>;
// Vector Maximum and Minimum.
// VMAX : Vector Maximum
-defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
-defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
-def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
- int_arm_neon_vmaxf, 1>;
-def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
- int_arm_neon_vmaxf, 1>;
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmax.s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmax.u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax.f32", v2f32, v2f32,
+ int_arm_neon_vmaxs, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax.f32", v4f32, v4f32,
+ int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
-defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
-defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
-def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
- int_arm_neon_vminf, 1>;
-def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
- int_arm_neon_vminf, 1>;
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmin.s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+ IIC_VBINi4Q, "vmin.u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin.f32", v2f32, v2f32,
+ int_arm_neon_vmins, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin.f32", v4f32, v4f32,
+ int_arm_neon_vmins, 1>;
// Vector Pairwise Operations.
// VPADD : Vector Pairwise Add
-def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8,
- int_arm_neon_vpaddi, 0>;
-def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16,
- int_arm_neon_vpaddi, 0>;
-def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32,
- int_arm_neon_vpaddi, 0>;
-def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32,
- int_arm_neon_vpaddf, 0>;
+def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd.i8", v8i8, v8i8,
+ int_arm_neon_vpadd, 0>;
+def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd.i16", v4i16, v4i16,
+ int_arm_neon_vpadd, 0>;
+def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd.i32", v2i32, v2i32,
+ int_arm_neon_vpadd, 0>;
+def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd.f32", v2f32, v2f32,
+ int_arm_neon_vpadd, 0>;
// VPADDL : Vector Pairwise Add Long
defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s",
@@ -1035,81 +1957,91 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u",
int_arm_neon_vpadalu>;
// VPMAX : Vector Pairwise Maximum
-def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8,
+def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.s8", v8i8, v8i8,
int_arm_neon_vpmaxs, 0>;
-def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16,
+def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.s16", v4i16, v4i16,
int_arm_neon_vpmaxs, 0>;
-def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32,
+def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.s32", v2i32, v2i32,
int_arm_neon_vpmaxs, 0>;
-def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8,
+def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax.u8", v8i8, v8i8,
int_arm_neon_vpmaxu, 0>;
-def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16,
+def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax.u16", v4i16, v4i16,
int_arm_neon_vpmaxu, 0>;
-def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
+def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax.u32", v2i32, v2i32,
int_arm_neon_vpmaxu, 0>;
-def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
- int_arm_neon_vpmaxf, 0>;
+def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax.f32", v2f32, v2f32,
+ int_arm_neon_vpmaxs, 0>;
// VPMIN : Vector Pairwise Minimum
-def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
+def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.s8", v8i8, v8i8,
int_arm_neon_vpmins, 0>;
-def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16,
+def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.s16", v4i16, v4i16,
int_arm_neon_vpmins, 0>;
-def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32,
+def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.s32", v2i32, v2i32,
int_arm_neon_vpmins, 0>;
-def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8,
+def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin.u8", v8i8, v8i8,
int_arm_neon_vpminu, 0>;
-def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16,
+def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin.u16", v4i16, v4i16,
int_arm_neon_vpminu, 0>;
-def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
+def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin.u32", v2i32, v2i32,
int_arm_neon_vpminu, 0>;
-def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
- int_arm_neon_vpminf, 0>;
+def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin.f32", v2f32, v2f32,
+ int_arm_neon_vpmins, 0>;
// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
// VRECPE : Vector Reciprocal Estimate
-def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAD, "vrecpe.u32",
v2i32, v2i32, int_arm_neon_vrecpe>;
-def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+ IIC_VUNAQ, "vrecpe.u32",
v4i32, v4i32, int_arm_neon_vrecpe>;
-def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v2f32, v2f32, int_arm_neon_vrecpef>;
-def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
- v4f32, v4f32, int_arm_neon_vrecpef>;
+def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAD, "vrecpe.f32",
+ v2f32, v2f32, int_arm_neon_vrecpe>;
+def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+ IIC_VUNAQ, "vrecpe.f32",
+ v4f32, v4f32, int_arm_neon_vrecpe>;
// VRECPS : Vector Reciprocal Step
-def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
+def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSD, "vrecps.f32", v2f32, v2f32,
int_arm_neon_vrecps, 1>;
-def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32,
+def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, IIC_VRECSQ, "vrecps.f32", v4f32, v4f32,
int_arm_neon_vrecps, 1>;
// VRSQRTE : Vector Reciprocal Square Root Estimate
-def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
- v2i32, v2i32, int_arm_neon_vrsqrte>;
-def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
- v4i32, v4i32, int_arm_neon_vrsqrte>;
-def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v2f32, v2f32, int_arm_neon_vrsqrtef>;
-def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
- v4f32, v4f32, int_arm_neon_vrsqrtef>;
+def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAD, "vrsqrte.u32",
+ v2i32, v2i32, int_arm_neon_vrsqrte>;
+def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+ IIC_VUNAQ, "vrsqrte.u32",
+ v4i32, v4i32, int_arm_neon_vrsqrte>;
+def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAD, "vrsqrte.f32",
+ v2f32, v2f32, int_arm_neon_vrsqrte>;
+def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+ IIC_VUNAQ, "vrsqrte.f32",
+ v4f32, v4f32, int_arm_neon_vrsqrte>;
// VRSQRTS : Vector Reciprocal Square Root Step
-def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSD, "vrsqrts.f32", v2f32, v2f32,
int_arm_neon_vrsqrts, 1>;
-def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32,
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, IIC_VRECSQ, "vrsqrts.f32", v4f32, v4f32,
int_arm_neon_vrsqrts, 1>;
// Vector Shifts.
// VSHL : Vector Shift
-defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>;
-defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>;
+defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+ IIC_VSHLiQ, "vshl.s", int_arm_neon_vshifts, 0>;
+defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+ IIC_VSHLiQ, "vshl.u", int_arm_neon_vshiftu, 0>;
// VSHL : Vector Shift Left (Immediate)
-defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>;
+defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLiD, "vshl.i", NEONvshl>;
// VSHR : Vector Shift Right (Immediate)
-defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>;
-defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>;
+defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;
+defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr.u", NEONvshru>;
// VSHLL : Vector Shift Left Long
def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
@@ -1134,86 +2066,90 @@ def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
v2i64, v2i32, NEONvshlli>;
// VSHRN : Vector Shift Right and Narrow
-def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16",
- v8i8, v8i16, NEONvshrn>;
-def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32",
- v4i16, v4i32, NEONvshrn>;
-def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64",
- v2i32, v2i64, NEONvshrn>;
+def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i16", v8i8, v8i16, NEONvshrn>;
+def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i32", v4i16, v4i32, NEONvshrn>;
+def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1,
+ IIC_VSHLiD, "vshrn.i64", v2i32, v2i64, NEONvshrn>;
// VRSHL : Vector Rounding Shift
-defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>;
-defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>;
+defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vrshl.s", int_arm_neon_vrshifts, 0>;
+defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vrshl.u", int_arm_neon_vrshiftu, 0>;
// VRSHR : Vector Rounding Shift Right
-defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, "vrshr.s", NEONvrshrs>;
-defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>;
+defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.s", NEONvrshrs>;
+defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.u", NEONvrshru>;
// VRSHRN : Vector Rounding Shift Right and Narrow
-def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16",
- v8i8, v8i16, NEONvrshrn>;
-def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32",
- v4i16, v4i32, NEONvrshrn>;
-def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64",
- v2i32, v2i64, NEONvrshrn>;
+def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i16", v8i8, v8i16, NEONvrshrn>;
+def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i32", v4i16, v4i32, NEONvrshrn>;
+def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vrshrn.i64", v2i32, v2i64, NEONvrshrn>;
// VQSHL : Vector Saturating Shift
-defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>;
-defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>;
+defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqshl.s", int_arm_neon_vqshifts, 0>;
+defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+ IIC_VSHLi4Q, "vqshl.u", int_arm_neon_vqshiftu, 0>;
// VQSHL : Vector Saturating Shift Left (Immediate)
-defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>;
-defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>;
+defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.s", NEONvqshls>;
+defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.u", NEONvqshlu>;
// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
-defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>;
+defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu.s", NEONvqshlsu>;
// VQSHRN : Vector Saturating Shift Right and Narrow
-def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16",
- v8i8, v8i16, NEONvqshrns>;
-def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32",
- v4i16, v4i32, NEONvqshrns>;
-def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64",
- v2i32, v2i64, NEONvqshrns>;
-def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16",
- v8i8, v8i16, NEONvqshrnu>;
-def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32",
- v4i16, v4i32, NEONvqshrnu>;
-def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64",
- v2i32, v2i64, NEONvqshrnu>;
+def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s16", v8i8, v8i16, NEONvqshrns>;
+def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s32", v4i16, v4i32, NEONvqshrns>;
+def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.s64", v2i32, v2i64, NEONvqshrns>;
+def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u16", v8i8, v8i16, NEONvqshrnu>;
+def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u32", v4i16, v4i32, NEONvqshrnu>;
+def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrn.u64", v2i32, v2i64, NEONvqshrnu>;
// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
-def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16",
- v8i8, v8i16, NEONvqshrnsu>;
-def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32",
- v4i16, v4i32, NEONvqshrnsu>;
-def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64",
- v2i32, v2i64, NEONvqshrnsu>;
+def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s16", v8i8, v8i16, NEONvqshrnsu>;
+def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s32", v4i16, v4i32, NEONvqshrnsu>;
+def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1,
+ IIC_VSHLi4D, "vqshrun.s64", v2i32, v2i64, NEONvqshrnsu>;
// VQRSHL : Vector Saturating Rounding Shift
-defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s",
- int_arm_neon_vqrshifts, 0>;
-defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u",
- int_arm_neon_vqrshiftu, 0>;
+defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D,
+                           IIC_VSHLi4Q, IIC_VSHLi4Q, "vqrshl.s",
+                           int_arm_neon_vqrshifts, 0>;
+defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D,
+                           IIC_VSHLi4Q, IIC_VSHLi4Q, "vqrshl.u",
+                           int_arm_neon_vqrshiftu, 0>;
// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
-def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16",
- v8i8, v8i16, NEONvqrshrns>;
-def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32",
- v4i16, v4i32, NEONvqrshrns>;
-def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64",
- v2i32, v2i64, NEONvqrshrns>;
-def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16",
- v8i8, v8i16, NEONvqrshrnu>;
-def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32",
- v4i16, v4i32, NEONvqrshrnu>;
-def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64",
- v2i32, v2i64, NEONvqrshrnu>;
+def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s16", v8i8, v8i16, NEONvqrshrns>;
+def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s32", v4i16, v4i32, NEONvqrshrns>;
+def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.s64", v2i32, v2i64, NEONvqrshrns>;
+def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u16", v8i8, v8i16, NEONvqrshrnu>;
+def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u32", v4i16, v4i32, NEONvqrshrnu>;
+def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrn.u64", v2i32, v2i64, NEONvqrshrnu>;
// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
-def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16",
- v8i8, v8i16, NEONvqrshrnsu>;
-def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32",
- v4i16, v4i32, NEONvqrshrnsu>;
-def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64",
- v2i32, v2i64, NEONvqrshrnsu>;
+def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s16", v8i8, v8i16, NEONvqrshrnsu>;
+def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s32", v4i16, v4i32, NEONvqrshrnsu>;
+def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1,
+ IIC_VSHLi4D, "vqrshrun.s64", v2i32, v2i64, NEONvqrshrnsu>;
// VSRA : Vector Shift Right and Accumulate
defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
@@ -1230,15 +2166,19 @@ defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>;
// Vector Absolute and Saturating Absolute.
// VABS : Vector Absolute Value
-defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
+defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+ IIC_VUNAiD, IIC_VUNAiQ, "vabs.s",
int_arm_neon_vabs>;
-def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v2f32, v2f32, int_arm_neon_vabsf>;
-def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
- v4f32, v4f32, int_arm_neon_vabsf>;
+def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAD, "vabs.f32",
+ v2f32, v2f32, int_arm_neon_vabs>;
+def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAQ, "vabs.f32",
+ v4f32, v4f32, int_arm_neon_vabs>;
// VQABS : Vector Saturating Absolute Value
-defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
+defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs.s",
int_arm_neon_vqabs>;
// Vector Negate.
@@ -1248,11 +2188,11 @@ def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;
class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
- !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
- !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ IIC_VSHLiD, !strconcat(OpcodeStr, "\t$dst, $src"), "",
[(set QPR:$dst, (Ty (vneg QPR:$src)))]>;
// VNEG : Vector Negate
@@ -1265,10 +2205,12 @@ def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>;
// VNEG : Vector Negate (floating-point)
def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
- (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "",
+ (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD,
+ "vneg.f32\t$dst, $src", "",
[(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
- (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "",
+ (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ,
+ "vneg.f32\t$dst, $src", "",
[(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
@@ -1279,21 +2221,26 @@ def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;
// VQNEG : Vector Saturating Negate
-defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s",
+defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+ IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg.s",
int_arm_neon_vqneg>;
// Vector Bit Counting Operations.
// VCLS : Vector Count Leading Sign Bits
-defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s",
+defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vcls.s",
int_arm_neon_vcls>;
// VCLZ : Vector Count Leading Zeros
-defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i",
+defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+ IIC_VCNTiD, IIC_VCNTiQ, "vclz.i",
int_arm_neon_vclz>;
// VCNT : Vector Count One Bits
-def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiD, "vcnt.8",
v8i8, v8i8, int_arm_neon_vcnt>;
-def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+ IIC_VCNTiQ, "vcnt.8",
v16i8, v16i8, int_arm_neon_vcnt>;
// Vector Move Operations.
@@ -1301,9 +2248,9 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
// VMOV : Vector Move (Register)
def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
- "vmov\t$dst, $src", "", []>;
+ IIC_VMOVD, "vmov\t$dst, $src", "", []>;
def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
- "vmov\t$dst, $src", "", []>;
+ IIC_VMOVD, "vmov\t$dst, $src", "", []>;
// VMOV : Vector Move (Immediate)
@@ -1343,146 +2290,188 @@ def vmovImm64 : PatLeaf<(build_vector), [{
// be encoded based on the immed values.
def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ (ins i8imm:$SIMM), IIC_VMOVImm,
+ "vmov.i8\t$dst, $SIMM", "",
[(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ (ins i8imm:$SIMM), IIC_VMOVImm,
+ "vmov.i8\t$dst, $SIMM", "",
[(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
def VMOVv4i16 : N1ModImm<1, 0b000, 0b1000, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ (ins i16imm:$SIMM), IIC_VMOVImm,
+ "vmov.i16\t$dst, $SIMM", "",
[(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
def VMOVv8i16 : N1ModImm<1, 0b000, 0b1000, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ (ins i16imm:$SIMM), IIC_VMOVImm,
+ "vmov.i16\t$dst, $SIMM", "",
[(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
def VMOVv2i32 : N1ModImm<1, 0b000, 0b0000, 0, 0, 0, 1, (outs DPR:$dst),
- (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ (ins i32imm:$SIMM), IIC_VMOVImm,
+ "vmov.i32\t$dst, $SIMM", "",
[(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
def VMOVv4i32 : N1ModImm<1, 0b000, 0b0000, 0, 1, 0, 1, (outs QPR:$dst),
- (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ (ins i32imm:$SIMM), IIC_VMOVImm,
+ "vmov.i32\t$dst, $SIMM", "",
[(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
- (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ (ins i64imm:$SIMM), IIC_VMOVImm,
+ "vmov.i64\t$dst, $SIMM", "",
[(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
- (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ (ins i64imm:$SIMM), IIC_VMOVImm,
+ "vmov.i64\t$dst, $SIMM", "",
[(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;
// VMOV : Vector Get Lane (move scalar to ARM core register)
def VGETLNs8 : NVGetLane<0b11100101, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".s8\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".s8\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src),
imm:$lane))]>;
def VGETLNs16 : NVGetLane<0b11100001, 0b1011, 0b01,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".s16\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".s16\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src),
imm:$lane))]>;
def VGETLNu8 : NVGetLane<0b11101101, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".u8\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".u8\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src),
imm:$lane))]>;
def VGETLNu16 : NVGetLane<0b11101001, 0b1011, 0b01,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".u16\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".u16\t$dst, $src[$lane]",
[(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src),
imm:$lane))]>;
def VGETLNi32 : NVGetLane<0b11100001, 0b1011, 0b00,
- (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
- "vmov", ".32\t$dst, $src[$lane]",
+ (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+ IIC_VMOVSI, "vmov", ".32\t$dst, $src[$lane]",
[(set GPR:$dst, (extractelt (v2i32 DPR:$src),
imm:$lane))]>;
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
(VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
(VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
(VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
(VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
- (SubReg_i32_reg imm:$lane))),
+ (DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane))>;
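+// Note on the patterns above: lane access on a Q register is split in two,
+// with DSubReg_*_reg in effect selecting the D sub-register that holds the
+// lane (e.g. lane 10 of a v16i8 lives in the high D register) and
+// SubReg_*_lane giving the index within that D register (lane 10 -> lane 2).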
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+ (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+ (SSubReg_f32_reg imm:$src2))>;
//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
-// (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
- (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+ (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
// VMOV : Vector Set Lane (move ARM core register to scalar)
let Constraints = "$src1 = $dst" in {
def VSETLNi8 : NVSetLane<0b11100100, 0b1011, 0b00, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".8\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".8\t$dst[$lane], $src2",
[(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
GPR:$src2, imm:$lane))]>;
def VSETLNi16 : NVSetLane<0b11100000, 0b1011, 0b01, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".16\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".16\t$dst[$lane], $src2",
[(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
GPR:$src2, imm:$lane))]>;
def VSETLNi32 : NVSetLane<0b11100000, 0b1011, 0b00, (outs DPR:$dst),
- (ins DPR:$src1, GPR:$src2, i32imm:$lane),
- "vmov", ".32\t$dst[$lane], $src2",
+ (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+ IIC_VMOVISL, "vmov", ".32\t$dst[$lane], $src2",
[(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
GPR:$src2, imm:$lane))]>;
}
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
(v16i8 (INSERT_SUBREG QPR:$src1,
(VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i8_reg imm:$lane))),
+ (DSubReg_i8_reg imm:$lane))),
GPR:$src2, (SubReg_i8_lane imm:$lane)),
- (SubReg_i8_reg imm:$lane)))>;
+ (DSubReg_i8_reg imm:$lane)))>;
def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
(v8i16 (INSERT_SUBREG QPR:$src1,
(VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i16_reg imm:$lane))),
+ (DSubReg_i16_reg imm:$lane))),
GPR:$src2, (SubReg_i16_lane imm:$lane)),
- (SubReg_i16_reg imm:$lane)))>;
+ (DSubReg_i16_reg imm:$lane)))>;
def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
(v4i32 (INSERT_SUBREG QPR:$src1,
(VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
- (SubReg_i32_reg imm:$lane))),
+ (DSubReg_i32_reg imm:$lane))),
GPR:$src2, (SubReg_i32_lane imm:$lane)),
- (SubReg_i32_reg imm:$lane)))>;
+ (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+ (INSERT_SUBREG (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2),
+ SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
-// (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
- (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+ (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+def : Pat<(v2f64 (scalar_to_vector DPR:$src)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, arm_dsubreg_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+ arm_dsubreg_0)>;
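+// The Q-register cases above build the value in a D register first (lane 0
+// of an IMPLICIT_DEF), then place that D register in the low half of an
+// undefined Q register via arm_dsubreg_0.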
// VDUP : Vector Duplicate (from ARM core register to all elements)
-def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
-}]>;
-
class VDUPD<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
- "vdup", !strconcat(asmSize, "\t$dst, $src"),
- [(set DPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+ IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
class VDUPQ<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
- "vdup", !strconcat(asmSize, "\t$dst, $src"),
- [(set QPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+ IIC_VMOVIS, "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
def VDUP8d : VDUPD<0b11101100, 0b00, ".8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, ".16", v4i16>;
@@ -1492,45 +2481,28 @@ def VDUP16q : VDUPQ<0b11101010, 0b01, ".16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, ".32", v4i32>;
def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
- "vdup", ".32\t$dst, $src",
- [(set DPR:$dst, (v2f32 (splat_lo
- (scalar_to_vector
- (f32 (bitconvert GPR:$src))),
- undef)))]>;
+ IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+ [(set DPR:$dst, (v2f32 (NEONvdup
+ (f32 (bitconvert GPR:$src)))))]>;
def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
- "vdup", ".32\t$dst, $src",
- [(set QPR:$dst, (v4f32 (splat_lo
- (scalar_to_vector
- (f32 (bitconvert GPR:$src))),
- undef)))]>;
+ IIC_VMOVIS, "vdup", ".32\t$dst, $src",
+ [(set QPR:$dst, (v4f32 (NEONvdup
+ (f32 (bitconvert GPR:$src)))))]>;
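+// Illustration: "vdup.32 d0, r1" (hypothetical operands) replicates r1 into
+// both 32-bit lanes of d0; matching the target-specific NEONvdup node avoids
+// the vector_shuffle-based splat_lo PatFrag removed above.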
// VDUP : Vector Duplicate Lane (from scalar to all elements)
-def SHUFFLE_get_splat_lane : SDNodeXForm<vector_shuffle, [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return CurDAG->getTargetConstant(SVOp->getSplatIndex(), MVT::i32);
-}]>;
-
-def splat_lane : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return SVOp->isSplat();
-}], SHUFFLE_get_splat_lane>;
-
class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
: N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
- (outs DPR:$dst), (ins DPR:$src, i32imm:$lane),
+ (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
!strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
- [(set DPR:$dst, (Ty (splat_lane:$lane DPR:$src, undef)))]>;
+ [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
-// vector_shuffle requires that the source and destination types match, so
-// VDUP to a 128-bit result uses a target-specific VDUPLANEQ node.
class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
ValueType ResTy, ValueType OpTy>
: N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
- (outs QPR:$dst), (ins DPR:$src, i32imm:$lane),
+ (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
!strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
- [(set QPR:$dst, (ResTy (NEONvduplaneq (OpTy DPR:$src), imm:$lane)))]>;
+ [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;
def VDUPLN8d : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
@@ -1541,15 +2513,51 @@ def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
def VDUPLNfq : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+ (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+ (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+ (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+ (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane)))>;
+
+def VDUPfdf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 0, 0,
+ (outs DPR:$dst), (ins SPR:$src),
+ IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+ [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def VDUPfqf : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 1, 0,
+ (outs QPR:$dst), (ins SPR:$src),
+ IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+ [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
+ (INSERT_SUBREG QPR:$src,
+ (i64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+ (DSubReg_f64_other_reg imm:$lane))>;
+def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)),
+ (INSERT_SUBREG QPR:$src,
+ (f64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+ (DSubReg_f64_other_reg imm:$lane))>;
+
// VMOVN : Vector Narrowing Move
-defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, "vmovn.i",
+defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn.i",
int_arm_neon_vmovn>;
// VQMOVN : Vector Saturating Narrowing Move
-defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, "vqmovn.s",
+defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, "vqmovn.s",
int_arm_neon_vqmovns>;
-defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, "vqmovn.u",
+defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, "vqmovn.u",
int_arm_neon_vqmovnu>;
-defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, "vqmovun.s",
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun.s",
int_arm_neon_vqmovnsu>;
// VMOVL : Vector Lengthening Move
defm VMOVLs : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
@@ -1597,6 +2605,247 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+// Vector Reverse.
+
+// VREV64 : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>;
+
+def VREV64d8 : VREV64D<0b00, "vrev64.8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64.16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64.32", v2i32>;
+def VREV64df : VREV64D<0b10, "vrev64.32", v2f32>;
+
+def VREV64q8 : VREV64Q<0b00, "vrev64.8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64.16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64.32", v4i32>;
+def VREV64qf : VREV64Q<0b10, "vrev64.32", v4f32>;
+
+// VREV32 : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>;
+
+def VREV32d8 : VREV32D<0b00, "vrev32.8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32.16", v4i16>;
+
+def VREV32q8 : VREV32Q<0b00, "vrev32.8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32.16", v8i16>;
+
+// VREV16 : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+ (ins DPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+ (ins QPR:$src), IIC_VMOVD,
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>;
+
+def VREV16d8 : VREV16D<0b00, "vrev16.8", v8i8>;
+def VREV16q8 : VREV16Q<0b00, "vrev16.8", v16i8>;
+
+// Other Vector Shuffles.
+
+// VEXT : Vector Extract
+
+class VEXTd<string OpcodeStr, ValueType Ty>
+ : N3V<0,1,0b11,0b0000,0,0, (outs DPR:$dst),
+ (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+ !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+ [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+ (Ty DPR:$rhs), imm:$index)))]>;
+
+class VEXTq<string OpcodeStr, ValueType Ty>
+ : N3V<0,1,0b11,0b0000,1,0, (outs QPR:$dst),
+ (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+ !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+ [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+ (Ty QPR:$rhs), imm:$index)))]>;
+
+def VEXTd8 : VEXTd<"vext.8", v8i8>;
+def VEXTd16 : VEXTd<"vext.16", v4i16>;
+def VEXTd32 : VEXTd<"vext.32", v2i32>;
+def VEXTdf : VEXTd<"vext.32", v2f32>;
+
+def VEXTq8 : VEXTq<"vext.8", v16i8>;
+def VEXTq16 : VEXTq<"vext.16", v8i16>;
+def VEXTq32 : VEXTq<"vext.32", v4i32>;
+def VEXTqf : VEXTq<"vext.32", v4f32>;
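+// Semantics recap (architectural): "vext.8 d0, d1, d2, #3" (hypothetical
+// operands) yields bytes 3..10 of the 16-byte value formed by d1 (low half)
+// and d2 (high half), i.e. a byte-granular window across the register pair.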
+
+// VTRN : Vector Transpose
+
+def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn.8">;
+def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn.16">;
+def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn.32">;
+
+def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn.8">;
+def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn.16">;
+def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn.32">;
+
+// VUZP : Vector Unzip (Deinterleave)
+
+def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp.8">;
+def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp.16">;
+def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp.32">;
+
+def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp.8">;
+def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp.16">;
+def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp.32">;
+
+// VZIP : Vector Zip (Interleave)
+
+def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip.8">;
+def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip.16">;
+def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip.32">;
+
+def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip.8">;
+def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip.16">;
+def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip.32">;
+
+// Vector Table Lookup and Table Extension.
+
+// VTBL : Vector Table Lookup
+def VTBL1
+ : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$src), IIC_VTB1,
+ "vtbl.8\t$dst, \\{$tbl1\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBL2
+ : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2
+ DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBL3
+ : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3
+ DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBL4
+ : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst),
+ (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4,
+ "vtbl.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2,
+ DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
+
+// VTBX : Vector Table Extension
+def VTBX1
+ : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1,
+ "vtbx.8\t$dst, \\{$tbl1\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1
+ DPR:$orig, DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def VTBX2
+ : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2
+ DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def VTBX3
+ : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst),
+ (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def VTBX4
+ : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4,
+ "vtbx.8\t$dst, \\{$tbl1,$tbl2,$tbl3,$tbl4\\}, $src", "$orig = $dst",
+ [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1,
+ DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
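+// Semantics recap (architectural): each byte of $src indexes into the byte
+// table formed by the $tbl registers. Out-of-range indices write 0 for vtbl
+// but leave the destination byte unchanged for vtbx, which is why vtbx ties
+// "$orig = $dst".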
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+// These need separate instructions because they must use the DPR_VFP2
+// register class, which has SPR sub-registers.
+
+// Vector Add Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd,1>;
+def : N3VDsPat<fadd, VADDfd_sfp>;
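+// Sketch (an assumption about N3VDsPat, whose class definition lives
+// elsewhere in this file): the scalar pattern presumably wraps each SPR
+// operand into lane 0 of an undefined v2f32 and unwraps the result, roughly:
+//   def : Pat<(f32 (fadd SPR:$a, SPR:$b)),
+//           (EXTRACT_SUBREG
+//             (VADDfd_sfp (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+//                                        SPR:$a, arm_ssubreg_0),
+//                         (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+//                                        SPR:$b, arm_ssubreg_0)),
+//             arm_ssubreg_0)>;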
+
+// Vector Sub Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>;
+def : N3VDsPat<fsub, VSUBfd_sfp>;
+
+// Vector Multiply Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>;
+def : N3VDsPat<fmul, VMULfd_sfp>;
+
+// Vector Multiply-Accumulate/Subtract used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla.f32",
+                            v2f32, fmul, fadd>;
+def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls.f32",
+                            v2f32, fmul, fsub>;
+def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>;
+
+// Vector Absolute used for single-precision FP
+let neverHasSideEffects = 1 in
+def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+ IIC_VUNAD, "vabs.f32",
+ v2f32, v2f32, int_arm_neon_vabs>;
+def : N2VDIntsPat<fabs, VABSfd_sfp>;
+
+// Vector Negate used for single-precision FP
+let neverHasSideEffects = 1 in
+def VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+ (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
+ "vneg.f32\t$dst, $src", "", []>;
+def : N2VDIntsPat<fneg, VNEGf32d_sfp>;
+
+// Vector Convert between single-precision FP and integer
+let neverHasSideEffects = 1 in
+def VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+ v2i32, v2f32, fp_to_sint>;
+def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+ v2i32, v2f32, fp_to_uint>;
+def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTs2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+ v2f32, v2i32, sint_to_fp>;
+def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
+
+let neverHasSideEffects = 1 in
+def VCVTu2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+ v2f32, v2i32, uint_to_fp>;
+def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 904d9b1..9816add 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -117,86 +117,150 @@ def t_addrmode_sp : Operand<i32>,
let Defs = [SP], Uses = [SP] in {
def tADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
"@ tADJCALLSTACKUP $amt1",
- [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb]>;
+ [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>;
def tADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt),
+PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
"@ tADJCALLSTACKDOWN $amt",
- [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb]>;
+ [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>;
}
+// For both Thumb1 and Thumb2.
let isNotDuplicable = 1 in
-def tPICADD : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
- "$cp:\n\tadd $dst, pc",
- [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr,
+ "\n$cp:\n\tadd $dst, pc",
+ [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>;
// PC relative add.
-def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs),
+def tADDrPCi : T1I<(outs tGPR:$dst), (ins i32imm:$rhs), IIC_iALUi,
"add $dst, pc, $rhs * 4", []>;
// ADD rd, sp, #imm8
-// FIXME: hard code sp?
-def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs),
- "add $dst, $sp, $rhs * 4 @ addrspi", []>;
+def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, i32imm:$rhs), IIC_iALUi,
+ "add $dst, $sp, $rhs * 4", []>;
// ADD sp, sp, #imm7
-// FIXME: hard code sp?
-def tADDspi : T1It<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
"add $dst, $rhs * 4", []>;
-// FIXME: Make use of the following?
-// ADD rm, sp, rm
+// SUB sp, sp, #imm7
+def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub $dst, $rhs * 4", []>;
+
+// ADD rm, sp
+def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add $dst, $rhs", []>;
+
// ADD sp, rm
+def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add $dst, $rhs", []>;
+
+// Pseudo instruction that will expand into a tSUBspi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+ NoItinerary, "@ sub $dst, $rhs * 4", []>;
+
+def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+ NoItinerary, "@ add $dst, $rhs", []>;
+
+let Defs = [CPSR] in
+def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+ NoItinerary, "@ and $dst, $rhs", []>;
+} // usesCustomDAGSchedInserter
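+// Expansion sketch (assumed): tSUBspi is two-address ($dst tied to $lhs), so
+// the pseudo's custom inserter presumably emits a register copy followed by
+// the real instruction when $lhs != $dst:
+//   mov $dst, $lhs
+//   sub $dst, $rhs * 4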
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
-let isReturn = 1, isTerminator = 1 in {
- def tBX_RET : TI<(outs), (ins), "bx lr", [(ARMretflag)]>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ def tBX_RET : TI<(outs), (ins), IIC_Br, "bx lr", [(ARMretflag)]>;
// Alternative return instruction used by vararg functions.
- def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), "bx $target", []>;
+ def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx $target", []>;
}
// FIXME: remove when we have a way of marking an MI with these properties.
-let isReturn = 1, isTerminator = 1 in
-def tPOP_RET : TI<(outs reglist:$dst1, variable_ops), (ins),
- "pop $dst1", []>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "pop${p} $wb", []>;
let isCall = 1,
- Defs = [R0, R1, R2, R3, LR,
- D0, D1, D2, D3, D4, D5, D6, D7] in {
- def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ Defs = [R0, R1, R2, R3, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+ // Also used for Thumb2
+ def tBL : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
"bl ${func:call}",
- [(ARMtcall tglobaladdr:$func)]>;
- // ARMv5T and above
- def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops),
+ [(ARMtcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, IsNotDarwin]>;
+
+ // ARMv5T and above, also used for Thumb2
+ def tBLXi : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
"blx ${func:call}",
- [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>;
- def tBLXr : TI<(outs), (ins tGPR:$func, variable_ops),
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+
+ // Also used for Thumb2
+ def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
+ "blx $func",
+ [(ARMtcall GPR:$func)]>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+
+ // ARMv4T
+ def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br,
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsThumb1Only, IsNotDarwin]>;
+}
+
+// On Darwin, R9 is call-clobbered.
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, R9, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+ // Also used for Thumb2
+ def tBLr9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
+ "bl ${func:call}",
+ [(ARMtcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, IsDarwin]>;
+
+ // ARMv5T and above, also used for Thumb2
+ def tBLXi_r9 : TIx2<(outs), (ins i32imm:$func, variable_ops), IIC_Br,
+ "blx ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
+
+ // Also used for Thumb2
+ def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
"blx $func",
- [(ARMtcall tGPR:$func)]>, Requires<[HasV5T]>;
+ [(ARMtcall GPR:$func)]>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
+
// ARMv4T
- def tBX : TIx2<(outs), (ins tGPR:$func, variable_ops),
- "cpy lr, pc\n\tbx $func",
- [(ARMcall_nolink tGPR:$func)]>;
+ def tBXr9 : TIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br,
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink tGPR:$func)]>,
+ Requires<[IsThumb1Only, IsDarwin]>;
}
let isBranch = 1, isTerminator = 1 in {
let isBarrier = 1 in {
let isPredicable = 1 in
- def tB : T1I<(outs), (ins brtarget:$target), "b $target",
- [(br bb:$target)]>;
+ def tB : T1I<(outs), (ins brtarget:$target), IIC_Br,
+ "b $target", [(br bb:$target)]>;
// Far jump
- def tBfar : T1Ix2<(outs), (ins brtarget:$target),
+ let Defs = [LR] in
+ def tBfar : TIx2<(outs), (ins brtarget:$target), IIC_Br,
"bl $target\t@ far jump",[]>;
def tBR_JTr : T1JTI<(outs),
(ins tGPR:$target, jtblock_operand:$jt, i32imm:$id),
- "cpy pc, $target \n\t.align\t2\n$jt",
+ IIC_Br, "mov pc, $target\n\t.align\t2\n$jt",
[(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>;
}
}
@@ -204,7 +268,8 @@ let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
- def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), "b$cc $target",
+ def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br,
+ "b$cc $target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>;
//===----------------------------------------------------------------------===//
@@ -212,384 +277,363 @@ let isBranch = 1, isTerminator = 1 in
//
let canFoldAsLoad = 1 in
-def tLDR : T1I4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr),
- "ldr $dst, $addr",
+def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load t_addrmode_s4:$addr))]>;
-def tLDRB : T1I1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr),
- "ldrb $dst, $addr",
+def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
+ "ldrb", " $dst, $addr",
[(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>;
-def tLDRH : T1I2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr),
- "ldrh $dst, $addr",
+def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
+ "ldrh", " $dst, $addr",
[(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>;
-def tLDRSB : T1I1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
- "ldrsb $dst, $addr",
+let AddedComplexity = 10 in
+def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+ "ldrsb", " $dst, $addr",
[(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
-def tLDRSH : T1I2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr),
- "ldrsh $dst, $addr",
+let AddedComplexity = 10 in
+def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+ "ldrsh", " $dst, $addr",
[(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
let canFoldAsLoad = 1 in
-def tLDRspi : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
- "ldr $dst, $addr",
+def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load t_addrmode_sp:$addr))]>;
// Special instruction for restore. It cannot clobber the condition register
// when it is expanded by eliminateCallFramePseudoInstr().
let canFoldAsLoad = 1, mayLoad = 1 in
-def tRestore : T1Is<(outs tGPR:$dst), (ins t_addrmode_sp:$addr),
- "ldr $dst, $addr", []>;
+def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr", []>;
// Load tconstpool
let canFoldAsLoad = 1 in
-def tLDRpci : T1Is<(outs tGPR:$dst), (ins i32imm:$addr),
- "ldr $dst, $addr",
+def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr",
[(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in
-def tLDRcp : T1Is<(outs tGPR:$dst), (ins i32imm:$addr),
- "ldr $dst, $addr", []>;
+def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ "ldr", " $dst, $addr", []>;
-def tSTR : T1I4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr),
- "str $src, $addr",
+def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
+ "str", " $src, $addr",
[(store tGPR:$src, t_addrmode_s4:$addr)]>;
-def tSTRB : T1I1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr),
- "strb $src, $addr",
+def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
+ "strb", " $src, $addr",
[(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>;
-def tSTRH : T1I2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr),
- "strh $src, $addr",
+def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
+ "strh", " $src, $addr",
[(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>;
-def tSTRspi : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
- "str $src, $addr",
+def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+ "str", " $src, $addr",
[(store tGPR:$src, t_addrmode_sp:$addr)]>;
let mayStore = 1 in {
// Special instruction for spill. It cannot clobber the condition register
// when it is expanded by eliminateCallFramePseudoInstr().
-def tSpill : T1Is<(outs), (ins tGPR:$src, t_addrmode_sp:$addr),
- "str $src, $addr", []>;
+def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+ "str", " $src, $addr", []>;
}
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-// TODO: A7-44: LDMIA - load multiple
+// These require the base register either to be written back or to be one of
+// the loaded regs.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDM : T1I<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iLoadm,
+ "ldm${addr:submode}${p} $addr, $wb", []>;
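+// Hypothetical examples of the two legal Thumb1 forms:
+//   ldmia r4!, {r0, r1, r2}   @ base written back
+//   ldmia r4,  {r0, r1, r4}   @ base is one of the loaded regs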
-let mayLoad = 1 in
-def tPOP : TI<(outs reglist:$dst1, variable_ops), (ins),
- "pop $dst1", []>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTM : T1I<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iStorem,
+ "stm${addr:submode}${p} $addr, $wb", []>;
-let mayStore = 1 in
-def tPUSH : TI<(outs), (ins reglist:$src1, variable_ops),
- "push $src1", []>;
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "pop${p} $wb", []>;
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+ "push${p} $wb", []>;
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
// Add with carry register
-let isCommutable = 1, Defs = [CPSR], Uses = [CPSR] in
-def tADCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "adc $dst, $rhs",
- [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1, Uses = [CPSR] in
+def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "adc", " $dst, $rhs",
+ [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>;
// Add immediate
-let Defs = [CPSR] in {
-def tADDi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>;
-def tADDSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7:$rhs))]>;
-}
+def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "add", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>;
-let Defs = [CPSR] in {
-def tADDi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>;
-def tADDSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "add $dst, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255:$rhs))]>;
-}
+def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "add", " $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>;
// Add register
-let isCommutable = 1, Defs = [CPSR] in {
-def tADDrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;
-def tADDSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "add $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, tGPR:$rhs))]>;
-}
+let isCommutable = 1 in
+def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "add", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>;
let neverHasSideEffects = 1 in
-def tADDhirr : T1It<(outs tGPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- "add $dst, $rhs @ addhirr", []>;
+def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ "add", " $dst, $rhs", []>;
// And register
-let isCommutable = 1, Defs = [CPSR] in
-def tAND : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "and $dst, $rhs",
- [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "and", " $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>;
// ASR immediate
-let Defs = [CPSR] in
-def tASRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "asr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "asr", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
// ASR register
-let Defs = [CPSR] in
-def tASRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "asr $dst, $rhs",
- [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>;
+def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "asr", " $dst, $rhs",
+ [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>;
// BIC register
-let Defs = [CPSR] in
-def tBIC : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "bic $dst, $rhs",
- [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>;
+def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "bic", " $dst, $rhs",
+ [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>;
// CMN register
let Defs = [CPSR] in {
-def tCMN : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmn $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
-def tCMNZ : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmn $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>;
+def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmn", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+def tCMNZ : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmn", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>;
}
// CMP immediate
let Defs = [CPSR] in {
-def tCMPi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>;
-def tCMPZi8 : T1I<(outs), (ins tGPR:$lhs, i32imm:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>;
+def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>;
+def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>;
}
// CMP register
let Defs = [CPSR] in {
-def tCMPr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>;
-def tCMPZr : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "cmp $lhs, $rhs",
- [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>;
+def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>;
+def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs",
+ [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs", []>;
+def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ "cmp", " $lhs, $rhs", []>;
}
-// TODO: A7-37: CMP(3) - cmp hi regs
// XOR register
-let isCommutable = 1, Defs = [CPSR] in
-def tEOR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "eor $dst, $rhs",
- [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "eor", " $dst, $rhs",
+ [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>;
// LSL immediate
-let Defs = [CPSR] in
-def tLSLri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "lsl $dst, $lhs, $rhs",
- [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "lsl", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
// LSL register
-let Defs = [CPSR] in
-def tLSLrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "lsl $dst, $rhs",
- [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>;
+def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "lsl", " $dst, $rhs",
+ [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>;
// LSR immediate
-let Defs = [CPSR] in
-def tLSRri : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "lsr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
+def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ "lsr", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
// LSR register
-let Defs = [CPSR] in
-def tLSRrr : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "lsr $dst, $rhs",
- [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>;
+def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "lsr", " $dst, $rhs",
+ [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>;
// move register
-let Defs = [CPSR] in
-def tMOVi8 : T1I<(outs tGPR:$dst), (ins i32imm:$src),
- "mov $dst, $src",
- [(set tGPR:$dst, imm0_255:$src)]>;
+def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+ "mov", " $dst, $src",
+ [(set tGPR:$dst, imm0_255:$src)]>;
// TODO: A7-73: MOV(2) - mov setting flag.
-// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy',
-// which is MOV(3). This also supports high registers.
let neverHasSideEffects = 1 in {
-def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "cpy $dst, $src", []>;
-def tMOVhir2lor : T1I<(outs tGPR:$dst), (ins GPR:$src),
- "cpy $dst, $src\t@ hir2lor", []>;
-def tMOVlor2hir : T1I<(outs GPR:$dst), (ins tGPR:$src),
- "cpy $dst, $src\t@ lor2hir", []>;
-def tMOVhir2hir : T1I<(outs GPR:$dst), (ins GPR:$src),
- "cpy $dst, $src\t@ hir2hir", []>;
+// FIXME: Make this predicable.
+def tMOVr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+let Defs = [CPSR] in
+def tMOVSr : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "movs $dst, $src", []>;
+
+// FIXME: Make these predicable.
+def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
+def tMOVgpr2gpr : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov $dst, $src", []>;
} // neverHasSideEffects
// multiply register
-let isCommutable = 1, Defs = [CPSR] in
-def tMUL : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "mul $dst, $rhs",
- [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32,
+ "mul", " $dst, $rhs",
+ [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
// move inverse register
-let Defs = [CPSR] in
-def tMVN : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "mvn $dst, $src",
- [(set tGPR:$dst, (not tGPR:$src))]>;
-
-// negate register
-let Defs = [CPSR] in
-def tNEG : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "neg $dst, $src",
- [(set tGPR:$dst, (ineg tGPR:$src))]>;
+def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+ "mvn", " $dst, $src",
+ [(set tGPR:$dst, (not tGPR:$src))]>;
// bitwise or register
-let isCommutable = 1, Defs = [CPSR] in
-def tORR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "orr $dst, $rhs",
- [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>;
+let isCommutable = 1 in
+def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "orr", " $dst, $rhs",
+ [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>;
// swaps
-def tREV : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "rev $dst, $src",
- [(set tGPR:$dst, (bswap tGPR:$src))]>,
- Requires<[IsThumb, HasV6]>;
-
-def tREV16 : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "rev16 $dst, $src",
- [(set tGPR:$dst,
- (or (and (srl tGPR:$src, (i32 8)), 0xFF),
- (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
- (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
- (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
- Requires<[IsThumb, HasV6]>;
-
-def tREVSH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "revsh $dst, $src",
- [(set tGPR:$dst,
- (sext_inreg
- (or (srl (and tGPR:$src, 0xFFFF), (i32 8)),
- (shl tGPR:$src, (i32 8))), i16))]>,
- Requires<[IsThumb, HasV6]>;
+def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "rev", " $dst, $src",
+ [(set tGPR:$dst, (bswap tGPR:$src))]>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "rev16", " $dst, $src",
+ [(set tGPR:$dst,
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF),
+ (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
+ (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "revsh", " $dst, $src",
+ [(set tGPR:$dst,
+ (sext_inreg
+ (or (srl (and tGPR:$src, 0xFF00), (i32 8)),
+ (shl tGPR:$src, (i32 8))), i16))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// rotate right register
-let Defs = [CPSR] in
-def tROR : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "ror $dst, $rhs",
- [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>;
+def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+ "ror", " $dst, $rhs",
+ [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>;
+
+// negate register
+def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi,
+ "rsb", " $dst, $src, #0",
+ [(set tGPR:$dst, (ineg tGPR:$src))]>;
// Subtract with carry register
-let Defs = [CPSR], Uses = [CPSR] in
-def tSBCS : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sbc $dst, $rhs",
- [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>;
+let Uses = [CPSR] in
+def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "sbc", " $dst, $rhs",
+ [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>;
// Subtract immediate
-let Defs = [CPSR] in {
-def tSUBi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>;
-def tSUBSi3 : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm0_7_neg:$rhs))]>;
-}
+def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>;
-let Defs = [CPSR] in {
-def tSUBi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs",
- [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>;
-def tSUBSi8 : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs",
- [(set tGPR:$dst, (addc tGPR:$lhs, imm8_255_neg:$rhs))]>;
-}
+def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+ "sub", " $dst, $rhs",
+ [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>;
// subtract register
-let Defs = [CPSR] in {
-def tSUBrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>;
-def tSUBSrr : T1I<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
- "sub $dst, $lhs, $rhs",
- [(set tGPR:$dst, (subc tGPR:$lhs, tGPR:$rhs))]>;
-}
+def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+ "sub", " $dst, $lhs, $rhs",
+ [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>;
// TODO: A7-96: STMIA - store multiple.
-def tSUBspi : T1It<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
- "sub $dst, $rhs * 4", []>;
-
// sign-extend byte
-def tSXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "sxtb $dst, $src",
- [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
- Requires<[IsThumb, HasV6]>;
+def tSXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "sxtb", " $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// sign-extend short
-def tSXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "sxth $dst, $src",
- [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
- Requires<[IsThumb, HasV6]>;
+def tSXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "sxth", " $dst, $src",
+ [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// test
let isCommutable = 1, Defs = [CPSR] in
-def tTST : T1I<(outs), (ins tGPR:$lhs, tGPR:$rhs),
- "tst $lhs, $rhs",
- [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>;
+def tTST : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+ "tst", " $lhs, $rhs",
+ [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>;
// zero-extend byte
-def tUXTB : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "uxtb $dst, $src",
- [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
- Requires<[IsThumb, HasV6]>;
+def tUXTB : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "uxtb", " $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// zero-extend short
-def tUXTH : T1I<(outs tGPR:$dst), (ins tGPR:$src),
- "uxth $dst, $src",
- [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
- Requires<[IsThumb, HasV6]>;
+def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+ "uxth", " $dst, $src",
+ [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
+ Requires<[IsThumb1Only, HasV6]>;
// tMOVCCr - Conditional move, used to implement the Thumb SELECT_CC DAG
// operation. Expanded by the scheduler into a branch sequence.
let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler.
- def tMOVCCr :
+ def tMOVCCr_pseudo :
PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
- "@ tMOVCCr $cc",
- [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+ NoItinerary, "@ tMOVCCr $cc",
+ [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
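+
+// Sketch of the expansion (illustrative; label and register names are
+// hypothetical): the custom inserter builds the usual select diamond,
+// branching on $cc and merging the two values with a PHI:
+//     b<cc>   LsinkBB          @ CPSR set by a preceding compare
+//     ...                      @ fallthrough block supplies the other value
+//   LsinkBB:
+//     $dst = PHI($true, $false)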
+
+
+// 16-bit movcc in IT blocks for Thumb2.
+def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr,
+ "mov", " $dst, $rhs", []>;
+
+def tMOVCCi : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iCMOVi,
+ "mov", " $dst, $rhs", []>;
// tLEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def tLEApcrel : TIx2<(outs tGPR:$dst), (ins i32imm:$label),
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+4))\n"),
- !strconcat("\tmov $dst, #PCRELV${:uid}\n",
- "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
- []>;
-
-def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id),
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+4))\n"),
- !strconcat("\tmov $dst, #PCRELV${:uid}\n",
- "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
- []>;
+def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
+ "adr$p $dst, #$label", []>;
+
+def tLEApcrelJT : T1I<(outs tGPR:$dst),
+ (ins i32imm:$label, nohash_imm:$id, pred:$p),
+ IIC_iALUi, "adr$p $dst, #${label}_${id}", []>;
//===----------------------------------------------------------------------===//
// TLS Instructions
@@ -598,7 +642,7 @@ def tLEApcrelJT : TIx2<(outs tGPR:$dst), (ins i32imm:$label, i32imm:$id),
// __aeabi_read_tp preserves the registers r1-r3.
let isCall = 1,
Defs = [R0, LR] in {
- def tTPsoft : TIx2<(outs), (ins),
+ def tTPsoft : TIx2<(outs), (ins), IIC_Br,
"bl __aeabi_read_tp",
[(set R0, ARMthread_pointer)]>;
}
@@ -607,20 +651,46 @@ let isCall = 1,
// Non-Instruction Patterns
//
+// Add with carry
+def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs),
+ (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs),
+ (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs),
+ (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs),
+ (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
+ (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
+ (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
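+
+// Note: Thumb1 ALU instructions outside an IT block always set the flags
+// (modeled by the T1sI / T1sIt classes), so a carry-producing addc / subc
+// maps straight onto the ordinary instruction, e.g. (UAL, registers
+// illustrative):
+//     adds r0, r1, #7          @ tADDi3; the carry lands in CPSR for free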
+
// ConstantPool, GlobalAddress
-def : TPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
-def : TPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
// JumpTable
-def : TPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
- (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+ (tLEApcrelJT tjumptable:$dst, imm:$id)>;
// Direct calls
-def : TPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>;
-def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>;
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+ Requires<[IsThumb, IsNotDarwin]>;
+def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>,
+ Requires<[IsThumb, IsDarwin]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// Indirect calls to ARM routines
-def : Tv5Pat<(ARMcall tGPR:$dst), (tBLXr tGPR:$dst)>;
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
+ Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// zextload i1 -> zextload i8
def : T1Pat<(zextloadi1 t_addrmode_s1:$addr),
@@ -631,6 +701,20 @@ def : T1Pat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
def : T1Pat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>;
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+ (tSXTB (tLDRB t_addrmode_s1:$addr))>,
+ Requires<[IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_s2:$addr),
+ (tSXTH (tLDRH t_addrmode_s2:$addr))>,
+ Requires<[IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+ (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_s1:$addr),
+ (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>;
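+
+// Worked example of the shift-based fallback (registers illustrative),
+// used when SXTB / SXTH are unavailable:
+//     ldrb r0, [r1]            @ zero-extending load
+//     lsls r0, r0, #24         @ move the byte into bits [31:24]
+//     asrs r0, r0, #24         @ shifting back replicates the sign bit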
+
// Large immediate handling.
// Two piece imms.
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 50345a6..0750dcc 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -11,6 +11,21 @@
//
//===----------------------------------------------------------------------===//
+// IT block predicate field
+def it_pred : Operand<i32> {
+ let PrintMethod = "printPredicateOperand";
+}
+
+// IT block condition mask
+def it_mask : Operand<i32> {
+ let PrintMethod = "printThumbITMask";
+}
+
+// Table branch address
+def tb_addrmode : Operand<i32> {
+ let PrintMethod = "printTBAddrMode";
+}
+
// Shifted operands. No register controlled shifts for Thumb2.
// Note: We do not support rrx shifted operands yet.
def t2_so_reg : Operand<i32>, // reg imm
@@ -20,23 +35,14 @@ def t2_so_reg : Operand<i32>, // reg imm
let MIOperandInfo = (ops GPR, i32imm);
}
-// t2_so_imm_XFORM - Return a t2_so_imm value packed into the format
-// described for t2_so_imm def below.
-def t2_so_imm_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(N->getZExtValue()), MVT::i32);
-}]>;
-
// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())), MVT::i32);
+ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
}]>;
// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())), MVT::i32);
+ return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
}]>;
// t2_so_imm - Match a 32-bit immediate operand, which is an
@@ -47,27 +53,21 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
// [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11].
def t2_so_imm : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1;
- }], t2_so_imm_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1;
+}]>;
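+
+// Illustrative encodable values: besides a plain 8-bit constant, the
+// modified-immediate forms cover byte splats and rotated bytes, e.g.
+//     0x000000AB                 @ 8-bit immediate
+//     0x00AB00AB / 0xAB00AB00    @ halfword splats
+//     0xABABABAB                 @ byte splat
+//     0x0AB00000                 @ 0xAB ror 12 (rotated byte, bit 7 set)
+// while something like 0x00FFF000 (12 contiguous ones) is not encodable.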
// t2_so_imm_not - Match an immediate that is a complement
// of a t2_so_imm.
def t2_so_imm_not : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
- }], t2_so_imm_not_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM>;
// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
def t2_so_imm_neg : Operand<i32>,
PatLeaf<(imm), [{
- return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
- }], t2_so_imm_neg_XFORM> {
- let PrintMethod = "printT2SOImmOperand";
-}
+ return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
+}], t2_so_imm_neg_XFORM>;
/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
def imm1_31 : PatLeaf<(i32 imm), [{
@@ -75,7 +75,8 @@ def imm1_31 : PatLeaf<(i32 imm), [{
}]>;
/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
-def imm0_4095 : PatLeaf<(i32 imm), [{
+def imm0_4095 : Operand<i32>,
+ PatLeaf<(i32 imm), [{
return (uint32_t)N->getZExtValue() < 4096;
}]>;
@@ -83,48 +84,9 @@ def imm0_4095_neg : PatLeaf<(i32 imm), [{
return (uint32_t)(-N->getZExtValue()) < 4096;
}], imm_neg_XFORM>;
-/// imm0_65535 predicate - True if the 32-bit immediate is in the range
-/// [0.65535].
-def imm0_65535 : PatLeaf<(i32 imm), [{
- return (uint32_t)N->getZExtValue() < 65536;
-}]>;
-
-/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
-/// e.g., 0xf000ffff
-def bf_inv_mask_imm : Operand<i32>,
- PatLeaf<(imm), [{
- uint32_t v = (uint32_t)N->getZExtValue();
- if (v == 0xffffffff)
- return 0;
- // naive checker. should do better, but simple is best for now since it's
- // more likely to be correct.
- while (v & 1) v >>= 1; // shift off the leading 1's
- if (v)
- {
- while (!(v & 1)) v >>=1; // shift off the mask
- while (v & 1) v >>= 1; // shift off the trailing 1's
- }
- // if this is a mask for clearing a bitfield, what's left should be zero.
- return (v == 0);
-}] > {
- let PrintMethod = "printBitfieldInvMaskImmOperand";
-}
-
-/// Split a 32-bit immediate into two 16 bit parts.
-def t2_lo16 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
- MVT::i32);
-}]>;
-
-def t2_hi16 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
-}]>;
-
-def t2_lo16AllZero : PatLeaf<(i32 imm), [{
- // Returns true if all low 16-bits are 0.
- return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
- }], t2_hi16>;
-
+def imm0_255_neg : PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 256;
+}], imm_neg_XFORM>;
// Define Thumb2 specific addressing modes.
@@ -147,14 +109,14 @@ def t2am_imm8_offset : Operand<i32>,
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
}
-// t2addrmode_imm8s4 := reg + (imm8 << 2)
+// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
def t2addrmode_imm8s4 : Operand<i32>,
ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> {
- let PrintMethod = "printT2AddrModeImm8Operand";
+ let PrintMethod = "printT2AddrModeImm8s4Operand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
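
// Offset arithmetic for the mode above: the 8-bit field is scaled by 4 and
// may be negated, so it reaches +/-(255 * 4) = -1020..+1020 bytes in steps
// of 4; this is the addressing mode LDRD / STRD use.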
-// t2addrmode_so_reg := reg + reg << imm2
+// t2addrmode_so_reg := reg + (reg << imm2)
def t2addrmode_so_reg : Operand<i32>,
ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
let PrintMethod = "printT2AddrModeSoRegOperand";
@@ -171,52 +133,58 @@ def t2addrmode_so_reg : Operand<i32>,
/// changed to modify CPSR.
multiclass T2I_un_irs<string opc, PatFrag opnode, bit Cheap = 0, bit ReMat = 0>{
// shifted imm
- def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src),
+ def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
opc, " $dst, $src",
[(set GPR:$dst, (opnode t2_so_imm:$src))]> {
let isAsCheapAsAMove = Cheap;
let isReMaterializable = ReMat;
}
// register
- def r : T2I<(outs GPR:$dst), (ins GPR:$src),
- opc, " $dst, $src",
+ def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ opc, ".w $dst, $src",
[(set GPR:$dst, (opnode GPR:$src))]>;
// shifted register
- def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src),
- opc, " $dst, $src",
+ def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi,
+ opc, ".w $dst, $src",
[(set GPR:$dst, (opnode t2_so_reg:$src))]>;
}
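
// For illustration, a single defm of this multiclass, e.g.
//     defm t2MVN : T2I_un_irs<"mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
// expands to three records -- t2MVNi, t2MVNr and t2MVNs -- one per
// operand kind.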
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// binary operation that produces a value. These are predicable and can be
/// changed to modify CPSR.
-multiclass T2I_bin_irs<string opc, PatFrag opnode, bit Commutable = 0> {
+multiclass T2I_bin_irs<string opc, PatFrag opnode,
+ bit Commutable = 0, string wide =""> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
opc, " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, !strconcat(wide, " $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, !strconcat(wide, " $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+/// the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<string opc, PatFrag opnode, bit Commutable = 0> :
+ T2I_bin_irs<opc, opnode, Commutable, ".w">;
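+
+// Example of the distinction: "and" also has a 16-bit Thumb encoding, so
+// its 32-bit register form must print as "and.w r0, r1, r2" (registers
+// illustrative) to keep the assembler from picking the narrow encoding;
+// opcodes with no 16-bit form do not need the qualifier.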
+
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands is
/// reversed. It doesn't define the 'rr' form since it's handled by its
/// T2I_bin_irs counterpart.
multiclass T2I_rbin_is<string opc, PatFrag opnode> {
// shifted imm
- def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- opc, " $dst, $rhs, $lhs",
+ def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
+ opc, ".w $dst, $rhs, $lhs",
[(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>;
// shifted register
- def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
+ def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
opc, " $dst, $rhs, $lhs",
[(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>;
}
@@ -226,18 +194,18 @@ multiclass T2I_rbin_is<string opc, PatFrag opnode> {
let Defs = [CPSR] in {
multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// register
- def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- !strconcat(opc, "s"), " $dst, $lhs, $rhs",
+ def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ !strconcat(opc, "s"), ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
}
@@ -246,22 +214,22 @@ multiclass T2I_bin_s_irs<string opc, PatFrag opnode, bit Commutable = 0> {
/// patterns for a binary operation that produces a value.
multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>;
// 12-bit imm
- def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
+ def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi,
!strconcat(opc, "w"), " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>;
}
@@ -271,41 +239,41 @@ multiclass T2I_bin_ii12rs<string opc, PatFrag opnode, bit Commutable = 0> {
let Uses = [CPSR] in {
multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
opc, " $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]> {
let isCommutable = Commutable;
}
// shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUnused]>;
// Carry setting variants
// shifted imm
- def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs),
+ def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
!strconcat(opc, "s $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
}
// register
- def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- !strconcat(opc, "s $dst, $lhs, $rhs"),
+ def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ !strconcat(opc, "s.w $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
let isCommutable = Commutable;
}
// shifted register
- def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs),
- !strconcat(opc, "s $dst, $lhs, $rhs"),
+ def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ !strconcat(opc, "s.w $dst, $lhs, $rhs"),
[(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
Requires<[IsThumb2, CarryDefIsUsed]> {
let Defs = [CPSR];
@@ -313,49 +281,17 @@ multiclass T2I_adde_sube_irs<string opc, PatFrag opnode, bit Commutable = 0> {
}
}
-/// T2I_rsc_is - Same as T2I_adde_sube_irs except the order of operands are
-/// reversed. It doesn't define the 'rr' form since it's handled by its
-/// T2I_adde_sube_irs counterpart.
-let Defs = [CPSR], Uses = [CPSR] in {
-multiclass T2I_rsc_is<string opc, PatFrag opnode> {
- // shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- opc, " $dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUnused]>;
- // shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
- opc, " $dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUnused]>;
- // shifted imm
- def Sri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs),
- !strconcat(opc, "s $dst, $rhs, $lhs"),
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUsed]> {
- let Defs = [CPSR];
- }
- // shifted register
- def Srs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs),
- !strconcat(opc, "s $dst, $rhs, $lhs"),
- [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>,
- Requires<[IsThumb2, CarryDefIsUsed]> {
- let Defs = [CPSR];
- }
-}
-}
-
-/// T2I_rbin_s_is - Same as T2I_bin_s_irs except the order of operands are
-/// reversed. It doesn't define the 'rr' form since it's handled by its
-/// T2I_bin_s_irs counterpart.
+/// T2I_rbin_s_is - Same as T2I_rbin_is except it sets the 's' bit.
let Defs = [CPSR] in {
multiclass T2I_rbin_s_is<string opc, PatFrag opnode> {
// shifted imm
def ri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs, cc_out:$s),
- !strconcat(opc, "${s} $dst, $rhs, $lhs"),
+ IIC_iALUi,
+ !strconcat(opc, "${s}.w $dst, $rhs, $lhs"),
[(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]>;
// shifted register
def rs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs, cc_out:$s),
+ IIC_iALUsi,
!strconcat(opc, "${s} $dst, $rhs, $lhs"),
[(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]>;
}
@@ -365,96 +301,96 @@ multiclass T2I_rbin_s_is<string opc, PatFrag opnode> {
// rotate operation that produces a value.
multiclass T2I_sh_ir<string opc, PatFrag opnode> {
// 5-bit imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]>;
// register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
- opc, " $dst, $lhs, $rhs",
+ def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr,
+ opc, ".w $dst, $lhs, $rhs",
[(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>;
}
-/// T21_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// T2I_cmp_is - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// an explicit result; it only implicitly sets CPSR.
-let Uses = [CPSR] in {
+let Defs = [CPSR] in {
multiclass T2I_cmp_is<string opc, PatFrag opnode> {
// shifted imm
- def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs),
- opc, " $lhs, $rhs",
+ def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, t2_so_imm:$rhs)]>;
// register
- def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs),
- opc, " $lhs, $rhs",
+ def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, GPR:$rhs)]>;
// shifted register
- def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs),
- opc, " $lhs, $rhs",
+ def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi,
+ opc, ".w $lhs, $rhs",
[(opnode GPR:$lhs, t2_so_reg:$rhs)]>;
}
}
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
multiclass T2I_ld<string opc, PatFrag opnode> {
- def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr),
- opc, " $dst, $addr",
+ def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]>;
- def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr),
+ def i8 : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi,
opc, " $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]>;
- def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr),
- opc, " $dst, $addr",
+ def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]>;
- def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr),
- opc, " $dst, $addr",
+ def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+ opc, ".w $dst, $addr",
[(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]>;
}
/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
multiclass T2I_st<string opc, PatFrag opnode> {
- def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr),
- opc, " $src, $addr",
+ def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei,
+ opc, ".w $src, $addr",
[(opnode GPR:$src, t2addrmode_imm12:$addr)]>;
- def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr),
+ def i8 : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei,
opc, " $src, $addr",
[(opnode GPR:$src, t2addrmode_imm8:$addr)]>;
- def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr),
- opc, " $src, $addr",
+ def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer,
+ opc, ".w $src, $addr",
[(opnode GPR:$src, t2addrmode_so_reg:$addr)]>;
}
/// T2I_picld - Defines the PIC load pattern.
class T2I_picld<string opc, PatFrag opnode> :
- T2I<(outs GPR:$dst), (ins addrmodepc:$addr),
- !strconcat("${addr:label}:\n\t", opc), " $dst, $addr",
+ T2I<(outs GPR:$dst), (ins addrmodepc:$addr), IIC_iLoadi,
+ !strconcat("\n${addr:label}:\n\t", opc), " $dst, $addr",
[(set GPR:$dst, (opnode addrmodepc:$addr))]>;
/// T2I_picst - Defines the PIC store pattern.
class T2I_picst<string opc, PatFrag opnode> :
- T2I<(outs), (ins GPR:$src, addrmodepc:$addr),
- !strconcat("${addr:label}:\n\t", opc), " $src, $addr",
+ T2I<(outs), (ins GPR:$src, addrmodepc:$addr), IIC_iStorer,
+ !strconcat("\n${addr:label}:\n\t", opc), " $src, $addr",
[(opnode GPR:$src, addrmodepc:$addr)]>;
/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
multiclass T2I_unary_rrot<string opc, PatFrag opnode> {
- def r : T2I<(outs GPR:$dst), (ins GPR:$Src),
- opc, " $dst, $Src",
- [(set GPR:$dst, (opnode GPR:$Src))]>;
- def r_rot : T2I<(outs GPR:$dst), (ins GPR:$Src, i32imm:$rot),
- opc, " $dst, $Src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>;
+ def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ opc, ".w $dst, $src",
+ [(set GPR:$dst, (opnode GPR:$src))]>;
+ def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
+ opc, ".w $dst, $src, ror $rot",
+ [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>;
}
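
// The rotated form lets isel extend a byte or halfword other than the
// lowest one, e.g. (registers illustrative) sign-extending byte 2:
//     sxtb.w r0, r1, ror #16   @ matches (sext_inreg (rotr $src, 16), i8)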
/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
multiclass T2I_bin_rrot<string opc, PatFrag opnode> {
- def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
+ def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr,
opc, " $dst, $LHS, $RHS",
[(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>;
def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
- opc, " $dst, $LHS, $RHS, ror $rot",
+ IIC_iALUsr, opc, " $dst, $LHS, $RHS, ror $rot",
[(set GPR:$dst, (opnode GPR:$LHS,
(rotr GPR:$RHS, rot_imm:$rot)))]>;
}
@@ -467,42 +403,46 @@ multiclass T2I_bin_rrot<string opc, PatFrag opnode> {
// Miscellaneous Instructions.
//
-let isNotDuplicable = 1 in
-def t2PICADD : T2XI<(outs tGPR:$dst), (ins tGPR:$lhs, pclabel:$cp),
- "$cp:\n\tadd $dst, pc",
- [(set tGPR:$dst, (ARMpic_add tGPR:$lhs, imm:$cp))]>;
-
-
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
-def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p),
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
+def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
+ "adr$p.w $dst, #$label", []>;
def t2LEApcrelJT : T2XI<(outs GPR:$dst),
- (ins i32imm:$label, i32imm:$id, pred:$p),
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
-
-// ADD rd, sp, #so_imm
-def t2ADDrSPi : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
- "add $dst, $sp, $imm",
- []>;
-
-// ADD rd, sp, #imm12
-def t2ADDrSPi12 : T2XI<(outs GPR:$dst), (ins GPR:$sp, i32imm:$imm),
- "addw $dst, $sp, $imm",
- []>;
-
-def t2ADDrSPs : T2XI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
- "addw $dst, $sp, $rhs",
- []>;
+ (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi,
+ "adr$p.w $dst, #${label}_${id}", []>;
+
+// ADD r, sp, {so_imm|i12}
+def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ IIC_iALUi, "add", ".w $dst, $sp, $imm", []>;
+def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ IIC_iALUi, "addw", " $dst, $sp, $imm", []>;
+
+// ADD r, sp, so_reg
+def t2ADDrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ IIC_iALUsi, "add", ".w $dst, $sp, $rhs", []>;
+
+// SUB r, sp, {so_imm|i12}
+def t2SUBrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ IIC_iALUi, "sub", ".w $dst, $sp, $imm", []>;
+def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ IIC_iALUi, "subw", " $dst, $sp, $imm", []>;
+
+// SUB r, sp, so_reg
+def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ IIC_iALUsi,
+ "sub", " $dst, $sp, $rhs", []>;
+
+
+// Pseudo instruction that will expand into a t2SUBrSPi + a copy.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
+ NoItinerary, "@ sub.w $dst, $sp, $imm", []>;
+def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
+ NoItinerary, "@ subw $dst, $sp, $imm", []>;
+def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+ NoItinerary, "@ sub $dst, $sp, $rhs", []>;
+} // usesCustomDAGSchedInserter
//===----------------------------------------------------------------------===//
@@ -521,12 +461,14 @@ defm t2LDRB : T2I_ld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>;
defm t2LDRSH : T2I_ld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
defm t2LDRSB : T2I_ld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>;
-let mayLoad = 1 in {
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
-def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst), (ins t2addrmode_imm8s4:$addr),
- "ldrd", " $dst, $addr", []>;
-def t2LDRDpci : T2Ii8s4<(outs GPR:$dst), (ins i32imm:$addr),
- "ldrd", " $dst, $addr", []>;
+def t2LDRDi8 : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2),
+ (ins t2addrmode_imm8s4:$addr),
+ IIC_iLoadi, "ldrd", " $dst1, $addr", []>;
+def t2LDRDpci : T2Ii8s4<(outs GPR:$dst1, GPR:$dst2),
+ (ins i32imm:$addr), IIC_iLoadi,
+ "ldrd", " $dst1, $addr", []>;
}
// zextload i1 -> zextload i8
@@ -573,57 +515,57 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
let mayLoad = 1 in {
def t2LDR_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldr", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDR_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldr", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrb", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrb", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrh", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrh", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRSB_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrsb", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRSB_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrsb", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
def t2LDRSH_PRE : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins t2addrmode_imm8:$addr),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
"ldrsh", " $dst, $addr!", "$addr.base = $base_wb",
[]>;
def t2LDRSH_POST : T2Iidxldst<(outs GPR:$dst, GPR:$base_wb),
(ins GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
"ldrsh", " $dst, [$base], $offset", "$base = $base_wb",
[]>;
}
@@ -634,108 +576,95 @@ defm t2STRB : T2I_st<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
defm t2STRH : T2I_st<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
// Store doubleword
-let mayLoad = 1 in
-def t2STRDi8 : T2Ii8s4<(outs), (ins GPR:$src, t2addrmode_imm8s4:$addr),
- "strd", " $src, $addr", []>;
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<(outs),
+ (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr),
+ IIC_iStorer, "strd", " $src1, $addr", []>;
// Indexed stores
def t2STR_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"str", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STR_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"str", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRH_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"strh", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRH_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"strh", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRB_PRE : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePre,
+ AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
"strb", " $src, [$base, $offset]!", "$base = $base_wb",
[(set GPR:$base_wb,
(pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
def t2STRB_POST : T2Iidxldst<(outs GPR:$base_wb),
(ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
- AddrModeT2_i8, IndexModePost,
+ AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
"strb", " $src, [$base], $offset", "$base = $base_wb",
[(set GPR:$base_wb,
(post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
-// Address computation and loads and stores in PIC mode.
-let isNotDuplicable = 1, AddedComplexity = 10 in {
-let canFoldAsLoad = 1 in
-def t2PICLDR : T2I_picld<"ldr", UnOpFrag<(load node:$Src)>>;
-
-def t2PICLDRH : T2I_picld<"ldrh", UnOpFrag<(zextloadi16 node:$Src)>>;
-def t2PICLDRB : T2I_picld<"ldrb", UnOpFrag<(zextloadi8 node:$Src)>>;
-def t2PICLDRSH : T2I_picld<"ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
-def t2PICLDRSB : T2I_picld<"ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>;
-
-def t2PICSTR : T2I_picst<"str", BinOpFrag<(store node:$LHS, node:$RHS)>>;
-def t2PICSTRH : T2I_picst<"strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
-def t2PICSTRB : T2I_picst<"strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
-} // isNotDuplicable = 1, AddedComplexity = 10
-
+// FIXME: ldrd / strd pre / post variants
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-let mayLoad = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def t2LDM : T2XI<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
- "ldm${p}${addr:submode} $addr, $dst1", []>;
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide} $addr, $wb", []>;
-let mayStore = 1 in
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def t2STM : T2XI<(outs),
- (ins addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
- "stm${p}${addr:submode} $addr, $src1", []>;
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_iStorem, "stm${addr:submode}${p}${addr:wide} $addr, $wb", []>;
//===----------------------------------------------------------------------===//
// Move Instructions.
//
let neverHasSideEffects = 1 in
-def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src),
- "mov", " $dst, $src", []>;
+def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+ "mov", ".w $dst, $src", []>;
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src),
- "mov", " $dst, $src",
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in
+def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
+ "mov", ".w $dst, $src",
[(set GPR:$dst, t2_so_imm:$src)]>;
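
// For example, mov of #1 is matched by both t2MOVi (mov.w) and t2MOVi16
// (movw); the AddedComplexity above tips isel toward the t2_so_imm form.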
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src),
+def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
"movw", " $dst, $src",
[(set GPR:$dst, imm0_65535:$src)]>;
-// FIXME: Also available in ARM mode.
let Constraints = "$src = $dst" in
-def t2MOVTi16 : T2sI<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
- "movt", " $dst, $imm",
- [(set GPR:$dst,
- (or (and GPR:$src, 0xffff), t2_lo16AllZero:$imm))]>;
+def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi,
+ "movt", " $dst, $imm",
+ [(set GPR:$dst,
+ (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>;
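+
+// Together the movw / movt pair materializes an arbitrary 32-bit constant
+// in two instructions (registers illustrative):
+//     movw r0, #0x5678         @ r0 = 0x00005678
+//     movt r0, #0x1234         @ r0 = 0x12345678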
//===----------------------------------------------------------------------===//
// Extend Instructions.
@@ -785,12 +714,14 @@ defm t2SUBS : T2I_bin_s_irs <"sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
defm t2ADC : T2I_adde_sube_irs<"adc",BinOpFrag<(adde node:$LHS, node:$RHS)>,1>;
defm t2SBC : T2I_adde_sube_irs<"sbc",BinOpFrag<(sube node:$LHS, node:$RHS)>>;
-// RSB, RSC
+// RSB
defm t2RSB : T2I_rbin_is <"rsb", BinOpFrag<(sub node:$LHS, node:$RHS)>>;
defm t2RSBS : T2I_rbin_s_is <"rsb", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
-defm t2RSC : T2I_rsc_is <"rsc", BinOpFrag<(sube node:$LHS, node:$RHS)>>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
+let AddedComplexity = 1 in
+def : T2Pat<(add GPR:$src, imm0_255_neg:$imm),
+ (t2SUBri GPR:$src, imm0_255_neg:$imm)>;
def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
(t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
@@ -806,105 +737,250 @@ defm t2LSR : T2I_sh_ir<"lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>;
defm t2ASR : T2I_sh_ir<"asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>;
defm t2ROR : T2I_sh_ir<"ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
-def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src),
- "mov", " $dst, $src, rrx",
+let Uses = [CPSR] in {
+def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "rrx", " $dst, $src",
[(set GPR:$dst, (ARMrrx GPR:$src))]>;
+}
+
+let Defs = [CPSR] in {
+def t2MOVsrl_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "lsrs.w $dst, $src, #1",
+ [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>;
+def t2MOVsra_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+ "asrs.w $dst, $src, #1",
+ [(set GPR:$dst, (ARMsra_flag GPR:$src))]>;
+}
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
//
-defm t2AND : T2I_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
-defm t2ORR : T2I_bin_irs<"orr", BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
-defm t2EOR : T2I_bin_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm t2AND : T2I_bin_w_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR : T2I_bin_w_irs<"orr", BinOpFrag<(or node:$LHS, node:$RHS)>, 1>;
+defm t2EOR : T2I_bin_w_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
-defm t2BIC : T2I_bin_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm t2BIC : T2I_bin_w_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
-def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
- (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+let Constraints = "$src = $dst" in
+def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+ IIC_iALUi, "bfc", " $dst, $imm",
+ [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>;
-defm t2ORN : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ IIC_iALUi, "sbfx", " $dst, $src, $lsb, $width", []>;
-def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
- (t2ORNri GPR:$src, t2_so_imm_not:$imm)>;
+def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+ IIC_iALUi, "ubfx", " $dst, $src, $lsb, $width", []>;
+
+// FIXME: A8.6.18 BFI - Bitfield insert (Encoding T1)
+
+defm t2ORN : T2I_bin_irs<"orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
// Prefer t2MVN over t2EORri ra, rb, -1 because mvn has a 16-bit version
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <"mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
-def : T2Pat<(t2_so_imm_not:$src),
- (t2MVNi t2_so_imm_not:$src)>;
-// A8.6.17 BFC - Bitfield clear
-// FIXME: Also available in ARM mode.
-let Constraints = "$src = $dst" in
-def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
- "bfc", " $dst, $imm",
- [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>;
+def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
+ (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
-// FIXME: A8.6.18 BFI - Bitfield insert (Encoding T1)
+// FIXME: Disable this pattern on Darwin to work around an assembler bug.
+def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
+ (t2ORNri GPR:$src, t2_so_imm_not:$imm)>,
+ Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+ (t2MVNi t2_so_imm_not:$src)>;
//===----------------------------------------------------------------------===//
// Multiply Instructions.
//
let isCommutable = 1 in
-def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b),
+def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
"mul", " $dst, $a, $b",
[(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
-def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
"mla", " $dst, $a, $b, $c",
[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
-def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
"mls", " $dst, $a, $b, $c",
[(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>;
-// FIXME: SMULL, etc.
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+ "smull", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+ "umull", " $ldst, $hdst, $a, $b", []>;
+}
+
+// Multiply + accumulate
+def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "smlal", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "umlal", " $ldst, $hdst, $a, $b", []>;
+
+def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+ "umaal", " $ldst, $hdst, $a, $b", []>;
+} // neverHasSideEffects
+
+// Most significant word multiply
+def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ "smmul", " $dst, $a, $b",
+ [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>;
+
+def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+ "smmla", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>;
+
+
+def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+ "smmls", " $dst, $a, $b, $c",
+ [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>;
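+
+// SMMUL returns bits [63:32] of the 64-bit signed product, hence the mulhs
+// mapping; e.g. 0x40000000 * 4 = 2^32, so smmul yields 1.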
+
+multiclass T2I_smul<string opc, PatFrag opnode> {
+ def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "bb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16)))]>;
+
+ def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "bt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, (i32 16))))]>;
+
+ def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "tb"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+ (sext_inreg GPR:$b, i16)))]>;
+
+ def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+ !strconcat(opc, "tt"), " $dst, $a, $b",
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16))))]>;
+
+ def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+ !strconcat(opc, "wb"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), (i32 16)))]>;
+
+ def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+ !strconcat(opc, "wt"), " $dst, $a, $b",
+ [(set GPR:$dst, (sra (opnode GPR:$a,
+ (sra GPR:$b, (i32 16))), (i32 16)))]>;
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> {
+ def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc,
+ (opnode (sext_inreg GPR:$a, i16),
+ (sext_inreg GPR:$b, i16))))]>;
+
+ def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+ (sra GPR:$b, (i32 16)))))]>;
+
+ def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+ (sext_inreg GPR:$b, i16))))]>;
+
+ def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16)))))]>;
+
+ def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sext_inreg GPR:$b, i16)), (i32 16))))]>;
+
+ def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+ !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+ [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+ (sra GPR:$b, (i32 16))), (i32 16))))]>;
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
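+
+// Naming: 'B' / 'T' select the bottom / top halfword of the corresponding
+// operand, so e.g. (registers illustrative)
+//     smulbt r0, r1, r2        @ r0 = sext(r1[15:0]) * sext(r2[31:16])
+// which is the (sext_inreg $a, i16) * (sra $b, 16) BT pattern above.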
+
+// TODO: Halfword multiply accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
//
-def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src),
+def t2CLZ : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
"clz", " $dst, $src",
[(set GPR:$dst, (ctlz GPR:$src))]>;
-def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src),
- "rev", " $dst, $src",
+def t2REV : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "rev", ".w $dst, $src",
[(set GPR:$dst, (bswap GPR:$src))]>;
-def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src),
- "rev16", " $dst, $src",
+def t2REV16 : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "rev16", ".w $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, (i32 8)), 0xFF),
(or (and (shl GPR:$src, (i32 8)), 0xFF00),
(or (and (srl GPR:$src, (i32 8)), 0xFF0000),
(and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>;
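
// The dag above is rev16's behavior spelled out: each halfword is
// byte-swapped independently, e.g. 0xAABBCCDD -> 0xBBAADDCC.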
-/////
-/// A8.6.137 REVSH
-/////
-def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src),
- "revsh", " $dst, $src",
+def t2REVSH : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ "revsh", ".w $dst, $src",
[(set GPR:$dst,
(sext_inreg
- (or (srl (and GPR:$src, 0xFFFF), (i32 8)),
+ (or (srl (and GPR:$src, 0xFF00), (i32 8)),
(shl GPR:$src, (i32 8))), i16))]>;
-// FIXME: PKHxx etc.
+def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ IIC_iALUsi, "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+ (and (shl GPR:$src2, (i32 imm:$shamt)),
+ 0xFFFF0000)))]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+ (t2PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+ (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+ IIC_iALUsi, "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+ [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+ (and (sra GPR:$src2, imm16_31:$shamt),
+ 0xFFFF)))]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
+ (t2PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000),
+ (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+ (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
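+
+// Worked identity behind the alternates: pkhbt computes
+//     (src1 & 0xFFFF) | ((src2 << shamt) & 0xFFFF0000)
+// and for shamt >= 16 the shift already clears the low half, so the mask
+// drops out of the dag -- which is what the shl-only pattern above matches.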
//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
-defm t2CMP : T2I_cmp_is<"cmp",
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+defm t2CMP : T2I_cmp_is<"cmp",
+ BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
defm t2CMPz : T2I_cmp_is<"cmp",
BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
-defm t2CMN : T2I_cmp_is<"cmn",
- BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+defm t2CMN : T2I_cmp_is<"cmn",
+ BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
defm t2CMNz : T2I_cmp_is<"cmn",
BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
@@ -923,45 +999,132 @@ defm t2TEQ : T2I_cmp_is<"teq",
// Short range conditional branch. Looks awesome for loops. Need to figure
// out how to use this one.
-// FIXME: Conditional moves
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr,
+ "mov", ".w $dst, $true",
+ [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true),
+ IIC_iCMOVi, "mov", ".w $dst, $true",
+[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $dst">;
+
+def t2MOVCClsl : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "lsl", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCClsr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "lsr", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCCasr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "asr", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+def t2MOVCCror : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
+ IIC_iCMOVsi, "ror", ".w $dst, $true, $rhs", []>,
+ RegConstraint<"$false = $dst">;
+
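
Each of these pseudos implements a select whose destination is tied to $false and is conditionally overwritten inside an IT block; roughly, as a C++ model (the condition is carried by the implicit predicate operand, not shown here):

    // dst starts as the false value (RegConstraint<"$false = $dst">); the
    // predicated mov.w replaces it only when the condition holds.
    int movcc(bool cond, int falseVal, int trueVal) {
      int dst = falseVal;
      if (cond)
        dst = trueVal;
      return dst;
    }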
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+ Defs = [R0, R12, LR, CPSR] in {
+ def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
+ "bl __aeabi_read_tp",
+ [(set R0, ARMthread_pointer)]>;
+}
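
For context, a small example of the code this serves (using the GNU __thread extension; the offset add after the call is an assumption about the surrounding lowering, not something this patch spells out):

    __thread int tls_counter;        // thread-local variable
    int bump() {
      // Each access lowers to: bl __aeabi_read_tp (thread pointer -> R0),
      // followed by an add of tls_counter's thread-pointer-relative offset.
      return ++tls_counter;
    }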
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+// eh_sjlj_setjmp() is an instruction sequence to store the return
+// address and save #0 in R0 for the non-longjmp case.
+// Since by its nature we may be coming from some other function to get
+// here, and we're using the stack frame for the containing function to
+// save/restore registers, we can't keep anything live in regs across
+// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+// when we get here from a longjmp(). We force everything out of registers
+// except for our own input by listing the relevant registers in Defs. By
+// doing so, we also cause the prologue/epilogue code to actively preserve
+// all of the callee-saved registers, which is exactly what we want.
+let Defs =
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0,
+ D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+ D31 ] in {
+ def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src),
+ AddrModeNone, SizeSpecial, NoItinerary,
+ "str.w sp, [$src, #+8] @ eh_setjmp begin\n"
+ "\tadr r12, 0f\n"
+ "\torr r12, #1\n"
+ "\tstr.w r12, [$src, #+4]\n"
+ "\tmovs r0, #0\n"
+ "\tb 1f\n"
+ "0:\tmovs r0, #1 @ eh_setjmp end\n"
+ "1:", "",
+ [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
+}
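
A sketch of the buffer slots the sequence writes, read off the str.w offsets in the asm string (hypothetical struct and field names):

    // Assumed layout of the setjmp buffer used by t2Int_eh_sjlj_setjmp:
    struct SjLjBuf {
      void *slot0;       // [$src, #0] - not written by this sequence
      void *resumeAddr;  // [$src, #4] - adr r12, 0f ; orr r12, #1 (Thumb bit)
      void *savedSP;     // [$src, #8] - str.w sp
    };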
+
+
//===----------------------------------------------------------------------===//
// Control-Flow Instructions
//
+// FIXME: remove when we have a way to mark an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be at the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+ hasExtraDefRegAllocReq = 1 in
+ def t2LDM_RET : T2XI<(outs),
+ (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+ IIC_Br, "ldm${addr:submode}${p}${addr:wide} $addr, $wb",
+ []>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B : T2XI<(outs), (ins brtarget:$target),
- "b $target",
+def t2B : T2XI<(outs), (ins brtarget:$target), IIC_Br,
+ "b.w $target",
[(br bb:$target)]>;
let isNotDuplicable = 1, isIndirectBranch = 1 in {
-def t2BR_JTr : T2JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
- "mov pc, $target \n$jt",
- [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+def t2BR_JT :
+ T2JTI<(outs),
+ (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "mov pc, $target\n$jt",
+ [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
-def t2BR_JTm :
+// FIXME: Add a non-pc based case that can be predicated.
+def t2TBB :
T2JTI<(outs),
- (ins t2addrmode_so_reg:$target, jtblock_operand:$jt, i32imm:$id),
- "ldr pc, $target \n$jt",
- [(ARMbrjt (i32 (load t2addrmode_so_reg:$target)), tjumptable:$jt,
- imm:$id)]>;
+ (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "tbb $index\n$jt", []>;
-def t2BR_JTadd :
+def t2TBH :
T2JTI<(outs),
- (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
- "add pc, $target, $idx \n$jt",
- [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, imm:$id)]>;
-} // isNotDuplicate, isIndirectBranch
+ (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+ IIC_Br, "tbh $index\n$jt", []>;
+} // isNotDuplicable, isIndirectBranch
+
} // isBranch, isTerminator, isBarrier
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
-def t2Bcc : T2I<(outs), (ins brtarget:$target),
- "b", " $target",
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+ "b", ".w $target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>;
+
+// IT block
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+ AddrModeNone, Size2Bytes, IIC_iALUx,
+ "it$mask $cc", "", []>;
+
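
For reference, an IT instruction supplies the condition for up to four following instructions; a short illustrative sequence (assembly shown as comments, matching the style of the examples elsewhere in this file):

    // ite eq            @ 'then' / 'else' for the next two instructions
    // moveq r0, #1      @ executes when EQ holds
    // movne r0, #0      @ executes otherwise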
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
@@ -972,7 +1135,10 @@ def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
(t2LEApcrelJT tjumptable:$dst, imm:$id)>;
-// Large immediate handling.
-
-def : T2Pat<(i32 imm:$src),
- (t2MOVTi16 (t2MOVi16 (t2_lo16 imm:$src)), (t2_hi16 imm:$src))>;
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable. Remove
+// when we can do generalized remat.
+let isReMaterializable = 1 in
+def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+ "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
+ [(set GPR:$dst, (i32 imm:$src))]>;
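
The expansion simply splits the 32-bit value into halfwords; a quick C++ sketch:

    #include <cstdint>
    // 0x12345678 expands to: movw dst, #0x5678 ; movt dst, #0x1234
    uint16_t lo16(uint32_t imm) { return uint16_t(imm & 0xFFFFu); } // ${src:lo16}
    uint16_t hi16(uint32_t imm) { return uint16_t(imm >> 16); }     // ${src:hi16}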
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 9104c77..56336d1 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -36,57 +36,57 @@ def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>;
let canFoldAsLoad = 1 in {
def FLDD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
- "fldd", " $dst, $addr",
+ IIC_fpLoad64, "fldd", " $dst, $addr",
[(set DPR:$dst, (load addrmode5:$addr))]>;
def FLDS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr),
- "flds", " $dst, $addr",
+ IIC_fpLoad32, "flds", " $dst, $addr",
[(set SPR:$dst, (load addrmode5:$addr))]>;
} // canFoldAsLoad
def FSTD : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr),
- "fstd", " $src, $addr",
+ IIC_fpStore64, "fstd", " $src, $addr",
[(store DPR:$src, addrmode5:$addr)]>;
def FSTS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
- "fsts", " $src, $addr",
+ IIC_fpStore32, "fsts", " $src, $addr",
[(store SPR:$src, addrmode5:$addr)]>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
-let mayLoad = 1 in {
-def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
- variable_ops),
- "fldm${addr:submode}d${p} ${addr:base}, $dst1",
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+def FLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpLoadm,
+ "fldm${addr:submode}d${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 1;
}
-def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dst1,
- variable_ops),
- "fldm${addr:submode}s${p} ${addr:base}, $dst1",
+def FLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpLoadm,
+ "fldm${addr:submode}s${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 1;
}
-}
+} // mayLoad, hasExtraDefRegAllocReq
-let mayStore = 1 in {
-def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
- variable_ops),
- "fstm${addr:submode}d${p} ${addr:base}, $src1",
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+def FSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpStorem,
+ "fstm${addr:submode}d${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 0;
}
-def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
- variable_ops),
- "fstm${addr:submode}s${p} ${addr:base}, $src1",
+def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+ variable_ops), IIC_fpStorem,
+ "fstm${addr:submode}s${p} ${addr:base}, $wb",
[]> {
let Inst{20} = 0;
}
-} // mayStore
+} // mayStore, hasExtraSrcRegAllocReq
// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
@@ -95,46 +95,48 @@ def FSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$src1,
//
def FADDD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "faddd", " $dst, $a, $b",
+ IIC_fpALU64, "faddd", " $dst, $a, $b",
[(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
-def FADDS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fadds", " $dst, $a, $b",
- [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
+def FADDS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpALU32, "fadds", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
// These are encoded as unary instructions.
+let Defs = [FPSCR] in {
def FCMPED : ADuI<0b11101011, 0b0100, 0b1100, (outs), (ins DPR:$a, DPR:$b),
- "fcmped", " $a, $b",
+ IIC_fpCMP64, "fcmped", " $a, $b",
[(arm_cmpfp DPR:$a, DPR:$b)]>;
def FCMPES : ASuI<0b11101011, 0b0100, 0b1100, (outs), (ins SPR:$a, SPR:$b),
- "fcmpes", " $a, $b",
+ IIC_fpCMP32, "fcmpes", " $a, $b",
[(arm_cmpfp SPR:$a, SPR:$b)]>;
+}
def FDIVD : ADbI<0b11101000, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fdivd", " $dst, $a, $b",
+ IIC_fpDIV64, "fdivd", " $dst, $a, $b",
[(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
def FDIVS : ASbI<0b11101000, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fdivs", " $dst, $a, $b",
+ IIC_fpDIV32, "fdivs", " $dst, $a, $b",
[(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
def FMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fmuld", " $dst, $a, $b",
+ IIC_fpMUL64, "fmuld", " $dst, $a, $b",
[(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
-def FMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fmuls", " $dst, $a, $b",
- [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+def FMULS : ASbIn<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpMUL32, "fmuls", " $dst, $a, $b",
+ [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
def FNMULD : ADbI<0b11100010, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fnmuld", " $dst, $a, $b",
+ IIC_fpMUL64, "fnmuld", " $dst, $a, $b",
[(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]> {
let Inst{6} = 1;
}
def FNMULS : ASbI<0b11100010, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fnmuls", " $dst, $a, $b",
+ IIC_fpMUL32, "fnmuls", " $dst, $a, $b",
[(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]> {
let Inst{6} = 1;
}
@@ -147,14 +149,14 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b),
def FSUBD : ADbI<0b11100011, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
- "fsubd", " $dst, $a, $b",
+ IIC_fpALU64, "fsubd", " $dst, $a, $b",
[(set DPR:$dst, (fsub DPR:$a, DPR:$b))]> {
let Inst{6} = 1;
}
-def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
- "fsubs", " $dst, $a, $b",
- [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> {
+def FSUBS : ASbIn<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+ IIC_fpALU32, "fsubs", " $dst, $a, $b",
+ [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]> {
let Inst{6} = 1;
}
@@ -163,29 +165,31 @@ def FSUBS : ASbI<0b11100011, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
//
def FABSD : ADuI<0b11101011, 0b0000, 0b1100, (outs DPR:$dst), (ins DPR:$a),
- "fabsd", " $dst, $a",
+ IIC_fpUNA64, "fabsd", " $dst, $a",
[(set DPR:$dst, (fabs DPR:$a))]>;
-def FABSS : ASuI<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a),
- "fabss", " $dst, $a",
- [(set SPR:$dst, (fabs SPR:$a))]>;
+def FABSS : ASuIn<0b11101011, 0b0000, 0b1100, (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpUNA32, "fabss", " $dst, $a",
+ [(set SPR:$dst, (fabs SPR:$a))]>;
+let Defs = [FPSCR] in {
def FCMPEZD : ADuI<0b11101011, 0b0101, 0b1100, (outs), (ins DPR:$a),
- "fcmpezd", " $a",
+ IIC_fpCMP64, "fcmpezd", " $a",
[(arm_cmpfp0 DPR:$a)]>;
def FCMPEZS : ASuI<0b11101011, 0b0101, 0b1100, (outs), (ins SPR:$a),
- "fcmpezs", " $a",
+ IIC_fpCMP32, "fcmpezs", " $a",
[(arm_cmpfp0 SPR:$a)]>;
+}
def FCVTDS : ASuI<0b11101011, 0b0111, 0b1100, (outs DPR:$dst), (ins SPR:$a),
- "fcvtds", " $dst, $a",
+ IIC_fpCVTDS, "fcvtds", " $dst, $a",
[(set DPR:$dst, (fextend SPR:$a))]>;
// Special case encoding: bits 11-8 is 0b1011.
-def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
- "fcvtsd", " $dst, $a",
- [(set SPR:$dst, (fround DPR:$a))]> {
+def FCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
+ IIC_fpCVTSD, "fcvtsd", " $dst, $a",
+ [(set SPR:$dst, (fround DPR:$a))]> {
let Inst{27-23} = 0b11101;
let Inst{21-16} = 0b110111;
let Inst{11-8} = 0b1011;
@@ -194,26 +198,26 @@ def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
let neverHasSideEffects = 1 in {
def FCPYD : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$a),
- "fcpyd", " $dst, $a", []>;
+ IIC_fpUNA64, "fcpyd", " $dst, $a", []>;
def FCPYS : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$a),
- "fcpys", " $dst, $a", []>;
+ IIC_fpUNA32, "fcpys", " $dst, $a", []>;
} // neverHasSideEffects
def FNEGD : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$a),
- "fnegd", " $dst, $a",
+ IIC_fpUNA64, "fnegd", " $dst, $a",
[(set DPR:$dst, (fneg DPR:$a))]>;
-def FNEGS : ASuI<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a),
- "fnegs", " $dst, $a",
- [(set SPR:$dst, (fneg SPR:$a))]>;
+def FNEGS : ASuIn<0b11101011, 0b0001, 0b0100, (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpUNA32, "fnegs", " $dst, $a",
+ [(set SPR:$dst, (fneg SPR:$a))]>;
def FSQRTD : ADuI<0b11101011, 0b0001, 0b1100, (outs DPR:$dst), (ins DPR:$a),
- "fsqrtd", " $dst, $a",
+ IIC_fpSQRT64, "fsqrtd", " $dst, $a",
[(set DPR:$dst, (fsqrt DPR:$a))]>;
def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a),
- "fsqrts", " $dst, $a",
+ IIC_fpSQRT32, "fsqrts", " $dst, $a",
[(set SPR:$dst, (fsqrt SPR:$a))]>;
//===----------------------------------------------------------------------===//
@@ -221,16 +225,16 @@ def FSQRTS : ASuI<0b11101011, 0b0001, 0b1100, (outs SPR:$dst), (ins SPR:$a),
//
def FMRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src),
- "fmrs", " $dst, $src",
+ IIC_VMOVSI, "fmrs", " $dst, $src",
[(set GPR:$dst, (bitconvert SPR:$src))]>;
def FMSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src),
- "fmsr", " $dst, $src",
+ IIC_VMOVIS, "fmsr", " $dst, $src",
[(set SPR:$dst, (bitconvert GPR:$src))]>;
def FMRRD : AVConv3I<0b11000101, 0b1011,
- (outs GPR:$dst1, GPR:$dst2), (ins DPR:$src),
- "fmrrd", " $dst1, $dst2, $src",
+ (outs GPR:$wb, GPR:$dst2), (ins DPR:$src),
+ IIC_VMOVDI, "fmrrd", " $wb, $dst2, $src",
[/* FIXME: Can't write pattern for multiple result instr*/]>;
// FMDHR: GPR -> SPR
@@ -238,7 +242,7 @@ def FMRRD : AVConv3I<0b11000101, 0b1011,
def FMDRR : AVConv5I<0b11000100, 0b1011,
(outs DPR:$dst), (ins GPR:$src1, GPR:$src2),
- "fmdrr", " $dst, $src1, $src2",
+ IIC_VMOVID, "fmdrr", " $dst, $src1, $src2",
[(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>;
// FMRDH: SPR -> GPR
@@ -254,23 +258,23 @@ def FMDRR : AVConv5I<0b11000100, 0b1011,
// Int to FP:
def FSITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
- "fsitod", " $dst, $a",
+ IIC_fpCVTID, "fsitod", " $dst, $a",
[(set DPR:$dst, (arm_sitof SPR:$a))]> {
let Inst{7} = 1;
}
-def FSITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
- "fsitos", " $dst, $a",
+def FSITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
+ IIC_fpCVTIS, "fsitos", " $dst, $a",
[(set SPR:$dst, (arm_sitof SPR:$a))]> {
let Inst{7} = 1;
}
def FUITOD : AVConv1I<0b11101011, 0b1000, 0b1011, (outs DPR:$dst), (ins SPR:$a),
- "fuitod", " $dst, $a",
+ IIC_fpCVTID, "fuitod", " $dst, $a",
[(set DPR:$dst, (arm_uitof SPR:$a))]>;
-def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
- "fuitos", " $dst, $a",
+def FUITOS : AVConv1In<0b11101011, 0b1000, 0b1010, (outs SPR:$dst),(ins SPR:$a),
+ IIC_fpCVTIS, "fuitos", " $dst, $a",
[(set SPR:$dst, (arm_uitof SPR:$a))]>;
// FP to Int:
@@ -278,28 +282,28 @@ def FUITOS : AVConv1I<0b11101011, 0b1000, 0b1010, (outs SPR:$dst), (ins SPR:$a),
def FTOSIZD : AVConv1I<0b11101011, 0b1101, 0b1011,
(outs SPR:$dst), (ins DPR:$a),
- "ftosizd", " $dst, $a",
+ IIC_fpCVTDI, "ftosizd", " $dst, $a",
[(set SPR:$dst, (arm_ftosi DPR:$a))]> {
let Inst{7} = 1; // Z bit
}
-def FTOSIZS : AVConv1I<0b11101011, 0b1101, 0b1010,
- (outs SPR:$dst), (ins SPR:$a),
- "ftosizs", " $dst, $a",
+def FTOSIZS : AVConv1In<0b11101011, 0b1101, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpCVTSI, "ftosizs", " $dst, $a",
[(set SPR:$dst, (arm_ftosi SPR:$a))]> {
let Inst{7} = 1; // Z bit
}
def FTOUIZD : AVConv1I<0b11101011, 0b1100, 0b1011,
(outs SPR:$dst), (ins DPR:$a),
- "ftouizd", " $dst, $a",
+ IIC_fpCVTDI, "ftouizd", " $dst, $a",
[(set SPR:$dst, (arm_ftoui DPR:$a))]> {
let Inst{7} = 1; // Z bit
}
-def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
- (outs SPR:$dst), (ins SPR:$a),
- "ftouizs", " $dst, $a",
+def FTOUIZS : AVConv1In<0b11101011, 0b1100, 0b1010,
+ (outs SPR:$dst), (ins SPR:$a),
+ IIC_fpCVTSI, "ftouizs", " $dst, $a",
[(set SPR:$dst, (arm_ftoui SPR:$a))]> {
let Inst{7} = 1; // Z bit
}
@@ -309,48 +313,53 @@ def FTOUIZS : AVConv1I<0b11101011, 0b1100, 0b1010,
//
def FMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fmacd", " $dst, $a, $b",
+ IIC_fpMAC64, "fmacd", " $dst, $a, $b",
[(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
-def FMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fmacs", " $dst, $a, $b",
- [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
- RegConstraint<"$dstin = $dst">;
+def FMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ IIC_fpMAC32, "fmacs", " $dst, $a, $b",
+ [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+ RegConstraint<"$dstin = $dst">;
def FMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fmscd", " $dst, $a, $b",
+ IIC_fpMAC64, "fmscd", " $dst, $a, $b",
[(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fmscs", " $dst, $a, $b",
+ IIC_fpMAC32, "fmscs", " $dst, $a, $b",
[(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMACD : ADbI<0b11100000, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fnmacd", " $dst, $a, $b",
+ IIC_fpMAC64, "fnmacd", " $dst, $a, $b",
[(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
-def FNMACS : ASbI<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fnmacs", " $dst, $a, $b",
+def FNMACS : ASbIn<0b11100000, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+ IIC_fpMAC32, "fnmacs", " $dst, $a, $b",
[(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
+def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, DPR:$b)),
+ (FNMACD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>;
+def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
+ (FNMACS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>;
+
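
These two patterns rely on dstin - a*b being exactly the negated-product accumulate that fnmac computes; a one-line C++ sketch of the matched semantics (ignoring FP-contraction subtleties):

    // Matched by the fsub patterns above: accumulate the negated product.
    float fnmacs(float acc, float a, float b) { return acc - a * b; }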
def FNMSCD : ADbI<0b11100001, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
- "fnmscd", " $dst, $a, $b",
+ IIC_fpMAC64, "fnmscd", " $dst, $a, $b",
[(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
}
def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
- "fnmscs", " $dst, $a, $b",
+ IIC_fpMAC32, "fnmscs", " $dst, $a, $b",
[(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst"> {
let Inst{6} = 1;
@@ -362,25 +371,25 @@ def FNMSCS : ASbI<0b11100001, (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
def FCPYDcc : ADuI<0b11101011, 0b0000, 0b0100,
(outs DPR:$dst), (ins DPR:$false, DPR:$true),
- "fcpyd", " $dst, $true",
+ IIC_fpUNA64, "fcpyd", " $dst, $true",
[/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FCPYScc : ASuI<0b11101011, 0b0000, 0b0100,
(outs SPR:$dst), (ins SPR:$false, SPR:$true),
- "fcpys", " $dst, $true",
+ IIC_fpUNA32, "fcpys", " $dst, $true",
[/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FNEGDcc : ADuI<0b11101011, 0b0001, 0b0100,
(outs DPR:$dst), (ins DPR:$false, DPR:$true),
- "fnegd", " $dst, $true",
+ IIC_fpUNA64, "fnegd", " $dst, $true",
[/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100,
(outs SPR:$dst), (ins SPR:$false, SPR:$true),
- "fnegs", " $dst, $true",
+ IIC_fpUNA32, "fnegs", " $dst, $true",
[/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
RegConstraint<"$false = $dst">;
@@ -389,8 +398,8 @@ def FNEGScc : ASuI<0b11101011, 0b0001, 0b0100,
// Misc.
//
-let Defs = [CPSR] in
-def FMSTAT : AI<(outs), (ins), VFPMiscFrm, "fmstat", "", [(arm_fmstat)]> {
+let Defs = [CPSR], Uses = [FPSCR] in
+def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "fmstat", "", [(arm_fmstat)]> {
let Inst{27-20} = 0b11101111;
let Inst{19-16} = 0b0001;
let Inst{15-12} = 0b1111;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index e551c41..24990e6 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -19,15 +19,15 @@
#include "ARMSubtarget.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Config/alloca.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Streams.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/System/Memory.h"
#include <cstdlib>
using namespace llvm;
void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
- abort();
+ llvm_report_error("ARMJITInfo::replaceMachineCodeForFunction");
}
/// JITCompilerFunction - This contains the address of the JIT function used to
@@ -45,11 +45,11 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
// CompilationCallback stub - We can't use a C function with inline assembly in
// it, because the prolog/epilog inserted by GCC won't work for us (we need
// to preserve more context and manipulate the stack directly). Instead,
-// write our own wrapper, which does things our way, so we have complete
+// write our own wrapper, which does things our way, so we have complete
// control over register saving and restoring.
extern "C" {
#if defined(__arm__)
- void ARMCompilationCallback(void);
+ void ARMCompilationCallback();
asm(
".text\n"
".align 2\n"
@@ -77,11 +77,11 @@ extern "C" {
// order for the registers.
// +--------+
// 0 | LR | Original return address
- // +--------+
+ // +--------+
// 1 | LR | Stub address (start of stub)
// 2-5 | R3..R0 | Saved registers (we need to preserve all regs)
// 6-20 | D0..D7 | Saved VFP registers
- // +--------+
+ // +--------+
//
#ifndef __SOFTFP__
// Restore VFP caller-saved registers.
@@ -103,15 +103,14 @@ extern "C" {
);
#else // Not an ARM host
void ARMCompilationCallback() {
- assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n");
- abort();
+ llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
}
#endif
}
-/// ARMCompilationCallbackC - This is the target-specific function invoked
-/// by the function stub when we did not know the real target of a call.
-/// This function must locate the start of the stub or call site and pass
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
/// it into the JIT compiler function.
extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
// Get the address of the compiled code for this function.
@@ -123,14 +122,12 @@ extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
// ldr pc, [pc,#-4]
// <addr>
if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
- cerr << "ERROR: Unable to mark stub writable\n";
- abort();
+ llvm_unreachable("ERROR: Unable to mark stub writable");
}
*(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4]
*(intptr_t *)(StubAddr+4) = NewVal;
if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
- cerr << "ERROR: Unable to mark stub executable\n";
- abort();
+ llvm_unreachable("ERROR: Unable to mark stub executable");
}
}
@@ -143,7 +140,14 @@ ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
JITCodeEmitter &JCE) {
JCE.startGVStub(GV, 4, 4);
+ intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 4)) {
+ llvm_unreachable("ERROR: Unable to mark indirect symbol writable");
+ }
JCE.emitWordLE((intptr_t)Ptr);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 4)) {
+ llvm_unreachable("ERROR: Unable to mark indirect symbol executable");
+ }
void *PtrAddr = JCE.finishGVStub(GV);
addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
return PtrAddr;
@@ -161,31 +165,43 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
if (!LazyPtr) {
// In PIC mode, the function stub is loading a lazy-ptr.
LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
- if (F)
- DOUT << "JIT: Indirect symbol emitted at [" << LazyPtr << "] for GV '"
- << F->getName() << "'\n";
- else
- DOUT << "JIT: Stub emitted at [" << LazyPtr
- << "] for external function at '" << Fn << "'\n";
+ DEBUG(if (F)
+ errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
+ << "] for GV '" << F->getName() << "'\n";
+ else
+ errs() << "JIT: Stub emitted at [" << LazyPtr
+ << "] for external function at '" << Fn << "'\n");
}
JCE.startGVStub(F, 16, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
JCE.emitWordLE(0xe59fc004); // ldr ip, [pc, #+4]
JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip
JCE.emitWordLE(0xe59cf000); // ldr pc, [ip]
JCE.emitWordLE(LazyPtr - (Addr+4+8)); // func - (L_func$scv+8)
sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
} else {
// The stub is 8-byte size and 4-aligned.
JCE.startGVStub(F, 8, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 8)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
JCE.emitWordLE((intptr_t)Fn); // addr of function
sys::Memory::InvalidateInstructionCache((void*)Addr, 8);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 8)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
}
} else {
// The compilation callback will overwrite the first two words of this
- // stub with indirect branch instructions targeting the compiled code.
+ // stub with indirect branch instructions targeting the compiled code.
// This stub sets the return address to restart the stub, so that
// the new branch will be invoked when we come back.
//
@@ -193,6 +209,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
// The stub is 16-byte size and 4-byte aligned.
JCE.startGVStub(F, 16, 4);
intptr_t Addr = (intptr_t)JCE.getCurrentPCValue();
+ if (!sys::Memory::setRangeWritable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub writable");
+ }
// Save LR so the callback can determine which stub called it.
// The compilation callback is responsible for popping this prior
// to returning.
@@ -204,6 +223,9 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
// The address of the compilation callback.
JCE.emitWordLE((intptr_t)ARMCompilationCallback);
sys::Memory::InvalidateInstructionCache((void*)Addr, 16);
+ if (!sys::Memory::setRangeExecutable((void*)Addr, 16)) {
+ llvm_unreachable("ERROR: Unable to mark stub executable");
+ }
}
return JCE.finishGVStub(F);
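
For readers decoding the stub words, the constants emitted in this function annotate as follows (hand-decoded; treat as a reading aid):

    // ARM words used by the stubs above:
    //   0xe51ff004   ldr pc, [pc, #-4]   - jump via the literal in the next word
    //   0xe59fc004   ldr ip, [pc, #+4]   - first word of the PIC stub
    //   0xe08fc00c   add ip, pc, ip      - L_func$scv
    //   0xe59cf000   ldr pc, [ip]
    // The displacement LazyPtr - (Addr+4+8) works because the add executes at
    // Addr+4 and reads pc as its own address plus the 8-byte pipeline bias.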
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 59cf125..d2ec9ee 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -15,9 +15,11 @@
#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -29,6 +31,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -61,6 +64,7 @@ namespace {
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
RegScavenger *RS;
+ bool isThumb2;
virtual bool runOnMachineFunction(MachineFunction &Fn);
@@ -93,6 +97,15 @@ namespace {
void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const TargetInstrInfo *TII,
+ bool &Advance,
+ MachineBasicBlock::iterator &I);
+ bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool &Advance,
+ MachineBasicBlock::iterator &I);
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
@@ -107,6 +120,14 @@ static int getLoadStoreMultipleOpcode(int Opcode) {
case ARM::STR:
NumSTMGened++;
return ARM::STM;
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ NumLDMGened++;
+ return ARM::t2LDM;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ NumSTMGened++;
+ return ARM::t2STM;
case ARM::FLDS:
NumFLDMGened++;
return ARM::FLDMS;
@@ -119,14 +140,30 @@ static int getLoadStoreMultipleOpcode(int Opcode) {
case ARM::FSTD:
NumFSTMGened++;
return ARM::FSTMD;
- default: abort();
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
+static bool isT2i32Load(unsigned Opc) {
+ return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+static bool isi32Load(unsigned Opc) {
+ return Opc == ARM::LDR || isT2i32Load(Opc);
+}
+
+static bool isT2i32Store(unsigned Opc) {
+ return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+static bool isi32Store(unsigned Opc) {
+ return Opc == ARM::STR || isT2i32Store(Opc);
+}
+
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
-/// It returns true if the transformation is done.
+/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -140,14 +177,20 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
return false;
ARM_AM::AMSubMode Mode = ARM_AM::ia;
- bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
- if (isAM4 && Offset == 4)
+ bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
+ if (isAM4 && Offset == 4) {
+ if (isThumb2)
+ // Thumb2 does not support ldmib / stmib.
+ return false;
Mode = ARM_AM::ib;
- else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+ } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
+ if (isThumb2)
+ // Thumb2 does not support ldmda / stmda.
+ return false;
Mode = ARM_AM::da;
- else if (isAM4 && Offset == -4 * (int)NumRegs)
+ } else if (isAM4 && Offset == -4 * (int)NumRegs) {
Mode = ARM_AM::db;
- else if (Offset != 0) {
+ } else if (Offset != 0) {
// If starting offset isn't zero, insert a MI to materialize a new base.
// But only do so if it is cost effective, i.e. merging more than two
// loads / stores.
@@ -155,7 +198,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
return false;
unsigned NewBase;
- if (Opcode == ARM::LDR)
+ if (isi32Load(Opcode))
// If it is a load, then just use one of the destination registers
// as the new base.
NewBase = Regs[NumRegs-1].first;
@@ -165,24 +208,30 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
if (NewBase == 0)
return false;
}
- int BaseOpc = ARM::ADDri;
+ int BaseOpc = !isThumb2
+ ? ARM::ADDri
+ : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
if (Offset < 0) {
- BaseOpc = ARM::SUBri;
+ BaseOpc = !isThumb2
+ ? ARM::SUBri
+ : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
Offset = - Offset;
}
- int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+ int ImmedOffset = isThumb2
+ ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
if (ImmedOffset == -1)
+ // FIXME: Try t2ADDri12 or t2SUBri12?
return false; // Probably not worth it then.
BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(BaseKill)).addImm(ImmedOffset)
+ .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
.addImm(Pred).addReg(PredReg).addReg(0);
Base = NewBase;
BaseKill = true; // New base is always killed right after its use.
}
bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
- bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
Opcode = getLoadStoreMultipleOpcode(Opcode);
MachineInstrBuilder MIB = (isAM4)
? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
@@ -192,6 +241,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
.addReg(Base, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
.addImm(Pred).addReg(PredReg);
+ MIB.addReg(0); // Add optional writeback (0 for now).
for (unsigned i = 0; i != NumRegs; ++i)
MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
| getKillRegState(Regs[i].second));
@@ -207,7 +257,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned Scratch, MemOpQueue &MemOps,
SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
- bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+ bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
int Offset = MemOps[SIndex].Offset;
int SOffset = Offset;
unsigned Pos = MemOps[SIndex].Position;
@@ -265,41 +315,53 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
return;
}
-/// getInstrPredicate - If instruction is predicated, returns its predicate
-/// condition, otherwise returns AL. It also returns the condition code
-/// register by reference.
-static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
- int PIdx = MI->findFirstPredOperandIdx();
- if (PIdx == -1) {
- PredReg = 0;
- return ARMCC::AL;
- }
-
- PredReg = MI->getOperand(PIdx+1).getReg();
- return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
-}
-
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, ARMCC::CondCodes Pred,
- unsigned PredReg) {
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg){
unsigned MyPredReg = 0;
- return (MI && MI->getOpcode() == ARM::SUBri &&
- MI->getOperand(0).getReg() == Base &&
+ if (!MI)
+ return false;
+ if (MI->getOpcode() != ARM::t2SUBri &&
+ MI->getOpcode() != ARM::t2SUBrSPi &&
+ MI->getOpcode() != ARM::t2SUBrSPi12 &&
+ MI->getOpcode() != ARM::tSUBspi &&
+ MI->getOpcode() != ARM::SUBri)
+ return false;
+
+ // Make sure the offset fits in 8 bits.
+ if (Bytes <= 0 || (Limit && Bytes >= Limit))
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
+ return (MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
- ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
+ (MI->getOperand(2).getImm()*Scale) == Bytes &&
+ llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg);
}
static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, ARMCC::CondCodes Pred,
- unsigned PredReg) {
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg){
unsigned MyPredReg = 0;
- return (MI && MI->getOpcode() == ARM::ADDri &&
- MI->getOperand(0).getReg() == Base &&
+ if (!MI)
+ return false;
+ if (MI->getOpcode() != ARM::t2ADDri &&
+ MI->getOpcode() != ARM::t2ADDrSPi &&
+ MI->getOpcode() != ARM::t2ADDrSPi12 &&
+ MI->getOpcode() != ARM::tADDspi &&
+ MI->getOpcode() != ARM::ADDri)
+ return false;
+
+ if (Bytes <= 0 || (Limit && Bytes >= Limit))
+ // Make sure the offset fits in 8 bits.
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
+ return (MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
- ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
+ (MI->getOperand(2).getImm()*Scale) == Bytes &&
+ llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg);
}
@@ -308,6 +370,10 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
default: return 0;
case ARM::LDR:
case ARM::STR:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
case ARM::FLDS:
case ARM::FSTS:
return 4;
@@ -316,7 +382,9 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
return 8;
case ARM::LDM:
case ARM::STM:
- return (MI->getNumOperands() - 4) * 4;
+ case ARM::t2LDM:
+ case ARM::t2STM:
+ return (MI->getNumOperands() - 5) * 4;
case ARM::FLDMS:
case ARM::FSTMS:
case ARM::FLDMD:
@@ -325,7 +393,7 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
}
}
-/// mergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
+/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
@@ -337,17 +405,18 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
-static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(0).getReg();
unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
int Opcode = MI->getOpcode();
- bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+ bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
+ Opcode == ARM::STM || Opcode == ARM::t2STM;
if (isAM4) {
if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
@@ -364,13 +433,17 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+ MI->getOperand(4).setReg(Base);
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
} else if (Mode == ARM_AM::ib &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
}
@@ -379,8 +452,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
- isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -388,8 +463,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
MBB.erase(NextMBBI);
return true;
} else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
- isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -408,8 +485,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
MBB.erase(PrevMBBI);
return true;
}
@@ -418,8 +497,10 @@ static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if (Mode == ARM_AM::ia &&
- isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+ MI->getOperand(4).setReg(Base); // WB to base
+ MI->getOperand(4).setIsDef();
if (NextMBBI == I) {
Advance = true;
++I;
@@ -441,7 +522,13 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
- default: abort();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_PRE;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_PRE;
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
@@ -454,48 +541,62 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
- default: abort();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ return ARM::t2LDR_POST;
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return ARM::t2STR_POST;
+ default: llvm_unreachable("Unhandled opcode!");
}
return 0;
}
-/// mergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
+/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const TargetInstrInfo *TII,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const TargetInstrInfo *TII,
+ bool &Advance,
+ MachineBasicBlock::iterator &I) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(1).getReg();
bool BaseKill = MI->getOperand(1).isKill();
unsigned Bytes = getLSMultipleTransferSize(MI);
int Opcode = MI->getOpcode();
DebugLoc dl = MI->getDebugLoc();
+ bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
+ Opcode == ARM::FSTD || Opcode == ARM::FSTS;
bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
- if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
- (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+ if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
return false;
+ else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+ return false;
+ else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
+ if (MI->getOperand(2).getImm() != 0)
+ return false;
- bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (isLd && MI->getOperand(0).getReg() == Base)
return false;
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
bool DoMerge = false;
ARM_AM::AddrOpc AddSub = ARM_AM::add;
unsigned NewOpc = 0;
+ // Offset immediate limits: AM2 - 12 bits, Thumb2 - 8 bits.
+ unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
- if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+ if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
- } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
- Pred, PredReg)) {
+ } else if (!isAM5 &&
+ isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
}
@@ -505,11 +606,12 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
if (!DoMerge && MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
- if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ if (!isAM5 &&
+ isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
- } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+ } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
}
@@ -526,33 +628,51 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
return false;
bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
- unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
- : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
- true, isDPR ? 2 : 1);
+ unsigned Offset = 0;
+ if (isAM5)
+ Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
+ ? ARM_AM::db
+ : ARM_AM::ia, true, (isDPR ? 2 : 1));
+ else if (isAM2)
+ Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ else
+ Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
if (isLd) {
- if (isAM2)
- // LDR_PRE, LDR_POST;
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
- else
+ if (isAM5)
// FLDMS, FLDMD
BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
.addReg(Base, getKillRegState(BaseKill))
.addImm(Offset).addImm(Pred).addReg(PredReg)
+ .addReg(Base, getDefRegState(true)) // WB base register
.addReg(MI->getOperand(0).getReg(), RegState::Define);
- } else {
- MachineOperand &MO = MI->getOperand(0);
- if (isAM2)
- // STR_PRE, STR_POST;
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ else if (isAM2)
+ // LDR_PRE, LDR_POST,
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
.addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
else
+ // t2LDR_PRE, t2LDR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ MachineOperand &MO = MI->getOperand(0);
+ if (isAM5)
// FSTMS, FSTMD
BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
.addImm(Pred).addReg(PredReg)
+ .addReg(Base, getDefRegState(true)) // WB base register
.addReg(MO.getReg(), getKillRegState(MO.isKill()));
+ else if (isAM2)
+ // STR_PRE, STR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ else
+ // t2STR_PRE, t2STR_POST
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
}
MBB.erase(MBBI);
@@ -561,7 +681,7 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
/// isMemoryOp - Returns true if the instruction is a memory operation (that this
/// pass is capable of operating on).
-static bool isMemoryOp(MachineInstr *MI) {
+static bool isMemoryOp(const MachineInstr *MI) {
int Opcode = MI->getOpcode();
switch (Opcode) {
default: break;
@@ -574,6 +694,11 @@ static bool isMemoryOp(MachineInstr *MI) {
case ARM::FLDD:
case ARM::FSTD:
return MI->getOperand(1).isReg();
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ return MI->getOperand(1).isReg();
}
return false;
}
@@ -600,6 +725,12 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
unsigned NumOperands = MI->getDesc().getNumOperands();
unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+ if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+ Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+ Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
+ return OffField;
+
int Offset = isAM2
? ARM_AM::getAM2Offset(OffField)
: (isAM3 ? ARM_AM::getAM3Offset(OffField)
@@ -621,37 +752,43 @@ static void InsertLDR_STR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
int OffImm, bool isDef,
DebugLoc dl, unsigned NewOpc,
- unsigned Reg, bool RegDeadKill,
- unsigned BaseReg, bool BaseKill,
- unsigned OffReg, bool OffKill,
+ unsigned Reg, bool RegDeadKill, bool RegUndef,
+ unsigned BaseReg, bool BaseKill, bool BaseUndef,
+ unsigned OffReg, bool OffKill, bool OffUndef,
ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo *TII) {
- unsigned Offset;
- if (OffImm < 0)
- Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
- else
- Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
- if (isDef)
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+ const TargetInstrInfo *TII, bool isT2) {
+ int Offset = OffImm;
+ if (!isT2) {
+ if (OffImm < 0)
+ Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
+ else
+ Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
+ }
+ if (isDef) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
.addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
- .addReg(BaseReg, getKillRegState(BaseKill))
- .addReg(OffReg, getKillRegState(OffKill))
- .addImm(Offset)
- .addImm(Pred).addReg(PredReg);
- else
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
- .addReg(Reg, getKillRegState(RegDeadKill))
- .addReg(BaseReg, getKillRegState(BaseKill))
- .addReg(OffReg, getKillRegState(OffKill))
- .addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ if (!isT2)
+ MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(NewOpc))
+ .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+ .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+ if (!isT2)
+ MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
}
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr *MI = &*MBBI;
unsigned Opcode = MI->getOpcode();
- if (Opcode == ARM::LDRD || Opcode == ARM::STRD) {
+ if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
+ Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
unsigned EvenReg = MI->getOperand(0).getReg();
unsigned OddReg = MI->getOperand(1).getReg();
unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
@@ -659,45 +796,59 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
return false;
- bool isLd = Opcode == ARM::LDRD;
+ bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+ bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
bool EvenDeadKill = isLd ?
MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+ bool EvenUndef = MI->getOperand(0).isUndef();
bool OddDeadKill = isLd ?
MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+ bool OddUndef = MI->getOperand(1).isUndef();
const MachineOperand &BaseOp = MI->getOperand(2);
unsigned BaseReg = BaseOp.getReg();
bool BaseKill = BaseOp.isKill();
- const MachineOperand &OffOp = MI->getOperand(3);
- unsigned OffReg = OffOp.getReg();
- bool OffKill = OffOp.isKill();
+ bool BaseUndef = BaseOp.isUndef();
+ unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
+ bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+ bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
int OffImm = getMemoryOpOffset(MI);
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
// Ascending register numbers and no offset. It's safe to change it to a
// ldm or stm.
- unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDM : ARM::STM;
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? ARM::t2LDM : ARM::LDM)
+ : (isT2 ? ARM::t2STM : ARM::STM);
if (isLd) {
BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
.addReg(BaseReg, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
.addImm(Pred).addReg(PredReg)
+ .addReg(0)
.addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
- .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+ .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
++NumLDRD2LDM;
} else {
BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
.addReg(BaseReg, getKillRegState(BaseKill))
.addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
.addImm(Pred).addReg(PredReg)
- .addReg(EvenReg, getKillRegState(EvenDeadKill))
- .addReg(OddReg, getKillRegState(OddDeadKill));
+ .addReg(0)
+ .addReg(EvenReg,
+ getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+ .addReg(OddReg,
+ getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
++NumSTRD2STM;
}
} else {
// Split into two instructions.
- unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDR : ARM::STR;
+ assert((!isT2 || !OffReg) &&
+ "Thumb2 ldrd / strd does not encode offset register!");
+ unsigned NewOpc = (isLd)
+ ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
+ : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
DebugLoc dl = MBBI->getDebugLoc();
// If this is a load and the base register is killed, it may have been
// re-defed by the load; make sure the first load does not clobber it.
@@ -707,17 +858,23 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
(OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
assert(!TRI->regsOverlap(OddReg, BaseReg) &&
(!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
- InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill,
- BaseReg, false, OffReg, false, Pred, PredReg, TII);
- InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill,
- BaseReg, BaseKill, OffReg, OffKill, Pred, PredReg, TII);
+ InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
+ OddReg, OddDeadKill, false,
+ BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+ Pred, PredReg, TII, isT2);
+ InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+ EvenReg, EvenDeadKill, false,
+ BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
} else {
InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
- EvenReg, EvenDeadKill, BaseReg, false, OffReg, false,
- Pred, PredReg, TII);
+ EvenReg, EvenDeadKill, EvenUndef,
+ BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+ Pred, PredReg, TII, isT2);
InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
- OddReg, OddDeadKill, BaseReg, BaseKill, OffReg, OffKill,
- Pred, PredReg, TII);
+ OddReg, OddDeadKill, OddUndef,
+ BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+ Pred, PredReg, TII, isT2);
}
if (isLd)
++NumLDRD2LDR;
@@ -761,7 +918,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
unsigned Size = getLSMultipleTransferSize(MBBI);
unsigned Base = MBBI->getOperand(1).getReg();
unsigned PredReg = 0;
- ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+ ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
int Offset = getMemoryOpOffset(MBBI);
// Watch out for:
// r4 := ldr [r5]
@@ -772,7 +929,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// looks like the later ldr(s) use the same base register. Try to
// merge the ldr's so far, including this one. But don't try to
// combine the following ldr(s).
- Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+ Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
if (CurrBase == 0 && !Clobber) {
// Start of a new chain.
CurrBase = Base;
@@ -825,12 +982,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// Try to find a free register to use as a new base in case it's needed.
// First advance to the instruction just before the start of the chain.
AdvanceRS(MBB, MemOps);
- // Find a scratch register. Make sure it's a call clobbered register or
- // a spilled callee-saved register.
- unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
- if (!Scratch)
- Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
- AFI->getSpilledCSRegisters());
+ // Find a scratch register.
+ unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
// Process the load / store instructions.
RS->forward(prior(MBBI));
@@ -842,7 +995,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
      // Try folding preceding/trailing base inc/dec into the generated
// LDM/STM ops.
for (unsigned i = 0, e = Merges.size(); i < e; ++i)
- if (mergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
+ if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
++NumMerges;
NumMerges += Merges.size();
@@ -850,15 +1003,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// that were not merged to form LDM/STM ops.
for (unsigned i = 0; i != NumMemOps; ++i)
if (!MemOps[i].Merged)
- if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+ if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
++NumMerges;
- // RS may be pointing to an instruction that's deleted.
+ // RS may be pointing to an instruction that's deleted.
RS->skipTo(prior(MBBI));
} else if (NumMemOps == 1) {
      // Try folding preceding/trailing base inc/dec into the single
// load/store.
- if (mergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+ if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
++NumMerges;
RS->forward(prior(MBBI));
}
@@ -907,16 +1060,18 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
if (MBB.empty()) return false;
MachineBasicBlock::iterator MBBI = prior(MBB.end());
- if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+ if (MBBI != MBB.begin() &&
+ (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
MachineInstr *PrevMI = prior(MBBI);
- if (PrevMI->getOpcode() == ARM::LDM) {
+ if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
- if (MO.getReg() == ARM::LR) {
- PrevMI->setDesc(TII->get(ARM::LDM_RET));
- MO.setReg(ARM::PC);
- MBB.erase(MBBI);
- return true;
- }
+ if (MO.getReg() != ARM::LR)
+ return false;
+ unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
+ PrevMI->setDesc(TII->get(NewOpc));
+ MO.setReg(ARM::PC);
+ MBB.erase(MBBI);
+ return true;
}
}
return false;
@@ -928,6 +1083,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = TM.getInstrInfo();
TRI = TM.getRegisterInfo();
RS = new RegScavenger();
+ isThumb2 = AFI->isThumb2Function();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -956,6 +1112,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
+ MachineFunction *MF;
virtual bool runOnMachineFunction(MachineFunction &Fn);
@@ -967,8 +1124,9 @@ namespace {
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
unsigned &OddReg, unsigned &BaseReg,
- unsigned &OffReg, unsigned &Offset,
- unsigned &PredReg, ARMCC::CondCodes &Pred);
+ unsigned &OffReg, int &Offset,
+ unsigned &PredReg, ARMCC::CondCodes &Pred,
+ bool &isT2);
bool RescheduleOps(MachineBasicBlock *MBB,
SmallVector<MachineInstr*, 4> &Ops,
unsigned Base, bool isLd,
@@ -984,6 +1142,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = Fn.getTarget().getRegisterInfo();
STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
MRI = &Fn.getRegInfo();
+ MF = &Fn;
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -1045,48 +1204,83 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
unsigned &OddReg, unsigned &BaseReg,
- unsigned &OffReg, unsigned &Offset,
+ unsigned &OffReg, int &Offset,
unsigned &PredReg,
- ARMCC::CondCodes &Pred) {
+ ARMCC::CondCodes &Pred,
+ bool &isT2) {
+ // Make sure we're allowed to generate LDRD/STRD.
+ if (!STI->hasV5TEOps())
+ return false;
+
// FIXME: FLDS / FSTS -> FLDD / FSTD
+ unsigned Scale = 1;
unsigned Opcode = Op0->getOpcode();
if (Opcode == ARM::LDR)
NewOpc = ARM::LDRD;
else if (Opcode == ARM::STR)
NewOpc = ARM::STRD;
- else
- return 0;
+ else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+ NewOpc = ARM::t2LDRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+ NewOpc = ARM::t2STRDi8;
+ Scale = 4;
+ isT2 = true;
+ } else
+ return false;
+
+ // Make sure the offset registers match.
+ if (!isT2 &&
+ (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
+ return false;
  // Make sure the base address satisfies the i64 ld / st alignment requirement.
if (!Op0->hasOneMemOperand() ||
- !Op0->memoperands_begin()->getValue() ||
- Op0->memoperands_begin()->isVolatile())
+ !(*Op0->memoperands_begin())->getValue() ||
+ (*Op0->memoperands_begin())->isVolatile())
return false;
- unsigned Align = Op0->memoperands_begin()->getAlignment();
+ unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+ Function *Func = MF->getFunction();
unsigned ReqAlign = STI->hasV6Ops()
- ? TD->getPrefTypeAlignment(Type::Int64Ty) : 8; // Pre-v6 need 8-byte align
+ ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 needs 8-byte alignment
if (Align < ReqAlign)
return false;
// Then make sure the immediate offset fits.
int OffImm = getMemoryOpOffset(Op0);
- ARM_AM::AddrOpc AddSub = ARM_AM::add;
- if (OffImm < 0) {
- AddSub = ARM_AM::sub;
- OffImm = - OffImm;
+ if (isT2) {
+ if (OffImm < 0) {
+ if (OffImm < -255)
+ // Can't fall back to t2LDRi8 / t2STRi8.
+ return false;
+ } else {
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm & (Scale-1)))
+ return false;
+ }
+ Offset = OffImm;
+ } else {
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ if (OffImm < 0) {
+ AddSub = ARM_AM::sub;
+ OffImm = - OffImm;
+ }
+ int Limit = (1 << 8) * Scale;
+ if (OffImm >= Limit || (OffImm & (Scale-1)))
+ return false;
+ Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
}
- if (OffImm >= 256) // 8 bits
- return false;
- Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
-
EvenReg = Op0->getOperand(0).getReg();
OddReg = Op1->getOperand(0).getReg();
if (EvenReg == OddReg)
return false;
BaseReg = Op0->getOperand(1).getReg();
- OffReg = Op0->getOperand(2).getReg();
- Pred = getInstrPredicate(Op0, PredReg);
+ if (!isT2)
+ OffReg = Op0->getOperand(2).getReg();
+ Pred = llvm::getInstrPredicate(Op0, PredReg);
dl = Op0->getDebugLoc();
return true;
}
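
The offset check above is the subtle part of CanFormLdStDWord: Thumb2 LDRD/STRD takes an 8-bit immediate scaled by 4 (with only the small negative range -255..-1 rescuable via the unscaled t2LDRi8 / t2STRi8 forms), while the ARM encoding packs an add/sub flag plus an 8-bit magnitude through ARM_AM::getAM3Opc. A minimal standalone sketch of just that range test, transcribed from the hunk above (the helper name is hypothetical):

// Sketch: can an immediate offset be encoded by a dword load/store pair?
// Logic follows the isT2 / AM3 branches in CanFormLdStDWord above.
static bool offsetFitsLdStD(bool isT2, int OffImm) {
  const int Scale = isT2 ? 4 : 1;      // t2LDRDi8 scales by 4; ARM AM3 does not
  const int Limit = (1 << 8) * Scale;  // 8-bit magnitude field
  if (isT2 && OffImm < 0)
    return OffImm >= -255;             // fall back to t2LDRi8 / t2STRi8 range
  int Mag = OffImm < 0 ? -OffImm : OffImm;
  return Mag < Limit && (Mag & (Scale - 1)) == 0;
}
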
@@ -1138,7 +1332,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
LastOffset = Offset;
LastBytes = Bytes;
LastOpcode = Opcode;
- if (++NumMove == 8) // FIXME: Tune
+ if (++NumMove == 8) // FIXME: Tune this limit.
break;
}
@@ -1174,29 +1368,36 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
unsigned EvenReg = 0, OddReg = 0;
unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
ARMCC::CondCodes Pred = ARMCC::AL;
+ bool isT2 = false;
unsigned NewOpc = 0;
- unsigned Offset = 0;
+ int Offset = 0;
DebugLoc dl;
if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
EvenReg, OddReg, BaseReg, OffReg,
- Offset, PredReg, Pred)) {
+ Offset, PredReg, Pred, isT2)) {
Ops.pop_back();
Ops.pop_back();
// Form the pair instruction.
if (isLd) {
- BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+ MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+ dl, TII->get(NewOpc))
.addReg(EvenReg, RegState::Define)
.addReg(OddReg, RegState::Define)
- .addReg(BaseReg).addReg(0).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg);
+ if (!isT2)
+ MIB.addReg(OffReg);
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
++NumLDRDFormed;
} else {
- BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
+ MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+ dl, TII->get(NewOpc))
.addReg(EvenReg)
.addReg(OddReg)
- .addReg(BaseReg).addReg(0).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ .addReg(BaseReg);
+ if (!isT2)
+ MIB.addReg(OffReg);
+ MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
++NumSTRDFormed;
}
MBB->erase(Op0);
@@ -1249,12 +1450,11 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
if (!isMemoryOp(MI))
continue;
unsigned PredReg = 0;
- if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+ if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
- int Opcode = MI->getOpcode();
- bool isLd = Opcode == ARM::LDR ||
- Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+ int Opc = MI->getOpcode();
+ bool isLd = isi32Load(Opc) || Opc == ARM::FLDS || Opc == ARM::FLDD;
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
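
One note on the Clobber change above: the test now keys off isi32Load rather than comparing against ARM::LDR directly, so Thumb2 loads participate in merge chains too. Distilled, the condition the comment describes (r5 := ldr [r5] must terminate the current chain) looks like this sketch, reusing the patch's isi32Load helper; the wrapper name is illustrative:

#include "llvm/CodeGen/MachineInstr.h"

// Does this i32 load redefine its own base register? If so, the chain of
// mergeable memory ops ends here and the collected ops are flushed.
static bool loadClobbersBase(const llvm::MachineInstr *MI, int Opcode) {
  if (!isi32Load(Opcode))                      // stores cannot redefine the base
    return false;
  unsigned Dst  = MI->getOperand(0).getReg();  // loaded register
  unsigned Base = MI->getOperand(1).getReg();  // base address register
  return Dst == Base;                          // e.g. r5 := ldr [r5]
}
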
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
new file mode 100644
index 0000000..0ff65d2
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -0,0 +1,72 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+using namespace llvm;
+
+static const char *const arm_asm_table[] = {
+ "{r0}", "r0",
+ "{r1}", "r1",
+ "{r2}", "r2",
+ "{r3}", "r3",
+ "{r4}", "r4",
+ "{r5}", "r5",
+ "{r6}", "r6",
+ "{r7}", "r7",
+ "{r8}", "r8",
+ "{r9}", "r9",
+ "{r10}", "r10",
+ "{r11}", "r11",
+ "{r12}", "r12",
+ "{r13}", "r13",
+ "{r14}", "r14",
+ "{lr}", "lr",
+ "{sp}", "sp",
+ "{ip}", "ip",
+ "{fp}", "fp",
+ "{sl}", "sl",
+ "{memory}", "memory",
+ "{cc}", "cc",
+ 0,0
+};
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+ AsmTransCBE = arm_asm_table;
+ Data64bitsDirective = 0;
+ CommentString = "@";
+ COMMDirectiveTakesAlignment = false;
+ SupportsDebugInformation = true;
+
+  // Exception handling
+ ExceptionsType = ExceptionHandling::SjLj;
+ AbsoluteEHSectionOffsets = false;
+}
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+ AlignmentIsInBytes = false;
+ Data64bitsDirective = 0;
+ CommentString = "@";
+ COMMDirectiveTakesAlignment = false;
+
+ NeedsSet = false;
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ LCOMMDirective = "\t.lcomm\t";
+
+ DwarfRequiresFrameSection = false;
+
+ SupportsDebugInformation = true;
+}
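
For readers unfamiliar with AsmTransCBE: arm_asm_table above is a flat array of alternating {constraint}, replacement strings terminated by a null pair. A hypothetical lookup (not part of this commit) shows how such a table is consumed:

#include <cstring>

// Walk the alternating {alias}/name pairs; the table ends with a null pair.
static const char *lookupAsmAlias(const char *const *Table, const char *Key) {
  for (unsigned i = 0; Table[i]; i += 2)
    if (std::strcmp(Table[i], Key) == 0)
      return Table[i + 1];   // e.g. "{r0}" -> "r0"
  return 0;                  // no translation; emit the operand as-is
}
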
diff --git a/lib/Target/ARM/ARMMCAsmInfo.h b/lib/Target/ARM/ARMMCAsmInfo.h
new file mode 100644
index 0000000..90f7822
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- ARMMCAsmInfo.h - ARM asm properties ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+ struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit ARMMCAsmInfoDarwin();
+ };
+
+ struct ARMELFMCAsmInfo : public MCAsmInfo {
+ explicit ARMELFMCAsmInfo();
+ };
+
+} // namespace llvm
+
+#endif
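
The two structs are selected per target triple. As a sketch of the expected usage (the actual factory wiring lives elsewhere in the target, so treat the function below as an assumption, not code from this commit):

#include "ARMMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
using namespace llvm;

// Hypothetical factory: Darwin gets the SjLj-EH asm properties, everything
// else the ELF directives (.weak/.set/.lcomm) configured above.
static MCAsmInfo *createARMMCAsmInfo(const Triple &TheTriple) {
  if (TheTriple.getOS() == Triple::Darwin)
    return new ARMMCAsmInfoDarwin();
  return new ARMELFMCAsmInfo();
}
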
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 66d3df6..2176b27 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -1,10 +1,10 @@
//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file declares ARM-specific per-machine-function information.
@@ -52,10 +52,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// enable far jump.
bool LRSpilledForFarJump;
- /// R3IsLiveIn - True if R3 is live in to this function.
- /// FIXME: Remove when register scavenger for Thumb is done.
- bool R3IsLiveIn;
-
/// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
/// spill stack offset.
unsigned FramePtrSpillOffset;
@@ -100,7 +96,7 @@ public:
hasThumb2(false),
Align(2U),
VarArgsRegSaveSize(0), HasStackFrame(false),
- LRSpilledForFarJump(false), R3IsLiveIn(false),
+ LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
@@ -111,7 +107,7 @@ public:
hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
Align(isThumb ? 1U : 2U),
VarArgsRegSaveSize(0), HasStackFrame(false),
- LRSpilledForFarJump(false), R3IsLiveIn(false),
+ LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
@@ -119,6 +115,7 @@ public:
JumpTableUId(0), ConstPoolEntryUId(0) {}
bool isThumbFunction() const { return isThumb; }
+ bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
bool isThumb2Function() const { return isThumb && hasThumb2; }
unsigned getAlign() const { return Align; }
@@ -133,13 +130,9 @@ public:
bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
- // FIXME: Remove when register scavenger for Thumb is done.
- bool isR3LiveIn() const { return R3IsLiveIn; }
- void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
-
unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
-
+
unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
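
The new isThumb1OnlyFunction predicate completes a three-way partition of functions by ISA mode. Distilled into a standalone sketch (the real predicates live on ARMFunctionInfo, shown above):

// Two flags give three mutually exclusive modes.
struct ARMModeFlags {
  bool isThumb;    // function is compiled in Thumb state
  bool hasThumb2;  // subtarget supports Thumb2 encodings
  bool isARM()        const { return !isThumb; }
  bool isThumb1Only() const { return isThumb && !hasThumb2; }  // new in this patch
  bool isThumb2()     const { return isThumb && hasThumb2; }
};
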
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..5ff7c38
--- /dev/null
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using NEON instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
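
For orientation before the data: each table slot corresponds to a 4-element result tuple <a,b,c,d> in which every element is 0-7 or u (undef), hence 9^4 = 6561 entries. How a shuffle mask maps to a slot, as a sketch inferred from those labels rather than code from this file:

// Base-9 positional index; 8 encodes 'u' (undef) in the comments below.
static unsigned perfectShuffleIndex(const int Mask[4]) {
  unsigned Idx = 0;
  for (unsigned i = 0; i != 4; ++i) {
    unsigned Elt = Mask[i] < 0 ? 8u : unsigned(Mask[i]);
    Idx = Idx * 9 + Elt;
  }
  return Idx;  // PerfectShuffleTable[Idx] packs cost and operand info
}
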
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
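
These entries are more than annotated costs: each 32-bit value packs a saturated cost together with a recipe for synthesizing the shuffle. Below is a minimal sketch of the decoding, assuming the field layout consumed by GeneratePerfectShuffle in ARMISelLowering.cpp (cost in the top two bits, then a 4-bit opcode, then two 13-bit IDs naming the left/right sub-shuffles to expand recursively); decodePFEntry and pfTableIndex are illustrative names, not LLVM API:

// Sketch: unpacking one PerfectShuffleTable entry.
#include <cstdint>

struct PFEntryFields {
  unsigned Cost;   // 2-bit saturated cost used to gate table-driven expansion
  unsigned OpNum;  // which op to emit (copy, vrev, vdup, vext, vzip, ...)
  unsigned LHSID;  // table ID describing the left operand's sub-shuffle
  unsigned RHSID;  // table ID describing the right operand's sub-shuffle
};

static PFEntryFields decodePFEntry(uint32_t PFEntry) {
  PFEntryFields F;
  F.Cost  =  PFEntry >> 30;
  F.OpNum = (PFEntry >> 26) & 0x0F;
  F.LHSID = (PFEntry >> 13) & ((1u << 13) - 1);
  F.RHSID =  PFEntry        & ((1u << 13) - 1);
  return F;
}

// The table is indexed by the four result-element selectors, each 0-7 for a
// concrete element or 8 for 'u' (undef), packed base-9; e.g. <u,6,3,4> maps
// to 8*9*9*9 + 6*9*9 + 3*9 + 4.
static unsigned pfTableIndex(unsigned S0, unsigned S1, unsigned S2, unsigned S3) {
  return S0 * 9 * 9 * 9 + S1 * 9 * 9 + S2 * 9 + S3;
}
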
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index f809f37..d5bc3f6 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
@@ -26,6 +27,7 @@
#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -33,1370 +35,7 @@
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
-unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
- using namespace ARM;
- switch (RegEnum) {
- case R0: case S0: case D0: return 0;
- case R1: case S1: case D1: return 1;
- case R2: case S2: case D2: return 2;
- case R3: case S3: case D3: return 3;
- case R4: case S4: case D4: return 4;
- case R5: case S5: case D5: return 5;
- case R6: case S6: case D6: return 6;
- case R7: case S7: case D7: return 7;
- case R8: case S8: case D8: return 8;
- case R9: case S9: case D9: return 9;
- case R10: case S10: case D10: return 10;
- case R11: case S11: case D11: return 11;
- case R12: case S12: case D12: return 12;
- case SP: case S13: case D13: return 13;
- case LR: case S14: case D14: return 14;
- case PC: case S15: case D15: return 15;
- case S16: return 16;
- case S17: return 17;
- case S18: return 18;
- case S19: return 19;
- case S20: return 20;
- case S21: return 21;
- case S22: return 22;
- case S23: return 23;
- case S24: return 24;
- case S25: return 25;
- case S26: return 26;
- case S27: return 27;
- case S28: return 28;
- case S29: return 29;
- case S30: return 30;
- case S31: return 31;
- default:
- assert(0 && "Unknown ARM register!");
- abort();
- }
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
- bool &isSPVFP) {
- isSPVFP = false;
-
- using namespace ARM;
- switch (RegEnum) {
- default:
- assert(0 && "Unknown ARM register!");
- abort();
- case R0: case D0: return 0;
- case R1: case D1: return 1;
- case R2: case D2: return 2;
- case R3: case D3: return 3;
- case R4: case D4: return 4;
- case R5: case D5: return 5;
- case R6: case D6: return 6;
- case R7: case D7: return 7;
- case R8: case D8: return 8;
- case R9: case D9: return 9;
- case R10: case D10: return 10;
- case R11: case D11: return 11;
- case R12: case D12: return 12;
- case SP: case D13: return 13;
- case LR: case D14: return 14;
- case PC: case D15: return 15;
-
- case S0: case S1: case S2: case S3:
- case S4: case S5: case S6: case S7:
- case S8: case S9: case S10: case S11:
- case S12: case S13: case S14: case S15:
- case S16: case S17: case S18: case S19:
- case S20: case S21: case S22: case S23:
- case S24: case S25: case S26: case S27:
- case S28: case S29: case S30: case S31: {
- isSPVFP = true;
- switch (RegEnum) {
- default: return 0; // Avoid compile time warning.
- case S0: return 0;
- case S1: return 1;
- case S2: return 2;
- case S3: return 3;
- case S4: return 4;
- case S5: return 5;
- case S6: return 6;
- case S7: return 7;
- case S8: return 8;
- case S9: return 9;
- case S10: return 10;
- case S11: return 11;
- case S12: return 12;
- case S13: return 13;
- case S14: return 14;
- case S15: return 15;
- case S16: return 16;
- case S17: return 17;
- case S18: return 18;
- case S19: return 19;
- case S20: return 20;
- case S21: return 21;
- case S22: return 22;
- case S23: return 23;
- case S24: return 24;
- case S25: return 25;
- case S26: return 26;
- case S27: return 27;
- case S28: return 28;
- case S29: return 29;
- case S30: return 30;
- case S31: return 31;
- }
- }
- }
-}
-
-ARMBaseRegisterInfo::ARMBaseRegisterInfo(const TargetInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
- TII(tii), STI(sti),
- FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
-}
-
-ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii,
+ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
-
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
-}
-
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
-}
-
-/// emitLoadConstPool - Emits a load from constpool to materialize the
-/// specified immediate.
-void ARMRegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo *TII, DebugLoc dl,
- unsigned DestReg, int Val,
- ARMCC::CondCodes Pred,
- unsigned PredReg) const {
- MachineFunction &MF = *MBB.getParent();
- MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
-
- BuildMI(MBB, MBBI, dl, TII->get(ARM::LDRcp), DestReg)
- .addConstantPoolIndex(Idx)
- .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
-}
-
-const unsigned*
-ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- static const unsigned CalleeSavedRegs[] = {
- ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
- ARM::R7, ARM::R6, ARM::R5, ARM::R4,
-
- ARM::D15, ARM::D14, ARM::D13, ARM::D12,
- ARM::D11, ARM::D10, ARM::D9, ARM::D8,
- 0
- };
-
- static const unsigned DarwinCalleeSavedRegs[] = {
- // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved
- // register.
- ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
- ARM::R11, ARM::R10, ARM::R8,
-
- ARM::D15, ARM::D14, ARM::D13, ARM::D12,
- ARM::D11, ARM::D10, ARM::D9, ARM::D8,
- 0
- };
- return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
-}
-
-const TargetRegisterClass* const *
-ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
- static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
- &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
- &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
- &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
- &ARM::GPRRegClass, &ARM::GPRRegClass,
-
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
- 0
- };
-
- if (STI.isThumb()) {
- return STI.isTargetDarwin()
- ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
- }
- return STI.isTargetDarwin()
- ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
-}
-
-BitVector ARMBaseRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- // FIXME: avoid re-calculating this every time.
- BitVector Reserved(getNumRegs());
- Reserved.set(ARM::SP);
- Reserved.set(ARM::PC);
- if (STI.isTargetDarwin() || hasFP(MF))
- Reserved.set(FramePtr);
- // Some targets reserve R9.
- if (STI.isR9Reserved())
- Reserved.set(ARM::R9);
- return Reserved;
-}
-
-bool
-ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const {
- switch (Reg) {
- default: break;
- case ARM::SP:
- case ARM::PC:
- return true;
- case ARM::R7:
- case ARM::R11:
- if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
- return true;
- break;
- case ARM::R9:
- return STI.isR9Reserved();
- }
-
- return false;
-}
-
-const TargetRegisterClass *ARMBaseRegisterInfo::getPointerRegClass() const {
- return &ARM::GPRRegClass;
-}
-
-/// getAllocationOrder - Returns the register allocation order for a specified
-/// register class in the form of a pair of TargetRegisterClass iterators.
-std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
-ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
- unsigned HintType, unsigned HintReg,
- const MachineFunction &MF) const {
- // Alternative register allocation orders when favoring even / odd registers
- // of register pairs.
-
- // No FP, R9 is available.
- static const unsigned GPREven1[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7,
- ARM::R9, ARM::R11
- };
- static const unsigned GPROdd1[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
- ARM::R8, ARM::R10
- };
-
- // FP is R7, R9 is available.
- static const unsigned GPREven2[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6,
- ARM::R9, ARM::R11
- };
- static const unsigned GPROdd2[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
- ARM::R8, ARM::R10
- };
-
- // FP is R11, R9 is available.
- static const unsigned GPREven3[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8,
- ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7,
- ARM::R9
- };
- static const unsigned GPROdd3[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9,
- ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7,
- ARM::R8
- };
-
- // No FP, R9 is not available.
- static const unsigned GPREven4[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8,
- ARM::R11
- };
- static const unsigned GPROdd4[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
- ARM::R10
- };
-
- // FP is R7, R9 is not available.
- static const unsigned GPREven5[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R10,
- ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8,
- ARM::R11
- };
- static const unsigned GPROdd5[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R11,
- ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
- ARM::R10
- };
-
- // FP is R11, R9 is not available.
- static const unsigned GPREven6[] = {
- ARM::R0, ARM::R2, ARM::R4, ARM::R6,
- ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8
- };
- static const unsigned GPROdd6[] = {
- ARM::R1, ARM::R3, ARM::R5, ARM::R7,
- ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8
- };
-
-
- if (HintType == ARMRI::RegPairEven) {
- if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
- // It's no longer possible to fulfill this hint. Return the default
- // allocation order.
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-
- if (!STI.isTargetDarwin() && !hasFP(MF)) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven1,
- GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven4,
- GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
- } else if (FramePtr == ARM::R7) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven2,
- GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven5,
- GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
- } else { // FramePtr == ARM::R11
- if (!STI.isR9Reserved())
- return std::make_pair(GPREven3,
- GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
- else
- return std::make_pair(GPREven6,
- GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
- }
- } else if (HintType == ARMRI::RegPairOdd) {
- if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
- // It's no longer possible to fulfill this hint. Return the default
- // allocation order.
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-
- if (!STI.isTargetDarwin() && !hasFP(MF)) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd1,
- GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd4,
- GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
- } else if (FramePtr == ARM::R7) {
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd2,
- GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd5,
- GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
- } else { // FramePtr == ARM::R11
- if (!STI.isR9Reserved())
- return std::make_pair(GPROdd3,
- GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
- else
- return std::make_pair(GPROdd6,
- GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
- }
- }
- return std::make_pair(RC->allocation_order_begin(MF),
- RC->allocation_order_end(MF));
-}
-
-/// ResolveRegAllocHint - Resolves the specified register allocation hint
-/// to a physical register. Returns the physical register if it is successful.
-unsigned
-ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg,
- const MachineFunction &MF) const {
- if (Reg == 0 || !isPhysicalRegister(Reg))
- return 0;
- if (Type == 0)
- return Reg;
- else if (Type == (unsigned)ARMRI::RegPairOdd)
- // Odd register.
- return getRegisterPairOdd(Reg, MF);
- else if (Type == (unsigned)ARMRI::RegPairEven)
- // Even register.
- return getRegisterPairEven(Reg, MF);
- return 0;
-}
-
-void
-ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const {
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
- if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
- Hint.first == (unsigned)ARMRI::RegPairEven) &&
- Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
- // If 'Reg' is one half of an even / odd register pair and has now been
- // changed (e.g. coalesced) into a different register, the allocation
- // hint of the other register of the pair must be updated to reflect the
- // relationship change.
- unsigned OtherReg = Hint.second;
- Hint = MRI->getRegAllocationHint(OtherReg);
- if (Hint.second == Reg)
- // Make sure the pair has not already divorced.
- MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
- }
-}
-
-bool
-ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return true;
-}
-
-/// hasFP - Return true if the specified function should have a dedicated frame
-/// pointer register. This is true if the function has variable sized allocas
-/// or if frame pointer elimination is disabled.
-///
-bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- return (NoFramePointerElim ||
- MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken());
-}
-
-// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
-// not required, we reserve argument space for call sites in the function
-// immediately on entry to the current function. This eliminates the need for
-// add/sub sp brackets around call sites. Returns true if the call frame is
-// included as part of the stack frame.
-bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- unsigned CFSize = FFI->getMaxCallFrameSize();
- // It's not always a good idea to include the call frame as part of the
- // stack frame. ARM (especially Thumb) has only small immediate offsets for
- // addressing the stack frame, so a large call frame can cause poor codegen
- // and may even make it impossible to scavenge a register.
- if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
- return false;
-
- return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-/// emitARMRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in ARM code.
-static
-void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
- const TargetInstrInfo &TII,
- DebugLoc dl) {
- bool isSub = NumBytes < 0;
- if (isSub) NumBytes = -NumBytes;
-
- while (NumBytes) {
- unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
- unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
- assert(ThisVal && "Didn't extract field correctly");
-
- // We will handle these bits from the offset; clear them.
- NumBytes &= ~ThisVal;
-
- // Get the properly encoded SOImmVal field.
- int SOImmVal = ARM_AM::getSOImmVal(ThisVal);
- assert(SOImmVal != -1 && "Bit extraction didn't work?");
-
- // Build the new ADD / SUB.
- BuildMI(MBB, MBBI, dl, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(SOImmVal)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
- BaseReg = DestReg;
- }
-}
-
-static void
-emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- int NumBytes,
- ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
- emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes,
- Pred, PredReg, TII, dl);
-}
-
-void ARMRegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- if (!hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
- // ADJCALLSTACKDOWN -> sub, sp, sp, amount
- // ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
- if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
- unsigned PredReg = Old->getOperand(2).getReg();
- emitSPUpdate(MBB, I, TII, dl, -Amount, Pred, PredReg);
- } else {
- // Note: PredReg is operand 3 for ADJCALLSTACKUP.
- unsigned PredReg = Old->getOperand(3).getReg();
- assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
- emitSPUpdate(MBB, I, TII, dl, Amount, Pred, PredReg);
- }
- }
- }
- MBB.erase(I);
-}
-
-/// findScratchRegister - Find a 'free' ARM register. If the register
-/// scavenger is not being used, R12 is available. Otherwise, try for a
-/// call-clobbered register first and then a spilled callee-saved register
-/// if that fails.
-static
-unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
- ARMFunctionInfo *AFI) {
- unsigned Reg = RS ? RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12;
- assert (!AFI->isThumbFunction());
- if (Reg == 0)
- // Try an already-spilled CS register.
- Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters());
-
- return Reg;
-}
-
-void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
- unsigned i = 0;
- MachineInstr &MI = *II;
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc dl = MI.getDebugLoc();
-
- while (!MI.getOperand(i).isFI()) {
- ++i;
- assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
- }
-
- unsigned FrameReg = ARM::SP;
- int FrameIndex = MI.getOperand(i).getIndex();
- int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
- MF.getFrameInfo()->getStackSize() + SPAdj;
-
- if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea2Offset();
- else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
- Offset -= AFI->getDPRCalleeSavedAreaOffset();
- else if (hasFP(MF)) {
- assert(SPAdj == 0 && "Unexpected");
- // There are alloca()'s in this function; we must reference off the frame
- // pointer instead.
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- }
-
- unsigned Opcode = MI.getOpcode();
- const TargetInstrDesc &Desc = MI.getDesc();
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
- bool isSub = false;
-
- // Memory operands in inline assembly always use AddrMode2.
- if (Opcode == ARM::INLINEASM)
- AddrMode = ARMII::AddrMode2;
-
- if (Opcode == ARM::ADDri) {
- Offset += MI.getOperand(i+1).getImm();
- if (Offset == 0) {
- // Turn it into a move.
- MI.setDesc(TII.get(ARM::MOVr));
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.RemoveOperand(i+1);
- return;
- } else if (Offset < 0) {
- Offset = -Offset;
- isSub = true;
- MI.setDesc(TII.get(ARM::SUBri));
- }
-
- // Common case: small offset, fits into instruction.
- int ImmedOffset = ARM_AM::getSOImmVal(Offset);
- if (ImmedOffset != -1) {
- // Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(ImmedOffset);
- return;
- }
-
- // Otherwise, we fall back to the common code below to form the imm offset
- // with a sequence of ADDri instructions. First, though, pull as much of the
- // imm into this ADDri as possible.
- unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
- unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
-
- // We will handle these bits from the offset; clear them.
- Offset &= ~ThisImmVal;
-
- // Get the properly encoded SOImmVal field.
- int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal);
- assert(ThisSOImmVal != -1 && "Bit extraction didn't work?");
- MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal);
- } else {
- unsigned ImmIdx = 0;
- int InstrOffs = 0;
- unsigned NumBits = 0;
- unsigned Scale = 1;
- switch (AddrMode) {
- case ARMII::AddrMode2: {
- ImmIdx = i+2;
- InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 12;
- break;
- }
- case ARMII::AddrMode3: {
- ImmIdx = i+2;
- InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 8;
- break;
- }
- case ARMII::AddrMode5: {
- ImmIdx = i+1;
- InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
- if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
- InstrOffs *= -1;
- NumBits = 8;
- Scale = 4;
- break;
- }
- default:
- assert(0 && "Unsupported addressing mode!");
- abort();
- break;
- }
-
- Offset += InstrOffs * Scale;
- assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
- if (Offset < 0) {
- Offset = -Offset;
- isSub = true;
- }
-
- // Common case: small offset, fits into instruction.
- MachineOperand &ImmOp = MI.getOperand(ImmIdx);
- int ImmedOffset = Offset / Scale;
- unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
- // Replace the FrameIndex with sp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- if (isSub)
- ImmedOffset |= 1 << NumBits;
- ImmOp.ChangeToImmediate(ImmedOffset);
- return;
- }
-
- // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
- ImmedOffset = ImmedOffset & Mask;
- if (isSub)
- ImmedOffset |= 1 << NumBits;
- ImmOp.ChangeToImmediate(ImmedOffset);
- Offset &= ~(Mask*Scale);
- }
-
- // If we get here, the immediate doesn't fit into the instruction. We folded
- // as much as possible above; handle the rest by providing a register that
- // is SP+LargeImm.
- assert(Offset && "This code isn't needed if offset already handled!");
-
- // Insert a set of r12 with the full address: r12 = sp + offset
- // If the offset we have is too large to fit into the instruction, we need
- // to form it with a series of ADDri's. Do this by taking 8-bit chunks
- // out of 'Offset'.
- unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI);
- if (ScratchReg == 0)
- // No register is "free". Scavenge a register.
- ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj);
- int PIdx = MI.findFirstPredOperandIdx();
- ARMCC::CondCodes Pred = (PIdx == -1)
- ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
- unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
- emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg,
- isSub ? -Offset : Offset, Pred, PredReg, TII, dl);
- MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
-}
-
-static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- int Offset = 0;
- for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
- int FixedOff = -FFI->getObjectOffset(i);
- if (FixedOff > Offset) Offset = FixedOff;
- }
- for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
- if (FFI->isDeadObjectIndex(i))
- continue;
- Offset += FFI->getObjectSize(i);
- unsigned Align = FFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = (Offset+Align-1)/Align*Align;
- }
- return (unsigned)Offset;
-}
-
-void
-ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- // This tells PEI to spill the FP as if it is any other callee-save register
- // to take advantage of the eliminateFrameIndex machinery. This also ensures it
- // is spilled in the order specified by getCalleeSavedRegs() to make it easier
- // to combine multiple loads / stores.
- bool CanEliminateFrame = true;
- bool CS1Spilled = false;
- bool LRSpilled = false;
- unsigned NumGPRSpills = 0;
- SmallVector<unsigned, 4> UnspilledCS1GPRs;
- SmallVector<unsigned, 4> UnspilledCS2GPRs;
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-
- // Don't spill FP if the frame can be eliminated. This is determined
- // by scanning the callee-save registers to see if any is used.
- const unsigned *CSRegs = getCalleeSavedRegs();
- const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
- for (unsigned i = 0; CSRegs[i]; ++i) {
- unsigned Reg = CSRegs[i];
- bool Spilled = false;
- if (MF.getRegInfo().isPhysRegUsed(Reg)) {
- AFI->setCSRegisterIsSpilled(Reg);
- Spilled = true;
- CanEliminateFrame = false;
- } else {
- // Check alias registers too.
- for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
- if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
- Spilled = true;
- CanEliminateFrame = false;
- }
- }
- }
-
- if (CSRegClasses[i] == &ARM::GPRRegClass) {
- if (Spilled) {
- NumGPRSpills++;
-
- if (!STI.isTargetDarwin()) {
- if (Reg == ARM::LR)
- LRSpilled = true;
- CS1Spilled = true;
- continue;
- }
-
- // Keep track of whether LR or any of R4, R5, R6, and R7 is spilled.
- switch (Reg) {
- case ARM::LR:
- LRSpilled = true;
- // Fallthrough
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- CS1Spilled = true;
- break;
- default:
- break;
- }
- } else {
- if (!STI.isTargetDarwin()) {
- UnspilledCS1GPRs.push_back(Reg);
- continue;
- }
-
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- UnspilledCS1GPRs.push_back(Reg);
- break;
- default:
- UnspilledCS2GPRs.push_back(Reg);
- break;
- }
- }
- }
- }
-
- bool ForceLRSpill = false;
- if (!LRSpilled && AFI->isThumbFunction()) {
- unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
- // Force LR to be spilled if the Thumb function size is > 2048. This enables
- // the use of BL to implement a far jump. If it turns out that it's not
- // needed, then the branch fix-up path will undo it.
- if (FnSize >= (1 << 11)) {
- CanEliminateFrame = false;
- ForceLRSpill = true;
- }
- }
-
- bool ExtraCSSpill = false;
- if (!CanEliminateFrame || hasFP(MF)) {
- AFI->setHasStackFrame(true);
-
- // If LR is not spilled, but at least one of R4, R5, R6, or R7 is, spill LR
- // as well so we can fold BX_RET into the register restore (LDM).
- if (!LRSpilled && CS1Spilled) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
- AFI->setCSRegisterIsSpilled(ARM::LR);
- NumGPRSpills++;
- UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
- UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
- ForceLRSpill = false;
- ExtraCSSpill = true;
- }
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MF.getRegInfo().setPhysRegUsed(FramePtr);
- NumGPRSpills++;
- }
-
- // If the stack and doubles are 8-byte aligned and we are spilling an odd
- // number of GPRs, spill one extra callee-save GPR so we won't have to pad
- // between the integer and double callee-save areas.
- unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
- if (TargetAlign == 8 && (NumGPRSpills & 1)) {
- if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
- for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
- unsigned Reg = UnspilledCS1GPRs[i];
- // Don't spill a high register if the function is Thumb
- if (!AFI->isThumbFunction() ||
- isARMLowRegister(Reg) || Reg == ARM::LR) {
- MF.getRegInfo().setPhysRegUsed(Reg);
- AFI->setCSRegisterIsSpilled(Reg);
- if (!isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- break;
- }
- }
- } else if (!UnspilledCS2GPRs.empty() &&
- !AFI->isThumbFunction()) {
- unsigned Reg = UnspilledCS2GPRs.front();
- MF.getRegInfo().setPhysRegUsed(Reg);
- AFI->setCSRegisterIsSpilled(Reg);
- if (!isReservedReg(MF, Reg))
- ExtraCSSpill = true;
- }
- }
-
- // Estimate if we might need to scavenge a register at some point in order
- // to materialize a stack offset. If so, either spill one additional
- // callee-saved register or reserve a special spill slot to facilitate
- // register scavenging.
- if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Size = estimateStackSize(MF, MFI);
- unsigned Limit = (1 << 12) - 1;
- for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB)
- for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) {
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (I->getOperand(i).isFI()) {
- unsigned Opcode = I->getOpcode();
- const TargetInstrDesc &Desc = TII.get(Opcode);
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
- if (AddrMode == ARMII::AddrMode3) {
- Limit = (1 << 8) - 1;
- goto DoneEstimating;
- } else if (AddrMode == ARMII::AddrMode5) {
- unsigned ThisLimit = ((1 << 8) - 1) * 4;
- if (ThisLimit < Limit)
- Limit = ThisLimit;
- }
- }
- }
- DoneEstimating:
- if (Size >= Limit) {
- // If any non-reserved CS register isn't spilled, just spill one or two
- // extra. That should take care of it!
- unsigned NumExtras = TargetAlign / 4;
- SmallVector<unsigned, 2> Extras;
- while (NumExtras && !UnspilledCS1GPRs.empty()) {
- unsigned Reg = UnspilledCS1GPRs.back();
- UnspilledCS1GPRs.pop_back();
- if (!isReservedReg(MF, Reg)) {
- Extras.push_back(Reg);
- NumExtras--;
- }
- }
- while (NumExtras && !UnspilledCS2GPRs.empty()) {
- unsigned Reg = UnspilledCS2GPRs.back();
- UnspilledCS2GPRs.pop_back();
- if (!isReservedReg(MF, Reg)) {
- Extras.push_back(Reg);
- NumExtras--;
- }
- }
- if (Extras.size() && NumExtras == 0) {
- for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
- MF.getRegInfo().setPhysRegUsed(Extras[i]);
- AFI->setCSRegisterIsSpilled(Extras[i]);
- }
- } else {
- // Reserve a slot closest to SP or frame pointer.
- const TargetRegisterClass *RC = &ARM::GPRRegClass;
- RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
- RC->getAlignment()));
- }
- }
- }
- }
-
- if (ForceLRSpill) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
- AFI->setCSRegisterIsSpilled(ARM::LR);
- AFI->setLRIsSpilledForFarJump(true);
- }
-}
-
-/// Move the iterator past the next group of callee-save load / store ops for
-/// the particular spill area (1: integer area 1, 2: integer area 2,
-/// 3: fp area, 0: don't care).
-static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- int Opc, unsigned Area,
- const ARMSubtarget &STI) {
- while (MBBI != MBB.end() &&
- MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFI()) {
- if (Area != 0) {
- bool Done = false;
- unsigned Category = 0;
- switch (MBBI->getOperand(0).getReg()) {
- case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
- case ARM::LR:
- Category = 1;
- break;
- case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
- Category = STI.isTargetDarwin() ? 2 : 1;
- break;
- case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
- case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
- Category = 3;
- break;
- default:
- Done = true;
- break;
- }
- if (Done || Category != Area)
- break;
- }
-
- ++MBBI;
- }
-}
-
-void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- unsigned NumBytes = MFI->getStackSize();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = (MBBI != MBB.end() ?
- MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
-
- // Determine the size of each callee-save spill area and record which frame
- // index belongs to which callee-save spill area.
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
- int FramePtrSpillFI = 0;
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, -VARegSaveSize);
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes);
- return;
- }
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- int FI = CSI[i].getFrameIdx();
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetDarwin()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
- GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- }
- break;
- default:
- AFI->addDPRCalleeSavedAreaFrame(FI);
- DPRCSSize += 8;
- }
- }
-
- // Build the new SUBri to adjust SP for integer callee-save spill area 1.
- emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS1Size);
- movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI);
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::ADDri), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0);
- AddDefaultCC(AddDefaultPred(MIB));
- }
-
- // Build the new SUBri to adjust SP for integer callee-save spill area 2.
- emitSPUpdate(MBB, MBBI, TII, dl, -GPRCS2Size);
-
- // Build the new SUBri to adjust SP for FP callee-save spill area.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, -DPRCSSize);
-
- // Determine starting offsets of spill areas.
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
- unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
- unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
- AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
- AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
- AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
- AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
- NumBytes = DPRCSOffset;
- if (NumBytes) {
- // Insert it after all the callee-save spills.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, -NumBytes);
- }
-
- if (STI.isTargetELF() && hasFP(MF)) {
- MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
- AFI->getFramePtrSpillOffset());
- }
-
- AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
- AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
- AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
- return ((MI->getOpcode() == ARM::FLDD ||
- MI->getOpcode() == ARM::LDR) &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
-}
-
-void ARMRegisterInfo::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
- assert(MBBI->getOpcode() == ARM::BX_RET &&
- "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- int NumBytes = (int)MFI->getStackSize();
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, NumBytes);
- } else {
- // Unwind MBBI to point to first LDR / FLDD.
- const unsigned *CSRegs = getCalleeSavedRegs();
- if (MBBI != MBB.begin()) {
- do
- --MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
- ++MBBI;
- }
-
- // Move SP to start of FP callee save spill area.
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
- AFI->getGPRCalleeSavedArea2Size() +
- AFI->getDPRCalleeSavedAreaSize());
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) {
- NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
- // Reset SP based on the frame pointer only if the stack frame extends
- // beyond the frame pointer stack slot, or the target is ELF and the
- // function has an FP.
- if (AFI->getGPRCalleeSavedArea2Size() ||
- AFI->getDPRCalleeSavedAreaSize() ||
- AFI->getDPRCalleeSavedAreaOffset()||
- hasFP(MF)) {
- if (NumBytes)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr)
- .addImm(NumBytes)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
- else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
- }
- } else if (NumBytes) {
- emitSPUpdate(MBB, MBBI, TII, dl, NumBytes);
- }
-
- // Move SP to start of integer callee save spill area 2.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getDPRCalleeSavedAreaSize());
-
- // Move SP to start of integer callee save spill area 1.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea2Size());
-
- // Restore SP to its value upon entry to the function.
- movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI);
- emitSPUpdate(MBB, MBBI, TII, dl, AFI->getGPRCalleeSavedArea1Size());
- }
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, VARegSaveSize);
-
-}
-
-unsigned ARMBaseRegisterInfo::getRARegister() const {
- return ARM::LR;
-}
-
-unsigned ARMBaseRegisterInfo::getFrameRegister(MachineFunction &MF) const {
- if (STI.isTargetDarwin() || hasFP(MF))
- return FramePtr;
- return ARM::SP;
-}
-
-unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
- return 0;
-}
-
-unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
- return 0;
-}
-
-int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
- const MachineFunction &MF) const {
- switch (Reg) {
- default: break;
- // Return 0 if either register of the pair is a special register.
- // So no R12, etc.
- case ARM::R1:
- return ARM::R0;
- case ARM::R3:
- // FIXME!
- return STI.isThumb() ? 0 : ARM::R2;
- case ARM::R5:
- return ARM::R4;
- case ARM::R7:
- return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6;
- case ARM::R9:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8;
- case ARM::R11:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
-
- case ARM::S1:
- return ARM::S0;
- case ARM::S3:
- return ARM::S2;
- case ARM::S5:
- return ARM::S4;
- case ARM::S7:
- return ARM::S6;
- case ARM::S9:
- return ARM::S8;
- case ARM::S11:
- return ARM::S10;
- case ARM::S13:
- return ARM::S12;
- case ARM::S15:
- return ARM::S14;
- case ARM::S17:
- return ARM::S16;
- case ARM::S19:
- return ARM::S18;
- case ARM::S21:
- return ARM::S20;
- case ARM::S23:
- return ARM::S22;
- case ARM::S25:
- return ARM::S24;
- case ARM::S27:
- return ARM::S26;
- case ARM::S29:
- return ARM::S28;
- case ARM::S31:
- return ARM::S30;
-
- case ARM::D1:
- return ARM::D0;
- case ARM::D3:
- return ARM::D2;
- case ARM::D5:
- return ARM::D4;
- case ARM::D7:
- return ARM::D6;
- case ARM::D9:
- return ARM::D8;
- case ARM::D11:
- return ARM::D10;
- case ARM::D13:
- return ARM::D12;
- case ARM::D15:
- return ARM::D14;
- }
-
- return 0;
-}
-
-unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
- const MachineFunction &MF) const {
- switch (Reg) {
- default: break;
- // Return 0 if either register of the pair is a special register.
- // So no R12, etc.
- case ARM::R0:
- return ARM::R1;
- case ARM::R2:
- // FIXME!
- return STI.isThumb() ? 0 : ARM::R3;
- case ARM::R4:
- return ARM::R5;
- case ARM::R6:
- return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7;
- case ARM::R8:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9;
- case ARM::R10:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
-
- case ARM::S0:
- return ARM::S1;
- case ARM::S2:
- return ARM::S3;
- case ARM::S4:
- return ARM::S5;
- case ARM::S6:
- return ARM::S7;
- case ARM::S8:
- return ARM::S9;
- case ARM::S10:
- return ARM::S11;
- case ARM::S12:
- return ARM::S13;
- case ARM::S14:
- return ARM::S15;
- case ARM::S16:
- return ARM::S17;
- case ARM::S18:
- return ARM::S19;
- case ARM::S20:
- return ARM::S21;
- case ARM::S22:
- return ARM::S23;
- case ARM::S24:
- return ARM::S25;
- case ARM::S26:
- return ARM::S27;
- case ARM::S28:
- return ARM::S29;
- case ARM::S30:
- return ARM::S31;
-
- case ARM::D0:
- return ARM::D1;
- case ARM::D2:
- return ARM::D3;
- case ARM::D4:
- return ARM::D5;
- case ARM::D6:
- return ARM::D7;
- case ARM::D8:
- return ARM::D9;
- case ARM::D10:
- return ARM::D11;
- case ARM::D12:
- return ARM::D13;
- case ARM::D14:
- return ARM::D15;
- }
-
- return 0;
-}
-
-#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index 7fe075a..041afd0 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -16,127 +16,26 @@
#include "ARM.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "ARMGenRegisterInfo.h.inc"
+#include "ARMBaseRegisterInfo.h"
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
-/// Register allocation hints.
-namespace ARMRI {
- enum {
- RegPairOdd = 1,
- RegPairEven = 2
+namespace ARM {
+ /// SubregIndex - The index of various subregister classes. Note that
+ /// these indices must be kept in sync with the class indices in the
+ /// ARMRegisterInfo.td file.
+ enum SubregIndex {
+ SSUBREG_0 = 1, SSUBREG_1 = 2, SSUBREG_2 = 3, SSUBREG_3 = 4,
+ DSUBREG_0 = 5, DSUBREG_1 = 6
};
}
-/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
-///
-static inline bool isARMLowRegister(unsigned Reg) {
- using namespace ARM;
- switch (Reg) {
- case R0: case R1: case R2: case R3:
- case R4: case R5: case R6: case R7:
- return true;
- default:
- return false;
- }
-}
-
-struct ARMBaseRegisterInfo : public ARMGenRegisterInfo {
-protected:
- const TargetInstrInfo &TII;
- const ARMSubtarget &STI;
-
- /// FramePtr - ARM physical register used as frame ptr.
- unsigned FramePtr;
-public:
- ARMBaseRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
-
- /// getRegisterNumbering - Given the enum value for some register, e.g.
- /// ARM::LR, return the number that it corresponds to (e.g. 14).
- static unsigned getRegisterNumbering(unsigned RegEnum);
-
- /// Same as previous getRegisterNumbering except it returns true in isSPVFP
- /// if the register is a single precision VFP register.
- static unsigned getRegisterNumbering(unsigned RegEnum, bool &isSPVFP);
-
- /// Code Generation virtual methods...
- const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-
- const TargetRegisterClass* const*
- getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
-
- BitVector getReservedRegs(const MachineFunction &MF) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
- const TargetRegisterClass *getPointerRegClass() const;
-
- std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
- getAllocationOrder(const TargetRegisterClass *RC,
- unsigned HintType, unsigned HintReg,
- const MachineFunction &MF) const;
-
- unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
- const MachineFunction &MF) const;
-
- void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const;
-
- bool hasFP(const MachineFunction &MF) const;
-
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = NULL) const;
-
- // Debug information queries.
- unsigned getRARegister() const;
- unsigned getFrameRegister(MachineFunction &MF) const;
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-
- bool isLowRegister(unsigned Reg) const;
-
-private:
- unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
-
- unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
-};
-
struct ARMRegisterInfo : public ARMBaseRegisterInfo {
public:
- ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
-
- /// emitLoadConstPool - Emits a load from constpool to materialize the
- /// specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo *TII, DebugLoc dl,
- unsigned DestReg, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0) const;
-
- /// Code Generation virtual methods...
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
- bool requiresRegisterScavenging(const MachineFunction &MF) const;
-
- bool hasReservedCallFrame(MachineFunction &MF) const;
-
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
-
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
};
} // end namespace llvm
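
Most of what this header loses is the even/odd pair-allocation machinery, which exists so paired accesses (ldrd/strd) can be given an even/odd GPR pair such as r0/r1 or r2/r3. A toy model of the hint resolution follows, with registers as plain numbers 0-15 and ignoring reserved registers, the frame pointer, and Thumb, all of which the real code must consult (resolvePairHint is hypothetical):

// Simplified stand-in for the ARMRI::RegPairOdd / RegPairEven hints.
enum PairHint { NoHint = 0, PairOdd = 1, PairEven = 2 };

// Given a virtual register's pairing hint and the physical register its
// partner was assigned, suggest the physical register completing the pair,
// or -1 if the pair can no longer be formed.
static int resolvePairHint(PairHint Hint, int PartnerPhysReg) {
  switch (Hint) {
  case PairOdd:  // this vreg wants the odd half; partner holds the even one
    return (PartnerPhysReg % 2 == 0 && PartnerPhysReg <= 10)
               ? PartnerPhysReg + 1 : -1;
  case PairEven: // this vreg wants the even half; partner holds the odd one
    return (PartnerPhysReg % 2 == 1 && PartnerPhysReg <= 11)
               ? PartnerPhysReg - 1 : -1;
  default:
    return -1;   // no pairing constraint to satisfy
  }
}
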
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index a057e5c..20a7355 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -18,8 +18,8 @@ class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
let SubRegs = subregs;
}
-class ARMFReg<bits<5> num, string n> : Register<n> {
- field bits<5> Num;
+class ARMFReg<bits<6> num, string n> : Register<n> {
+ field bits<6> Num;
let Namespace = "ARM";
}
@@ -58,10 +58,11 @@ def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+def SDummy : ARMFReg<63, "sINVALID">;
// Aliases of the F* registers used to hold 64-bit fp values (doubles)
def D0 : ARMReg< 0, "d0", [S0, S1]>;
-def D1 : ARMReg< 1, "d1", [S2, S3]>;
+def D1 : ARMReg< 1, "d1", [S2, S3]>;
def D2 : ARMReg< 2, "d2", [S4, S5]>;
def D3 : ARMReg< 3, "d3", [S6, S7]>;
def D4 : ARMReg< 4, "d4", [S8, S9]>;
@@ -78,18 +79,18 @@ def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// VFP3 defines 16 additional double registers
-def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d16">;
-def D18 : ARMFReg<18, "d16">; def D19 : ARMFReg<19, "d16">;
-def D20 : ARMFReg<20, "d16">; def D21 : ARMFReg<21, "d16">;
-def D22 : ARMFReg<22, "d16">; def D23 : ARMFReg<23, "d16">;
-def D24 : ARMFReg<24, "d16">; def D25 : ARMFReg<25, "d16">;
-def D26 : ARMFReg<26, "d16">; def D27 : ARMFReg<27, "d16">;
-def D28 : ARMFReg<28, "d16">; def D29 : ARMFReg<29, "d16">;
-def D30 : ARMFReg<30, "d16">; def D31 : ARMFReg<31, "d16">;
+def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
+def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
+def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
+def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
+def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
+def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
+def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
+def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
// Advanced SIMD (NEON) defines 16 quad-word aliases
def Q0 : ARMReg< 0, "q0", [D0, D1]>;
-def Q1 : ARMReg< 1, "q1", [D2, D3]>;
+def Q1 : ARMReg< 1, "q1", [D2, D3]>;
def Q2 : ARMReg< 2, "q2", [D4, D5]>;
def Q3 : ARMReg< 3, "q3", [D6, D7]>;
def Q4 : ARMReg< 4, "q4", [D8, D9]>;
@@ -106,7 +107,9 @@ def Q14 : ARMReg<14, "q14", [D28, D29]>;
def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
-def CPSR : ARMReg<0, "cpsr">;
+def CPSR : ARMReg<0, "cpsr">;
+
+def FPSCR : ARMReg<1, "fpscr">;
// Register classes.
//
@@ -158,6 +161,13 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
ARM::R4, ARM::R5, ARM::R6,
ARM::R8, ARM::R10,ARM::R11,
ARM::R7 };
+    // FP is R7; R9 is available as a callee-saved register.
+    // This is used on non-Darwin platforms in Thumb mode.
+ static const unsigned ARM_GPR_AO_5[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R12,ARM::LR,
+ ARM::R4, ARM::R5, ARM::R6,
+ ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 };
GPRClass::iterator
GPRClass::allocation_order_begin(const MachineFunction &MF) const {
@@ -171,6 +181,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
} else {
if (Subtarget.isR9Reserved())
return ARM_GPR_AO_2;
+ else if (Subtarget.isThumb())
+ return ARM_GPR_AO_5;
else
return ARM_GPR_AO_1;
}
@@ -191,6 +203,8 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
} else {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
+ else if (Subtarget.isThumb())
+ I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned));
else
I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
}
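
The MethodBodies above repeat one idiom: allocation_order_end() returns a pointer one past the last entry of whichever static table allocation_order_begin() selected, via sizeof arithmetic. A minimal standalone sketch of that idiom, with a hypothetical array name:

    // Begin/end iterator pair over a static allocation-order table.
    static const unsigned Order[] = { 0, 1, 2, 3 };

    static const unsigned *order_begin() { return Order; }
    static const unsigned *order_end() {
      // Total byte size divided by element size yields the entry count.
      return Order + (sizeof(Order) / sizeof(unsigned));
    }

Here order_end() - order_begin() is 4, so iteration visits each entry exactly once.
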
@@ -240,32 +254,45 @@ def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32,
+ [S0, S1, S2, S3, S4, S5, S6, S7,
+ S8, S9, S10, S11, S12, S13, S14, S15]>;
+
+// Dummy f32 regclass to represent impossible subreg indices.
+def SPR_INVALID : RegisterClass<"ARM", [f32], 32, [SDummy]> {
+ let CopyCost = -1;
+}
+
// Scalar double precision floating point / generic 64-bit vector register
// class.
// ARM requires only word alignment for double. It's more performant if it
 // is double-word aligned, though.
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7,
- D8, D9, D10, D11, D12, D13, D14, D15]> {
- let SubRegClassList = [SPR, SPR];
+ D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31]> {
+ let SubRegClassList = [SPR_INVALID, SPR_INVALID];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// VFP2
- static const unsigned ARM_DPR_VFP2[] = {
- ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7,
- ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ static const unsigned ARM_DPR_VFP2[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
static const unsigned ARM_DPR_VFP3[] = {
- ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7,
- ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
- ARM::D16, ARM::D17, ARM::D18, ARM::D15,
+ ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
@@ -290,11 +317,34 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
}];
}
+// Subset of DPR that are accessible with VFP2 (and so also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v2i32, v2f32], 64,
+ [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15]> {
+ let SubRegClassList = [SPR, SPR];
+}
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v4i16, v2f32], 64,
+ [D0, D1, D2, D3, D4, D5, D6, D7]> {
+ let SubRegClassList = [SPR_8, SPR_8];
+}
+
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
- let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
+ let SubRegClassList = [SPR_INVALID, SPR_INVALID, SPR_INVALID, SPR_INVALID,
+ DPR, DPR];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128,
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]> {
+ let SubRegClassList = [SPR, SPR, SPR, SPR, DPR_VFP2, DPR_VFP2];
}
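
The SubRegClassList positions above follow the sub-register index numbering used by the SubRegSet definitions at the end of this file: for a Q register, indices 1-4 name the four 32-bit S slices and indices 5-6 the two 64-bit D halves. A hypothetical decoder to make that ordering concrete (sketch only, not backend API):

    #include <cassert>

    enum QSubRegClass { SPRClass, DPRVFP2Class };

    // Map a QPR_VFP2 sub-register index to the class of the sub-register:
    // 1-4 are S slices (SPR), 5-6 are D halves (DPR_VFP2).
    QSubRegClass qprVFP2SubRegClass(unsigned Idx) {
      assert(Idx >= 1 && Idx <= 6 && "QPR defines six sub-register indices");
      return Idx <= 4 ? SPRClass : DPRVFP2Class;
    }
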
// Condition code registers.
@@ -341,4 +391,3 @@ def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>;
-
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 75fa707..fc4c5f5 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -10,26 +10,151 @@
//===----------------------------------------------------------------------===//
// Functional units across ARM processors
//
-def FU_iALU : FuncUnit; // Integer alu unit
-def FU_iLdSt : FuncUnit; // Integer load / store unit
-def FU_FpALU : FuncUnit; // FP alu unit
-def FU_FpLdSt : FuncUnit; // FP load / store unit
-def FU_Br : FuncUnit; // Branch unit
+def FU_Issue : FuncUnit; // issue
+def FU_Pipe0 : FuncUnit; // pipeline 0
+def FU_Pipe1 : FuncUnit; // pipeline 1
+def FU_LdSt0 : FuncUnit; // pipeline 0 load/store
+def FU_LdSt1 : FuncUnit; // pipeline 1 load/store
+def FU_NPipe : FuncUnit; // NEON ALU/MUL pipe
+def FU_NLSPipe : FuncUnit; // NEON LS pipe
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for ARM
//
-def IIC_iALU : InstrItinClass;
-def IIC_iLoad : InstrItinClass;
-def IIC_iStore : InstrItinClass;
-def IIC_fpALU : InstrItinClass;
-def IIC_fpLoad : InstrItinClass;
-def IIC_fpStore : InstrItinClass;
-def IIC_Br : InstrItinClass;
+def IIC_iALUx : InstrItinClass;
+def IIC_iALUi : InstrItinClass;
+def IIC_iALUr : InstrItinClass;
+def IIC_iALUsi : InstrItinClass;
+def IIC_iALUsr : InstrItinClass;
+def IIC_iUNAr : InstrItinClass;
+def IIC_iUNAsi : InstrItinClass;
+def IIC_iUNAsr : InstrItinClass;
+def IIC_iCMPi : InstrItinClass;
+def IIC_iCMPr : InstrItinClass;
+def IIC_iCMPsi : InstrItinClass;
+def IIC_iCMPsr : InstrItinClass;
+def IIC_iMOVi : InstrItinClass;
+def IIC_iMOVr : InstrItinClass;
+def IIC_iMOVsi : InstrItinClass;
+def IIC_iMOVsr : InstrItinClass;
+def IIC_iCMOVi : InstrItinClass;
+def IIC_iCMOVr : InstrItinClass;
+def IIC_iCMOVsi : InstrItinClass;
+def IIC_iCMOVsr : InstrItinClass;
+def IIC_iMUL16 : InstrItinClass;
+def IIC_iMAC16 : InstrItinClass;
+def IIC_iMUL32 : InstrItinClass;
+def IIC_iMAC32 : InstrItinClass;
+def IIC_iMUL64 : InstrItinClass;
+def IIC_iMAC64 : InstrItinClass;
+def IIC_iLoadi : InstrItinClass;
+def IIC_iLoadr : InstrItinClass;
+def IIC_iLoadsi : InstrItinClass;
+def IIC_iLoadiu : InstrItinClass;
+def IIC_iLoadru : InstrItinClass;
+def IIC_iLoadsiu : InstrItinClass;
+def IIC_iLoadm : InstrItinClass;
+def IIC_iStorei : InstrItinClass;
+def IIC_iStorer : InstrItinClass;
+def IIC_iStoresi : InstrItinClass;
+def IIC_iStoreiu : InstrItinClass;
+def IIC_iStoreru : InstrItinClass;
+def IIC_iStoresiu : InstrItinClass;
+def IIC_iStorem : InstrItinClass;
+def IIC_Br : InstrItinClass;
+def IIC_fpSTAT : InstrItinClass;
+def IIC_fpUNA32 : InstrItinClass;
+def IIC_fpUNA64 : InstrItinClass;
+def IIC_fpCMP32 : InstrItinClass;
+def IIC_fpCMP64 : InstrItinClass;
+def IIC_fpCVTSD : InstrItinClass;
+def IIC_fpCVTDS : InstrItinClass;
+def IIC_fpCVTIS : InstrItinClass;
+def IIC_fpCVTID : InstrItinClass;
+def IIC_fpCVTSI : InstrItinClass;
+def IIC_fpCVTDI : InstrItinClass;
+def IIC_fpALU32 : InstrItinClass;
+def IIC_fpALU64 : InstrItinClass;
+def IIC_fpMUL32 : InstrItinClass;
+def IIC_fpMUL64 : InstrItinClass;
+def IIC_fpMAC32 : InstrItinClass;
+def IIC_fpMAC64 : InstrItinClass;
+def IIC_fpDIV32 : InstrItinClass;
+def IIC_fpDIV64 : InstrItinClass;
+def IIC_fpSQRT32 : InstrItinClass;
+def IIC_fpSQRT64 : InstrItinClass;
+def IIC_fpLoad32 : InstrItinClass;
+def IIC_fpLoad64 : InstrItinClass;
+def IIC_fpLoadm : InstrItinClass;
+def IIC_fpStore32 : InstrItinClass;
+def IIC_fpStore64 : InstrItinClass;
+def IIC_fpStorem : InstrItinClass;
+def IIC_VLD1 : InstrItinClass;
+def IIC_VLD2 : InstrItinClass;
+def IIC_VLD3 : InstrItinClass;
+def IIC_VLD4 : InstrItinClass;
+def IIC_VST : InstrItinClass;
+def IIC_VUNAD : InstrItinClass;
+def IIC_VUNAQ : InstrItinClass;
+def IIC_VBIND : InstrItinClass;
+def IIC_VBINQ : InstrItinClass;
+def IIC_VMOVImm : InstrItinClass;
+def IIC_VMOVD : InstrItinClass;
+def IIC_VMOVQ : InstrItinClass;
+def IIC_VMOVIS : InstrItinClass;
+def IIC_VMOVID : InstrItinClass;
+def IIC_VMOVISL : InstrItinClass;
+def IIC_VMOVSI : InstrItinClass;
+def IIC_VMOVDI : InstrItinClass;
+def IIC_VPERMD : InstrItinClass;
+def IIC_VPERMQ : InstrItinClass;
+def IIC_VPERMQ3 : InstrItinClass;
+def IIC_VMACD : InstrItinClass;
+def IIC_VMACQ : InstrItinClass;
+def IIC_VRECSD : InstrItinClass;
+def IIC_VRECSQ : InstrItinClass;
+def IIC_VCNTiD : InstrItinClass;
+def IIC_VCNTiQ : InstrItinClass;
+def IIC_VUNAiD : InstrItinClass;
+def IIC_VUNAiQ : InstrItinClass;
+def IIC_VQUNAiD : InstrItinClass;
+def IIC_VQUNAiQ : InstrItinClass;
+def IIC_VBINiD : InstrItinClass;
+def IIC_VBINiQ : InstrItinClass;
+def IIC_VSUBiD : InstrItinClass;
+def IIC_VSUBiQ : InstrItinClass;
+def IIC_VBINi4D : InstrItinClass;
+def IIC_VBINi4Q : InstrItinClass;
+def IIC_VSHLiD : InstrItinClass;
+def IIC_VSHLiQ : InstrItinClass;
+def IIC_VSHLi4D : InstrItinClass;
+def IIC_VSHLi4Q : InstrItinClass;
+def IIC_VPALiD : InstrItinClass;
+def IIC_VPALiQ : InstrItinClass;
+def IIC_VMULi16D : InstrItinClass;
+def IIC_VMULi32D : InstrItinClass;
+def IIC_VMULi16Q : InstrItinClass;
+def IIC_VMULi32Q : InstrItinClass;
+def IIC_VMACi16D : InstrItinClass;
+def IIC_VMACi32D : InstrItinClass;
+def IIC_VMACi16Q : InstrItinClass;
+def IIC_VMACi32Q : InstrItinClass;
+def IIC_VEXTD : InstrItinClass;
+def IIC_VEXTQ : InstrItinClass;
+def IIC_VTB1 : InstrItinClass;
+def IIC_VTB2 : InstrItinClass;
+def IIC_VTB3 : InstrItinClass;
+def IIC_VTB4 : InstrItinClass;
+def IIC_VTBX1 : InstrItinClass;
+def IIC_VTBX2 : InstrItinClass;
+def IIC_VTBX3 : InstrItinClass;
+def IIC_VTBX4 : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
def GenericItineraries : ProcessorItineraries<[]>;
+
include "ARMScheduleV6.td"
+include "ARMScheduleV7.td"
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 596a57f..1ace718 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -1,4 +1,4 @@
-//===- ARMSchedule.td - ARM v6 Scheduling Definitions ------*- tablegen -*-===//
+//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,4 @@
//
//===----------------------------------------------------------------------===//
-def V6Itineraries : ProcessorItineraries<[
- InstrItinData<IIC_iALU , [InstrStage<1, [FU_iALU]>]>,
- InstrItinData<IIC_iLoad , [InstrStage<2, [FU_iLdSt]>]>,
- InstrItinData<IIC_iStore , [InstrStage<1, [FU_iLdSt]>]>,
- InstrItinData<IIC_fpALU , [InstrStage<6, [FU_FpALU]>]>,
- InstrItinData<IIC_fpLoad , [InstrStage<2, [FU_FpLdSt]>]>,
- InstrItinData<IIC_fpStore , [InstrStage<1, [FU_FpLdSt]>]>,
- InstrItinData<IIC_Br , [InstrStage<3, [FU_Br]>]>
-]>;
+// TODO: Add model for an ARM11
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td
new file mode 100644
index 0000000..e565813
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -0,0 +1,587 @@
+//===- ARMScheduleV7.td - ARM v7 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v7 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+//
+// Dual issue pipeline represented by FU_Pipe0 | FU_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<[
+
+ // Two fully-pipelined integer ALU pipelines
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 2]>,
+ InstrItinData<IIC_iALUsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1]>,
+ InstrItinData<IIC_iALUsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iUNAsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iUNAsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1, 1]>,
+ //
+ // Move instructions, conditional
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+
+ // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 6 for those cases
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [FU_Pipe0]>], [5, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<1, [FU_Pipe1], 0>,
+ InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<2, [FU_Pipe1], 0>,
+ InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<2, [FU_Pipe1], 0>,
+ InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+
+ // Integer load pipeline
+ //
+ // loads have an extra cycle of latency, but are fully pipelined
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoadi , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoadr , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iLoadsi , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [4, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoadiu , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 2, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoadru , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 2, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iLoadsiu , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [4, 3, 1, 1]>,
+ //
+ // Load multiple
+ InstrItinData<IIC_iLoadm , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>]>,
+
+ // Integer store pipeline
+ //
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ //
+ // Immediate offset
+ InstrItinData<IIC_iStorei , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStorer , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Scaled register offset, issues over 2 cycles
+ InstrItinData<IIC_iStoresi , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStoreiu , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [2, 3, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStoreru , [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [2, 3, 1, 1]>,
+ //
+ // Scaled register offset with update, issues over 2 cycles
+ InstrItinData<IIC_iStoresiu, [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>], [3, 3, 1, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStorem , [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0]>]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+
+ // VFP
+ // Issue through integer pipeline, and execute in NEON unit. We assume
+ // RunFast mode so that NFP pipeline is used for single-precision when
+ // possible.
+ //
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Unary
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<4, [FU_NPipe], 0>,
+ InstrStage<4, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<4, [FU_NPipe], 0>,
+ InstrStage<4, [FU_NLSPipe]>]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<7, [FU_NPipe], 0>,
+ InstrStage<7, [FU_NLSPipe]>]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<5, [FU_NPipe], 0>,
+ InstrStage<5, [FU_NLSPipe]>]>,
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<8, [FU_NPipe], 0>,
+ InstrStage<8, [FU_NLSPipe]>]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<8, [FU_NPipe], 0>,
+ InstrStage<8, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<9, [FU_NPipe], 0>,
+ InstrStage<9, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<11, [FU_NPipe], 0>,
+ InstrStage<11, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [7, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<19, [FU_NPipe], 0>,
+ InstrStage<19, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<20, [FU_NPipe], 0>,
+ InstrStage<20, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<29, [FU_NPipe], 0>,
+ InstrStage<29, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<19, [FU_NPipe], 0>,
+ InstrStage<19, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<29, [FU_NPipe], 0>,
+ InstrStage<29, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Load
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP Load
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoad64, [InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // FP Load Multiple
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpLoadm, [InstrStage<3, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Single-precision FP Store
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-precision FP Store
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStore64,[InstrStage<2, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0], 0>,
+ InstrStage<1, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // FP Store Multiple
+ // use FU_Issue to enforce the 1 load/store per cycle limit
+ InstrItinData<IIC_fpStorem, [InstrStage<3, [FU_Issue], 0>,
+ InstrStage<2, [FU_Pipe0], 0>,
+ InstrStage<2, [FU_Pipe1]>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+
+ // NEON
+ // Issue through integer pipeline, and execute in NEON unit.
+ //
+ // VLD1
+ InstrItinData<IIC_VLD1, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // VLD2
+ InstrItinData<IIC_VLD2, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 1]>,
+ //
+ // VLD3
+ InstrItinData<IIC_VLD3, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 1]>,
+ //
+ // VLD4
+ InstrItinData<IIC_VLD4, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 2, 1]>,
+ //
+ // VST
+ InstrItinData<IIC_VST, [InstrStage<1, [FU_Issue], 0>,
+ InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_LdSt0], 0>,
+ InstrStage<1, [FU_NLSPipe]>]>,
+ //
+ // Double-register FP Unary
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [5, 2]>,
+ //
+ // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 6 for those cases
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [6, 2]>,
+ //
+ // Double-register FP Binary
+ InstrItinData<IIC_VBIND, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [5, 2, 2]>,
+ //
+ // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 6 for those cases
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [6, 2, 2]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+ //
+ // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 3 for those cases
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [20, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [20, 20, 1]>,
+ //
+ // Integer to Lane Move
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+ //
+ // Double-register Permute
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 2, 1, 1]>,
+ //
+ // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 3 for those cases
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 3, 1, 1]>,
+ //
+ // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 4 for those cases
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 4, 1, 1]>,
+ //
+  // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [9, 2, 2, 3]>,
+ //
+  // Quad-register FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 10 for those cases
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [10, 2, 2, 3]>,
+ //
+  // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [9, 2, 2]>,
+ //
+  // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [10, 2, 2]>,
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of a
+  // multi-cycle itinerary, so we use 4 for those cases
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [4, 2, 2]>,
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1]>,
+ //
+  // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [3, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [4, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [5, 1, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>,
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 2, 2]>,
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 1]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 2]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>,
+ InstrStage<2, [FU_NLSPipe], 0>,
+ InstrStage<3, [FU_NPipe]>], [9, 2, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>], [6, 2, 2, 3]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 1, 3]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NPipe]>], [7, 2, 2, 3]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NPipe]>,
+ InstrStage<2, [FU_NLSPipe], 0>,
+ InstrStage<3, [FU_NPipe]>], [9, 2, 1, 3]>,
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 2, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 2, 2, 1]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 3, 1]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 2, 1]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 1]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+ InstrStage<1, [FU_NLSPipe]>,
+ InstrStage<1, [FU_NPipe], 0>,
+ InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
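
Throughout the itineraries above, an explicit third InstrStage operand of 0 means the next stage begins in the same cycle, so adjacent stages model parallel reservation of different units; when the operand is omitted, the next stage waits the full cycle count. A hedged standalone model of that accounting (hypothetical struct, not the LLVM InstrStage class):

    #include <vector>

    // Cycles: how long the stage holds its functional units.
    // TimeInc: cycles before the next stage may start (0 = overlapped;
    //          when omitted in the .td file it defaults to Cycles).
    struct Stage {
      unsigned Cycles;
      unsigned TimeInc;
    };

    // Total cycles an itinerary keeps hardware busy: step by TimeInc for
    // each stage, letting the final stage run to completion.
    unsigned totalOccupancy(const std::vector<Stage> &Stages) {
      unsigned Total = 0;
      for (unsigned i = 0, e = Stages.size(); i != e; ++i)
        Total += (i + 1 == e) ? Stages[i].Cycles : Stages[i].TimeInc;
      return Total;
    }

For IIC_fpUNA64 above this gives {1,1}, {4,0}, {4,4} -> 1 + 0 + 4 = 5 cycles, with FU_NPipe and FU_NLSPipe reserved in parallel for the overlapped four cycles.
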
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index e611088..cf1ee3f 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,8 +13,7 @@
#include "ARMSubtarget.h"
#include "ARMGenSubtarget.inc"
-#include "llvm/Module.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -22,13 +21,19 @@ using namespace llvm;
static cl::opt<bool>
ReserveR9("arm-reserve-r9", cl::Hidden,
cl::desc("Reserve R9, making it unavailable as GPR"));
+static cl::opt<bool>
+UseNEONFP("arm-use-neon-fp",
+ cl::desc("Use NEON for single-precision FP"),
+ cl::init(false), cl::Hidden);
-ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
bool isThumb)
: ARMArchVersion(V4T)
, ARMFPUType(None)
+ , UseNEONForSinglePrecisionFP(UseNEONFP)
, IsThumb(isThumb)
, ThumbMode(Thumb1)
+ , PostRAScheduler(false)
, IsR9Reserved(ReserveR9)
, stackAlignment(4)
, CPUString("generic")
@@ -45,7 +50,6 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
- const std::string& TT = M.getTargetTriple();
unsigned Len = TT.length();
unsigned Idx = 0;
@@ -75,14 +79,14 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
}
}
+ // Thumb2 implies at least V6T2.
+ if (ARMArchVersion < V6T2 && ThumbMode >= Thumb2)
+ ARMArchVersion = V6T2;
+
if (Len >= 10) {
if (TT.find("-darwin") != std::string::npos)
// arm-darwin
TargetType = isDarwin;
- } else if (TT.empty()) {
-#if defined(__APPLE__)
- TargetType = isDarwin;
-#endif
}
if (TT.find("eabi") != std::string::npos)
@@ -93,4 +97,61 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (isTargetDarwin())
IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
+
+ // Set CPU specific features.
+ if (CPUString == "cortex-a8") {
+ PostRAScheduler = true;
+ if (UseNEONFP.getPosition() == 0)
+ UseNEONForSinglePrecisionFP = true;
+ }
+}
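
The cortex-a8 block just above keys off UseNEONFP.getPosition() == 0, which is how a cl::opt reports that it never appeared on the command line; only then does the CPU-specific default win. A minimal illustration of the idiom, with a hypothetical flag name:

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    static cl::opt<bool>
    SomeFeature("use-some-feature",   // hypothetical flag, illustration only
                cl::desc("Enable some feature"),
                cl::init(false), cl::Hidden);

    // getPosition() is 0 iff the flag was never given, so an explicit
    // command-line choice always overrides the CPU default.
    bool resolveFeature(bool CPUDefault) {
      return SomeFeature.getPosition() == 0 ? CPUDefault : (bool)SomeFeature;
    }
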
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
+bool
+ARMSubtarget::GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const {
+ if (RelocM == Reloc::Static)
+ return false;
+
+  // A GV with ghost linkage (in JIT lazy compilation mode) does not require
+  // an extra load from a stub.
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+
+ if (!isTargetDarwin()) {
+    // An extra load is needed for all externally visible symbols.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return false;
+ return true;
+ } else {
+ if (RelocM == Reloc::PIC_) {
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return false;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return true;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage())
+ // Hidden $non_lazy_ptr reference.
+ return true;
+
+ return false;
+ } else {
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return false;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return true;
+ }
+ }
+
+ return false;
}
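
A sketch of how a client would consume GVIsIndirectSymbol (hypothetical call site; the real ones live in the ISel lowering code):

    #include "ARMSubtarget.h"
    #include "llvm/GlobalValue.h"
    using namespace llvm;

    // Decide whether materializing GV's address needs an extra load
    // through an indirection stub such as Darwin's $non_lazy_ptr.
    static bool needsStubLoad(GlobalValue *GV, const ARMSubtarget &Subtarget,
                              Reloc::Model RelocM) {
      // Per the rules above: a local-linkage global always answers false,
      // while an extern declaration under Darwin PIC answers true.
      return Subtarget.GVIsIndirectSymbol(GV, RelocM);
    }
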
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 5110b31..7098fd4 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -15,11 +15,12 @@
#define ARMSUBTARGET_H
#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtarget.h"
#include <string>
namespace llvm {
-class Module;
+class GlobalValue;
class ARMSubtarget : public TargetSubtarget {
protected:
@@ -43,12 +44,20 @@ protected:
/// ARMFPUType - Floating Point Unit type.
ARMFPEnum ARMFPUType;
+  /// UseNEONForSinglePrecisionFP - True if the NEONFP attribute has been
+ /// specified. Use the method useNEONForSinglePrecisionFP() to
+ /// determine if NEON should actually be used.
+ bool UseNEONForSinglePrecisionFP;
+
/// IsThumb - True if we are in thumb mode, false if in ARM mode.
bool IsThumb;
/// ThumbMode - Indicates supported Thumb version.
ThumbTypeEnum ThumbMode;
+ /// PostRAScheduler - True if using post-register-allocation scheduler.
+ bool PostRAScheduler;
+
  /// IsR9Reserved - True if R9 is not available as a general purpose register.
bool IsR9Reserved;
@@ -61,7 +70,7 @@ protected:
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
-
+
public:
enum {
isELF, isDarwin
@@ -73,9 +82,9 @@ protected:
} TargetABI;
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- ARMSubtarget(const Module &M, const std::string &FS, bool isThumb);
+ ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb);
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -99,6 +108,8 @@ protected:
bool hasVFP2() const { return ARMFPUType >= VFPv2; }
bool hasVFP3() const { return ARMFPUType >= VFPv3; }
bool hasNEON() const { return ARMFPUType >= NEON; }
+ bool useNEONForSinglePrecisionFP() const {
+ return hasNEON() && UseNEONForSinglePrecisionFP; }
bool isTargetDarwin() const { return TargetType == isDarwin; }
bool isTargetELF() const { return TargetType == isELF; }
@@ -108,14 +119,18 @@ protected:
bool isThumb() const { return IsThumb; }
bool isThumb1Only() const { return IsThumb && (ThumbMode == Thumb1); }
- bool isThumb2() const { return IsThumb && (ThumbMode >= Thumb2); }
+ bool isThumb2() const { return IsThumb && (ThumbMode == Thumb2); }
bool hasThumb2() const { return ThumbMode >= Thumb2; }
bool isR9Reserved() const { return IsR9Reserved; }
const std::string & getCPUString() const { return CPUString; }
+
+ /// enablePostRAScheduler - From TargetSubtarget, return true to
+ /// enable post-RA scheduler.
+ bool enablePostRAScheduler() const { return PostRAScheduler; }
- /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
@@ -123,6 +138,10 @@ protected:
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
+ /// symbol.
+ bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const;
};
} // End llvm namespace
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 2344733..32ddc20 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,188 +11,122 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
-#include "ARMTargetAsmInfo.h"
+#include "ARMMCAsmInfo.h"
#include "ARMFrameInfo.h"
#include "ARM.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
- cl::desc("Disable load store optimization pass"));
-static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden,
- cl::desc("Disable if-conversion pass"));
-
-/// ARMTargetMachineModule - Note that this is used on hosts that cannot link
-/// in a library unless there are references into the library. In particular,
-/// it seems that it is not possible to get things to work on Win32 without
-/// this. Though it is unused, do not remove it.
-extern "C" int ARMTargetMachineModule;
-int ARMTargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<ARMTargetMachine> X("arm", "ARM");
-static RegisterTarget<ThumbTargetMachine> Y("thumb", "Thumb");
-
-// Force static initialization.
-extern "C" void LLVMInitializeARMTarget() { }
-
-// No assembler printer by default
-ARMBaseTargetMachine::AsmPrinterCtorFn ARMBaseTargetMachine::AsmPrinterCtor = 0;
-
-/// ThumbTargetMachine - Create an Thumb architecture model.
-///
-unsigned ThumbTargetMachine::getJITMatchQuality() {
-#if defined(__thumb__)
- return 10;
-#endif
- return 0;
+static const MCAsmInfo *createMCAsmInfo(const Target &T,
+ const StringRef &TT) {
+ Triple TheTriple(TT);
+ switch (TheTriple.getOS()) {
+ case Triple::Darwin:
+ return new ARMMCAsmInfoDarwin();
+ default:
+ return new ARMELFMCAsmInfo();
+ }
}
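
Only the OS component of the triple matters to this dispatch; a minimal illustration (assuming this era's Triple API, exactly as used above):

    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    // "arm-apple-darwin9" selects the Darwin asm info; anything else,
    // e.g. "arm-none-linux-gnueabi", falls through to the ELF variant.
    static bool usesDarwinAsmInfo(const StringRef &TT) {
      return Triple(TT).getOS() == Triple::Darwin;
    }
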
-unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- // Match thumb-foo-bar, as well as things like thumbv5blah-*
- if (TT.size() >= 6 &&
- (TT.substr(0, 6) == "thumb-" || TT.substr(0, 6) == "thumbv"))
- return 20;
- // If the target triple is something non-thumb, we don't match.
- if (!TT.empty()) return 0;
+extern "C" void LLVMInitializeARMTarget() {
+ // Register the target.
+ RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
+ RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
+ // Register the target asm info.
+ RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo);
+ RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo);
}
/// TargetMachine ctor - Create an ARM architecture model.
///
-ARMBaseTargetMachine::ARMBaseTargetMachine(const Module &M,
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
+ const std::string &TT,
const std::string &FS,
bool isThumb)
- : Subtarget(M, FS, isThumb),
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, isThumb),
FrameInfo(Subtarget),
JITInfo(),
InstrItins(Subtarget.getInstrItineraryData()) {
DefRelocModel = getRelocationModel();
}
-ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS)
- : ARMBaseTargetMachine(M, FS, false), InstrInfo(Subtarget),
+ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:32-i64:32:32") :
std::string("e-p:32:32-f64:64:64-i64:64:64")),
TLInfo(*this) {
}
-ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS)
- : ARMBaseTargetMachine(M, FS, true),
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : ARMBaseTargetMachine(T, TT, FS, true),
+ InstrInfo(Subtarget.hasThumb2()
+ ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
+ : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:32-i64:32:32-"
"i16:16:32-i8:8:32-i1:8:32-a:0:32") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"i16:16:32-i8:8:32-i1:8:32-a:0:32")),
TLInfo(*this) {
- // Create the approriate type of Thumb InstrInfo
- if (Subtarget.hasThumb2())
- InstrInfo = new Thumb2InstrInfo(Subtarget);
- else
- InstrInfo = new Thumb1InstrInfo(Subtarget);
-}
-
-unsigned ARMTargetMachine::getJITMatchQuality() {
-#if defined(__arm__)
- return 10;
-#endif
- return 0;
-}
-
-unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- // Match arm-foo-bar, as well as things like armv5blah-*
- if (TT.size() >= 4 &&
- (TT.substr(0, 4) == "arm-" || TT.substr(0, 4) == "armv"))
- return 20;
- // If the target triple is something non-arm, we don't match.
- if (!TT.empty()) return 0;
-
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
}
-const TargetAsmInfo *ARMBaseTargetMachine::createTargetAsmInfo() const {
- switch (Subtarget.TargetType) {
- case ARMSubtarget::isDarwin:
- return new ARMDarwinTargetAsmInfo(*this);
- case ARMSubtarget::isELF:
- return new ARMELFTargetAsmInfo(*this);
- default:
- return new ARMGenericTargetAsmInfo(*this);
- }
-}
-
// Pass Pipeline Configuration
bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- PM.add(createARMISelDag(*this));
+ PM.add(createARMISelDag(*this, OptLevel));
return false;
}
bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
- if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
+ if (Subtarget.hasNEON())
+ PM.add(createNEONPreAllocPass());
+
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass(true));
return true;
}
-bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel) {
- // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
- if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb())
+bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass());
- if (OptLevel != CodeGenOpt::None &&
- !DisableIfConversion && !Subtarget.isThumb())
- PM.add(createIfConverterPass());
-
- PM.add(createARMConstantIslandPass());
return true;
}
-bool ARMBaseTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
+bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+ if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
+ PM.add(createIfConverterPass());
+
+ if (Subtarget.isThumb2()) {
+ PM.add(createThumb2ITBlockPass());
+ PM.add(createThumb2SizeReductionPass());
+ }
- return false;
+ PM.add(createARMConstantIslandPass());
+ return true;
}
-
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// FIXME: Move this to TargetJITInfo!
if (DefRelocModel == Reloc::Default)
@@ -200,18 +134,11 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
// Machine code emitter pass for ARM.
PM.add(createARMCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// FIXME: Move this to TargetJITInfo!
if (DefRelocModel == Reloc::Default)
@@ -219,43 +146,42 @@ bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ return false;
+}
+
+bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ // FIXME: Move this to TargetJITInfo!
+ if (DefRelocModel == Reloc::Default)
+ setRelocationModel(Reloc::Static);
+ // Machine code emitter pass for ARM.
+ PM.add(createARMObjectCodeEmitterPass(*this, OCE));
return false;
}
bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// Machine code emitter pass for ARM.
PM.add(createARMCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
+bool ARMBaseTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ // Machine code emitter pass for ARM.
+ PM.add(createARMObjectCodeEmitterPass(*this, OCE));
+ return false;
+}
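
Flattening the hooks above for one hypothetical configuration (an -O2, NEON-capable Thumb2 subtarget) shows the backend-specific pass order they produce; LLVMTargetMachine interleaves these with its target-independent passes, so this is a sketch rather than a real driver:

    #include "ARM.h"
    #include "llvm/PassManager.h"
    #include "llvm/CodeGen/Passes.h"
    using namespace llvm;

    static void addARMPassesSketch(PassManagerBase &PM) {
      PM.add(createNEONPreAllocPass());                 // addPreRegAlloc
      PM.add(createARMLoadStoreOptimizationPass(true)); // addPreRegAlloc (pre-RA)
      PM.add(createARMLoadStoreOptimizationPass());     // addPreSched2 (post-RA)
      PM.add(createIfConverterPass());                  // addPreEmitPass
      PM.add(createThumb2ITBlockPass());                // addPreEmitPass
      PM.add(createThumb2SizeReductionPass());          // addPreEmitPass
      PM.add(createARMConstantIslandPass());            // addPreEmitPass (last)
    }
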
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index a0df54d..71a5348 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -16,7 +16,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameInfo.h"
#include "ARMInstrInfo.h"
#include "ARMFrameInfo.h"
#include "ARMJITInfo.h"
@@ -27,8 +26,6 @@
namespace llvm {
-class Module;
-
class ARMBaseTargetMachine : public LLVMTargetMachine {
protected:
ARMSubtarget Subtarget;
@@ -39,16 +36,9 @@ private:
InstrItineraryData InstrItins;
Reloc::Model DefRelocModel; // Reloc model before it's overridden.
-protected:
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- ARMBaseTargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- ARMBaseTargetMachine(const Module &M, const std::string &FS, bool isThumb);
+ ARMBaseTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool isThumb);
virtual const ARMFrameInfo *getFrameInfo() const { return &FrameInfo; }
virtual ARMJITInfo *getJITInfo() { return &JITInfo; }
@@ -57,34 +47,26 @@ public:
return InstrItins;
}
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
-
- static unsigned getModuleMatchQuality(const Module &M);
- static unsigned getJITMatchQuality();
-
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &MCE);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &MCE);
+ ObjectCodeEmitter &OCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE);
};
/// ARMTargetMachine - ARM target machine.
@@ -94,7 +76,8 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
ARMTargetLowering TLInfo;
public:
- ARMTargetMachine(const Module &M, const std::string &FS);
+ ARMTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const ARMRegisterInfo *getRegisterInfo() const {
return &InstrInfo.getRegisterInfo();
@@ -106,9 +89,6 @@ public:
virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetData *getTargetData() const { return &DataLayout; }
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
};
/// ThumbTargetMachine - Thumb target machine.
@@ -120,7 +100,8 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
ARMTargetLowering TLInfo;
public:
- ThumbTargetMachine(const Module &M, const std::string &FS);
+ ThumbTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
/// returns either Thumb1RegisterInfo of Thumb2RegisterInfo
virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
@@ -134,9 +115,6 @@ public:
/// returns either Thumb1InstrInfo or Thumb2InstrInfo
virtual const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo; }
virtual const TargetData *getTargetData() const { return &DataLayout; }
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 0000000..9703403
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,39 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+ class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {}
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+ if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) {
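+        // AAPCS ELF targets place static constructors/destructors in
+        // .init_array/.fini_array rather than the default .ctors/.dtors.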
+ StaticCtorSection =
+ getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ StaticDtorSection =
+ getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ }
+ }
+ };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 0000000..7438ea9
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,618 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLexer.h"
+#include "llvm/MC/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+using namespace llvm;
+
+namespace {
+struct ARMOperand;
+
+// The shift types for register-controlled shifts in ARM memory addressing.
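+// These correspond to the standard ARM shifter operations (LSL, LSR, ASR,
+// ROR) plus RRX (rotate right with extend), which takes no shift amount.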
+enum ShiftType {
+ Lsl,
+ Lsr,
+ Asr,
+ Ror,
+ Rrx
+};
+
+class ARMAsmParser : public TargetAsmParser {
+ MCAsmParser &Parser;
+
+private:
+ MCAsmParser &getParser() const { return Parser; }
+
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+ bool ParseRegister(ARMOperand &Op);
+
+ bool ParseRegisterList(ARMOperand &Op);
+
+ bool ParseMemory(ARMOperand &Op);
+
+ bool ParseShift(enum ShiftType *St, const MCExpr *&ShiftAmount);
+
+ bool ParseOperand(ARMOperand &Op);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+  // TODO: for now, hacked versions of the next two are in this file to
+  // allow some parser testing until the tablegen versions are implemented.
+
+ /// @name Auto-generated Match Functions
+ /// {
+ bool MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
+ MCInst &Inst);
+
+ /// MatchRegisterName - Match the given string to a register name and return
+ /// its register number, or -1 if there is no match. To allow return values
+  /// to be used directly in register lists, ARM registers have values between
+ /// 0 and 15.
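+  /// e.g. "r0"/"R0" map to 0, and the aliases "fp", "ip", "sp", "lr" and
+  /// "pc" map to 11, 12, 13, 14 and 15 respectively.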
+ int MatchRegisterName(const StringRef &Name);
+
+ /// }
+
+
+public:
+ ARMAsmParser(const Target &T, MCAsmParser &_Parser)
+ : TargetAsmParser(T), Parser(_Parser) {}
+
+ virtual bool ParseInstruction(const StringRef &Name, MCInst &Inst);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// instruction.
+struct ARMOperand {
+ enum {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
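+  // Kind is the tag for the union below: exactly one union member is valid
+  // at a time, selected by Kind.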
+
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNum;
+ bool Writeback;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ // This is for all forms of ARM address expressions
+ struct {
+ unsigned BaseRegNum;
+ bool OffsetIsReg;
+ const MCExpr *Offset; // used when OffsetIsReg is false
+ unsigned OffsetRegNum; // used when OffsetIsReg is true
+ bool OffsetRegShifted; // only used when OffsetIsReg is true
+ enum ShiftType ShiftType; // used when OffsetRegShifted is true
+ const MCExpr *ShiftAmount; // used when OffsetRegShifted is true
+ bool Preindexed;
+ bool Postindexed;
+ bool Negative; // only used when OffsetIsReg is true
+ bool Writeback;
+ } Mem;
+
+ };
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ bool isToken() const {return Kind == Token; }
+
+ bool isReg() const { return Kind == Register; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ static ARMOperand CreateToken(StringRef Str) {
+ ARMOperand Res;
+ Res.Kind = Token;
+ Res.Tok.Data = Str.data();
+ Res.Tok.Length = Str.size();
+ return Res;
+ }
+
+ static ARMOperand CreateReg(unsigned RegNum, bool Writeback) {
+ ARMOperand Res;
+ Res.Kind = Register;
+ Res.Reg.RegNum = RegNum;
+ Res.Reg.Writeback = Writeback;
+ return Res;
+ }
+
+ static ARMOperand CreateImm(const MCExpr *Val) {
+ ARMOperand Res;
+ Res.Kind = Immediate;
+ Res.Imm.Val = Val;
+ return Res;
+ }
+
+ static ARMOperand CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
+ const MCExpr *Offset, unsigned OffsetRegNum,
+ bool OffsetRegShifted, enum ShiftType ShiftType,
+ const MCExpr *ShiftAmount, bool Preindexed,
+ bool Postindexed, bool Negative, bool Writeback) {
+ ARMOperand Res;
+ Res.Kind = Memory;
+ Res.Mem.BaseRegNum = BaseRegNum;
+ Res.Mem.OffsetIsReg = OffsetIsReg;
+ Res.Mem.Offset = Offset;
+ Res.Mem.OffsetRegNum = OffsetRegNum;
+ Res.Mem.OffsetRegShifted = OffsetRegShifted;
+ Res.Mem.ShiftType = ShiftType;
+ Res.Mem.ShiftAmount = ShiftAmount;
+ Res.Mem.Preindexed = Preindexed;
+ Res.Mem.Postindexed = Postindexed;
+ Res.Mem.Negative = Negative;
+ Res.Mem.Writeback = Writeback;
+ return Res;
+ }
+};
+
+} // end anonymous namespace.
+
+// Try to parse a register name. The token must be an Identifier when called;
+// if it is a register name, a Reg operand is created, the token is eaten,
+// and false is returned. Otherwise true is returned and no token is eaten.
+// TODO: this is likely to change to allow different register types and/or to
+// parse for a specific register type.
+bool ARMAsmParser::ParseRegister(ARMOperand &Op) {
+ const AsmToken &Tok = getLexer().getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+ // FIXME: Validate register for the current architecture; we have to do
+ // validation later, so maybe there is no need for this here.
+ int RegNum;
+
+ RegNum = MatchRegisterName(Tok.getString());
+ if (RegNum == -1)
+ return true;
+ getLexer().Lex(); // Eat identifier token.
+
+ bool Writeback = false;
+ const AsmToken &ExclaimTok = getLexer().getTok();
+ if (ExclaimTok.is(AsmToken::Exclaim)) {
+ Writeback = true;
+ getLexer().Lex(); // Eat exclaim token
+ }
+
+ Op = ARMOperand::CreateReg(RegNum, Writeback);
+
+ return false;
+}
+
+// Try to parse a register list. For now, the first token must be a '{'
+// when this is called.
+bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
+  assert(getLexer().getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+ getLexer().Lex(); // Eat left curly brace token.
+
+ const AsmToken &RegTok = getLexer().getTok();
+ SMLoc RegLoc = RegTok.getLoc();
+ if (RegTok.isNot(AsmToken::Identifier))
+ return Error(RegLoc, "register expected");
+ int RegNum = MatchRegisterName(RegTok.getString());
+ if (RegNum == -1)
+ return Error(RegLoc, "register expected");
+ getLexer().Lex(); // Eat identifier token.
+ unsigned RegList = 1 << RegNum;
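+  // RegList accumulates the parsed registers as a bitmask with bit N set
+  // for register N (e.g. {r0, r2} gives 0b101), matching the register-list
+  // encoding used by ARM load/store-multiple instructions.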
+
+ int HighRegNum = RegNum;
+ // TODO ranges like "{Rn-Rm}"
+ while (getLexer().getTok().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &RegTok = getLexer().getTok();
+ SMLoc RegLoc = RegTok.getLoc();
+ if (RegTok.isNot(AsmToken::Identifier))
+ return Error(RegLoc, "register expected");
+ int RegNum = MatchRegisterName(RegTok.getString());
+ if (RegNum == -1)
+ return Error(RegLoc, "register expected");
+
+ if (RegList & (1 << RegNum))
+ Warning(RegLoc, "register duplicated in register list");
+ else if (RegNum <= HighRegNum)
+ Warning(RegLoc, "register not in ascending order in register list");
+ RegList |= 1 << RegNum;
+ HighRegNum = RegNum;
+
+ getLexer().Lex(); // Eat identifier token.
+ }
+ const AsmToken &RCurlyTok = getLexer().getTok();
+ if (RCurlyTok.isNot(AsmToken::RCurly))
+ return Error(RCurlyTok.getLoc(), "'}' expected");
+  getLexer().Lex(); // Eat right curly brace token.
+
+ return false;
+}
+
+// Try to parse an ARM memory expression. It must start with a '[' token.
+// TODO: only preindexed and postindexed addressing are handled so far;
+// unindexed with option, etc. are still to do.
+bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
+  assert(getLexer().getTok().is(AsmToken::LBrac) &&
+         "Token is not a Left Bracket");
+ getLexer().Lex(); // Eat left bracket token.
+
+ const AsmToken &BaseRegTok = getLexer().getTok();
+ if (BaseRegTok.isNot(AsmToken::Identifier))
+ return Error(BaseRegTok.getLoc(), "register expected");
+ int BaseRegNum = MatchRegisterName(BaseRegTok.getString());
+ if (BaseRegNum == -1)
+ return Error(BaseRegTok.getLoc(), "register expected");
+ getLexer().Lex(); // Eat identifier token.
+
+ bool Preindexed = false;
+ bool Postindexed = false;
+ bool OffsetIsReg = false;
+ bool Negative = false;
+ bool Writeback = false;
+
+ // First look for preindexed address forms:
+ // [Rn, +/-Rm]
+ // [Rn, #offset]
+ // [Rn, +/-Rm, shift]
+  // that is, after the "[Rn" we have so far, see if the next token is a comma.
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ Preindexed = true;
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &NextTok = getLexer().getTok();
+ if (NextTok.is(AsmToken::Plus))
+ getLexer().Lex(); // Eat plus token.
+ else if (NextTok.is(AsmToken::Minus)) {
+ Negative = true;
+ getLexer().Lex(); // Eat minus token
+ }
+
+ // See if there is a register following the "[Rn," we have so far.
+ const AsmToken &OffsetRegTok = getLexer().getTok();
+ int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+ bool OffsetRegShifted = false;
+ enum ShiftType ShiftType;
+ const MCExpr *ShiftAmount;
+ const MCExpr *Offset;
+ if (OffsetRegNum != -1) {
+ OffsetIsReg = true;
+ getLexer().Lex(); // Eat identifier token for the offset register.
+ // Look for a comma then a shift
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &Tok = getLexer().getTok();
+ if (ParseShift(&ShiftType, ShiftAmount))
+ return Error(Tok.getLoc(), "shift expected");
+ OffsetRegShifted = true;
+ }
+ }
+ else { // "[Rn," we have so far was not followed by "Rm"
+ // Look for #offset following the "[Rn,"
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(Offset))
+ return true;
+ }
+ const AsmToken &RBracTok = getLexer().getTok();
+ if (RBracTok.isNot(AsmToken::RBrac))
+ return Error(RBracTok.getLoc(), "']' expected");
+ getLexer().Lex(); // Eat right bracket token.
+
+ const AsmToken &ExclaimTok = getLexer().getTok();
+ if (ExclaimTok.is(AsmToken::Exclaim)) {
+ Writeback = true;
+ getLexer().Lex(); // Eat exclaim token
+ }
+ Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+ OffsetRegShifted, ShiftType, ShiftAmount,
+ Preindexed, Postindexed, Negative, Writeback);
+ return false;
+ }
+ // The "[Rn" we have so far was not followed by a comma.
+ else if (Tok.is(AsmToken::RBrac)) {
+    // These are the post-indexed addressing forms:
+ // [Rn], #offset
+ // [Rn], +/-Rm
+ // [Rn], +/-Rm, shift
+    // that is, a ']' follows immediately after the "[Rn".
+ Postindexed = true;
+ Writeback = true;
+ getLexer().Lex(); // Eat right bracket token.
+
+ const AsmToken &CommaTok = getLexer().getTok();
+ if (CommaTok.isNot(AsmToken::Comma))
+ return Error(CommaTok.getLoc(), "',' expected");
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &NextTok = getLexer().getTok();
+ if (NextTok.is(AsmToken::Plus))
+ getLexer().Lex(); // Eat plus token.
+ else if (NextTok.is(AsmToken::Minus)) {
+ Negative = true;
+ getLexer().Lex(); // Eat minus token
+ }
+
+ // See if there is a register following the "[Rn]," we have so far.
+ const AsmToken &OffsetRegTok = getLexer().getTok();
+ int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+ bool OffsetRegShifted = false;
+ enum ShiftType ShiftType;
+ const MCExpr *ShiftAmount;
+ const MCExpr *Offset;
+ if (OffsetRegNum != -1) {
+ OffsetIsReg = true;
+ getLexer().Lex(); // Eat identifier token for the offset register.
+ // Look for a comma then a shift
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat comma token.
+
+ const AsmToken &Tok = getLexer().getTok();
+ if (ParseShift(&ShiftType, ShiftAmount))
+ return Error(Tok.getLoc(), "shift expected");
+ OffsetRegShifted = true;
+ }
+ }
+ else { // "[Rn]," we have so far was not followed by "Rm"
+ // Look for #offset following the "[Rn],"
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(Offset))
+ return true;
+ }
+ Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+ OffsetRegShifted, ShiftType, ShiftAmount,
+ Preindexed, Postindexed, Negative, Writeback);
+ return false;
+ }
+
+ return true;
+}
+
+/// ParseShift - Parse one of these two shift forms:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// Returns false if a shift was successfully parsed, true otherwise.
+bool ARMAsmParser::ParseShift(ShiftType *St, const MCExpr *&ShiftAmount) {
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return true;
+ const StringRef &ShiftName = Tok.getString();
+ if (ShiftName == "lsl" || ShiftName == "LSL")
+ *St = Lsl;
+ else if (ShiftName == "lsr" || ShiftName == "LSR")
+ *St = Lsr;
+ else if (ShiftName == "asr" || ShiftName == "ASR")
+ *St = Asr;
+ else if (ShiftName == "ror" || ShiftName == "ROR")
+ *St = Ror;
+ else if (ShiftName == "rrx" || ShiftName == "RRX")
+ *St = Rrx;
+ else
+ return true;
+ getLexer().Lex(); // Eat shift type token.
+
+  // For all but rrx (rotate right with extend) there must be a '#' and a shift amount.
+ if (*St != Rrx) {
+ // Look for # following the shift type
+ const AsmToken &HashTok = getLexer().getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ getLexer().Lex(); // Eat hash token.
+
+ if (getParser().ParseExpression(ShiftAmount))
+ return true;
+ }
+
+ return false;
+}
+
+// A hack to allow some testing
+int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
+ if (Name == "r0" || Name == "R0")
+ return 0;
+ else if (Name == "r1" || Name == "R1")
+ return 1;
+ else if (Name == "r2" || Name == "R2")
+ return 2;
+ else if (Name == "r3" || Name == "R3")
+ return 3;
+ else if (Name == "r3" || Name == "R3")
+ return 3;
+ else if (Name == "r4" || Name == "R4")
+ return 4;
+ else if (Name == "r5" || Name == "R5")
+ return 5;
+ else if (Name == "r6" || Name == "R6")
+ return 6;
+ else if (Name == "r7" || Name == "R7")
+ return 7;
+ else if (Name == "r8" || Name == "R8")
+ return 8;
+ else if (Name == "r9" || Name == "R9")
+ return 9;
+ else if (Name == "r10" || Name == "R10")
+ return 10;
+ else if (Name == "r11" || Name == "R11" || Name == "fp")
+ return 11;
+ else if (Name == "r12" || Name == "R12" || Name == "ip")
+ return 12;
+ else if (Name == "r13" || Name == "R13" || Name == "sp")
+ return 13;
+ else if (Name == "r14" || Name == "R14" || Name == "lr")
+ return 14;
+ else if (Name == "r15" || Name == "R15" || Name == "pc")
+ return 15;
+ return -1;
+}
+
+// A hack to allow some testing
+bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
+ MCInst &Inst) {
+ struct ARMOperand Op0 = Operands[0];
+ assert(Op0.Kind == ARMOperand::Token && "First operand not a Token");
+ const StringRef &Mnemonic = Op0.getToken();
+ if (Mnemonic == "add" ||
+ Mnemonic == "stmfd" ||
+ Mnemonic == "str" ||
+ Mnemonic == "ldmfd" ||
+ Mnemonic == "ldr" ||
+ Mnemonic == "mov" ||
+ Mnemonic == "sub")
+ return false;
+
+ return true;
+}
+
+// TODO - this is a work in progress
+bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
+ switch (getLexer().getKind()) {
+ case AsmToken::Identifier:
+ if (!ParseRegister(Op))
+ return false;
+ // TODO parse other operands that start with an identifier like labels
+ return Error(getLexer().getTok().getLoc(), "labels not yet supported");
+  case AsmToken::LBrac:
+    if (!ParseMemory(Op))
+      return false;
+    return true; // Don't fall through to the register-list case on error.
+  case AsmToken::LCurly:
+    if (!ParseRegisterList(Op))
+      return false;
+    return true; // Don't fall through to the immediate case on error.
+ case AsmToken::Hash:
+ // #42 -> immediate.
+ // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
+ getLexer().Lex();
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val))
+ return true;
+ Op = ARMOperand::CreateImm(Val);
+ return false;
+ default:
+ return Error(getLexer().getTok().getLoc(), "unexpected token in operand");
+ }
+}
+
+bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
+ SmallVector<ARMOperand, 7> Operands;
+
+ Operands.push_back(ARMOperand::CreateToken(Name));
+
+ SMLoc Loc = getLexer().getTok().getLoc();
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ // Read the first operand.
+ Operands.push_back(ARMOperand());
+ if (ParseOperand(Operands.back()))
+ return true;
+
+ while (getLexer().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ Operands.push_back(ARMOperand());
+ if (ParseOperand(Operands.back()))
+ return true;
+ }
+ }
+ if (!MatchInstruction(Operands, Inst))
+ return false;
+
+ Error(Loc, "ARMAsmParser::ParseInstruction only partly implemented");
+ return true;
+}
+
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(4, DirectiveID.getLoc());
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().ParseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ getLexer().Lex();
+ }
+ }
+
+ getLexer().Lex();
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmParser() {
+ RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
+ RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget);
+}
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..308c6cf
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMAsmParser
+ ARMAsmParser.cpp
+ )
+
diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile
new file mode 100644
index 0000000..97e5612
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmParser
+
+# Hack: we need to include 'main' ARM target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 434a19a..546731b 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -1,5 +1,3 @@
-//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===//
-//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
@@ -21,23 +19,30 @@
#include "ARMMachineFunctionInfo.h"
#include "llvm/Constants.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
+#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Mangler.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
#include <cctype>
using namespace llvm;
@@ -45,7 +50,6 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed");
namespace {
class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
- DwarfWriter *DW;
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when printing asm code for different targets.
@@ -68,22 +72,18 @@ namespace {
/// GVNonLazyPtrs - Keeps the set of GlobalValues that require
/// non-lazy-pointers for indirect access.
- StringSet<> GVNonLazyPtrs;
+ StringMap<std::string> GVNonLazyPtrs;
/// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden
/// visibility that require non-lazy-pointers for indirect access.
- StringSet<> HiddenGVNonLazyPtrs;
-
- /// FnStubs - Keeps the set of external function GlobalAddresses that the
- /// asm printer should generate stubs for.
- StringSet<> FnStubs;
+ StringMap<std::string> HiddenGVNonLazyPtrs;
/// True if asm printer is printing a series of CONSTPOOL_ENTRY.
bool InCPMode;
public:
- explicit ARMAsmPrinter(raw_ostream &O, TargetMachine &TM,
- const TargetAsmInfo *T, bool V)
- : AsmPrinter(O, TM, T, V), DW(0), AFI(NULL), MCP(NULL),
+ explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+ : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL),
InCPMode(false) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
}
@@ -110,6 +110,7 @@ namespace {
const char *Modifier = 0);
void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum);
+ void printThumbITMask(const MachineInstr *MI, int OpNum);
void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum);
void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum,
unsigned Scale);
@@ -118,10 +119,10 @@ namespace {
void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum);
void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum);
- void printT2SOImmOperand(const MachineInstr *MI, int OpNum);
void printT2SOOperand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum);
+ void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum);
void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum);
void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum);
@@ -132,6 +133,9 @@ namespace {
void printCPInstOperand(const MachineInstr *MI, int OpNum,
const char *Modifier);
void printJTBlockOperand(const MachineInstr *MI, int OpNum);
+ void printJT2BlockOperand(const MachineInstr *MI, int OpNum);
+ void printTBAddrMode(const MachineInstr *MI, int OpNum);
+ void printNoHashImmediate(const MachineInstr *MI, int OpNum);
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode);
@@ -139,12 +143,14 @@ namespace {
unsigned AsmVariant,
const char *ExtraCode);
- void printModuleLevelGV(const GlobalVariable* GVar);
- bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void PrintGlobalVariable(const GlobalVariable* GVar);
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
void printMachineInstruction(const MachineInstr *MI);
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
bool doFinalization(Module &M);
+ void EmitStartOfAsmFile(Module &M);
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
@@ -153,24 +159,35 @@ namespace {
ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
GlobalValue *GV = ACPV->getGV();
- std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix();
- if (!GV)
- Name += ACPV->getSymbol();
- if (ACPV->isNonLazyPointer()) {
- if (GV->hasHiddenVisibility())
- HiddenGVNonLazyPtrs.insert(Name);
- else
- GVNonLazyPtrs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- } else if (ACPV->isStub()) {
- FnStubs.insert(Name);
- printSuffixedName(Name, "$stub");
+ std::string Name;
+
+ if (ACPV->isLSDA()) {
+ SmallString<16> LSDAName;
+ raw_svector_ostream(LSDAName) << MAI->getPrivateGlobalPrefix() <<
+ "_LSDA_" << getFunctionNumber();
+ Name = LSDAName.str();
+ } else if (GV) {
+ bool isIndirect = Subtarget->isTargetDarwin() &&
+ Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+ if (!isIndirect)
+ Name = Mang->getMangledName(GV);
+ else {
+ // FIXME: Remove this when Darwin transition to @GOT like syntax.
+ std::string SymName = Mang->getMangledName(GV);
+ Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
+ if (GV->hasHiddenVisibility())
+ HiddenGVNonLazyPtrs[SymName] = Name;
+ else
+ GVNonLazyPtrs[SymName] = Name;
+ }
} else
- O << Name;
+ Name = Mang->makeNameProper(ACPV->getSymbol());
+ O << Name;
+
if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
if (ACPV->getPCAdjustment() != 0) {
- O << "-(" << TAI->getPrivateGlobalPrefix() << "PC"
- << utostr(ACPV->getLabelId())
+ O << "-(" << MAI->getPrivateGlobalPrefix() << "PC"
+ << ACPV->getLabelId()
<< "+" << (unsigned)ACPV->getPCAdjustment();
if (ACPV->mustAddCurrentAddress())
O << "-.";
@@ -178,7 +195,7 @@ namespace {
}
O << "\n";
}
-
+
void getAnalysisUsage(AnalysisUsage &AU) const {
AsmPrinter::getAnalysisUsage(AU);
AU.setPreservesAll();
@@ -205,38 +222,39 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// NOTE: we don't print out constant pools here, they are handled as
// instructions.
- O << "\n";
+ O << '\n';
+
// Print out labels for the function.
const Function *F = MF.getFunction();
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::PrivateLinkage:
case Function::InternalLinkage:
- SwitchToTextSection("\t.text", F);
break;
case Function::ExternalLinkage:
- SwitchToTextSection("\t.text", F);
O << "\t.globl\t" << CurrentFnName << "\n";
break;
+ case Function::LinkerPrivateLinkage:
case Function::WeakAnyLinkage:
case Function::WeakODRLinkage:
case Function::LinkOnceAnyLinkage:
case Function::LinkOnceODRLinkage:
if (Subtarget->isTargetDarwin()) {
- SwitchToTextSection(
- ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F);
O << "\t.globl\t" << CurrentFnName << "\n";
O << "\t.weak_definition\t" << CurrentFnName << "\n";
} else {
- O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+ O << MAI->getWeakRefDirective() << CurrentFnName << "\n";
}
break;
}
printVisibility(CurrentFnName, F->getVisibility());
+ unsigned FnAlign = 1 << MF.getAlignment(); // MF alignment is log2.
if (AFI->isThumbFunction()) {
- EmitAlignment(MF.getAlignment(), F, AFI->getAlign());
+ EmitAlignment(FnAlign, F, AFI->getAlign());
O << "\t.code\t16\n";
O << "\t.thumb_func";
if (Subtarget->isTargetDarwin())
@@ -244,7 +262,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
O << "\n";
InCPMode = false;
} else {
- EmitAlignment(MF.getAlignment(), F);
+ EmitAlignment(FnAlign, F);
}
O << CurrentFnName << ":\n";
@@ -266,8 +284,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true, VerboseAsm);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
@@ -276,14 +293,12 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (TAI->hasDotTypeDotSizeDirective())
+ if (MAI->hasDotTypeDotSizeDirective())
O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
// Emit post-function debug information.
DW->EndFunction(&MF);
- O.flush();
-
return false;
}
@@ -298,37 +313,39 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
O << '{'
- << TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
+ << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
<< '}';
+ } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+ unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+ unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+ &ARM::DPR_VFP2RegClass);
+ O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
} else {
- O << TRI->getAsmName(Reg);
+ O << getRegisterName(Reg);
}
} else
- assert(0 && "not implemented");
+ llvm_unreachable("not implemented");
break;
}
case MachineOperand::MO_Immediate: {
- if (!Modifier || strcmp(Modifier, "no_hash") != 0)
- O << "#";
-
- O << MO.getImm();
+ int64_t Imm = MO.getImm();
+ O << '#';
+ if (Modifier) {
+ if (strcmp(Modifier, "lo16") == 0)
+ O << ":lower16:";
+ else if (strcmp(Modifier, "hi16") == 0)
+ O << ":upper16:";
+ }
+ O << Imm;
break;
}
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
GlobalValue *GV = MO.getGlobal();
- std::string Name = Mang->getValueName(GV);
- bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
- GV->hasLinkOnceLinkage());
- if (isExt && isCallOp && Subtarget->isTargetDarwin() &&
- TM.getRelocationModel() != Reloc::Static) {
- printSuffixedName(Name, "$stub");
- FnStubs.insert(Name);
- } else
- O << Name;
+ O << Mang->getMangledName(GV);
printOffset(MO.getOffset());
@@ -339,25 +356,20 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
case MachineOperand::MO_ExternalSymbol: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
- std::string Name(TAI->getGlobalPrefix());
- Name += MO.getSymbolName();
- if (isCallOp && Subtarget->isTargetDarwin() &&
- TM.getRelocationModel() != Reloc::Static) {
- printSuffixedName(Name, "$stub");
- FnStubs.insert(Name);
- } else
- O << Name;
+ std::string Name = Mang->makeNameProper(MO.getSymbolName());
+
+ O << Name;
if (isCallOp && Subtarget->isTargetELF() &&
TM.getRelocationModel() == Reloc::PIC_)
O << "(PLT)";
break;
}
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
default:
@@ -365,9 +377,12 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
}
-static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
- const TargetAsmInfo *TAI) {
- assert(V < (1 << 12) && "Not a valid so_imm value!");
+static void printSOImm(formatted_raw_ostream &O, int64_t V, bool VerboseAsm,
+ const MCAsmInfo *MAI) {
+ // Break it up into two parts that make up a shifter immediate.
+ V = ARM_AM::getSOImmVal(V);
+ assert(V != -1 && "Not a valid so_imm value!");
+
unsigned Imm = ARM_AM::getSOImmValImm(V);
unsigned Rot = ARM_AM::getSOImmValRot(V);
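+  // An so_imm encodes an 8-bit immediate rotated right by an even amount;
+  // Imm and Rot are those two decoded fields, so the logical value printed
+  // in the verbose comment below is rotr32(Imm, Rot).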
@@ -377,7 +392,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
O << "#" << Imm << ", " << Rot;
// Pretty printed version.
if (VerboseAsm)
- O << ' ' << TAI->getCommentString()
+ O << ' ' << MAI->getCommentString()
<< ' ' << (int)ARM_AM::rotr32(Imm, Rot);
} else {
O << "#" << Imm;
@@ -389,7 +404,7 @@ static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) {
const MachineOperand &MO = MI->getOperand(OpNum);
assert(MO.isImm() && "Not a valid so_imm value!");
- printSOImm(O, MO.getImm(), VerboseAsm, TAI);
+ printSOImm(O, MO.getImm(), VerboseAsm, MAI);
}
/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
@@ -399,15 +414,15 @@ void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) {
assert(MO.isImm() && "Not a valid so_imm value!");
unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm());
unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm());
- printSOImm(O, ARM_AM::getSOImmVal(V1), VerboseAsm, TAI);
+ printSOImm(O, V1, VerboseAsm, MAI);
O << "\n\torr";
printPredicateOperand(MI, 2);
O << " ";
- printOperand(MI, 0);
+ printOperand(MI, 0);
O << ", ";
- printOperand(MI, 0);
+ printOperand(MI, 0);
O << ", ";
- printSOImm(O, ARM_AM::getSOImmVal(V2), VerboseAsm, TAI);
+ printSOImm(O, V2, VerboseAsm, MAI);
}
// so_reg is a 4-operand unit corresponding to register forms of the A5.1
@@ -420,8 +435,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
const MachineOperand &MO2 = MI->getOperand(Op+1);
const MachineOperand &MO3 = MI->getOperand(Op+2);
- assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << getRegisterName(MO1.getReg());
// Print the shift opc.
O << ", "
@@ -429,8 +443,7 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
<< " ";
if (MO2.getReg()) {
- assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
- O << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ O << getRegisterName(MO2.getReg());
assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
} else {
O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
@@ -447,7 +460,7 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
return;
}
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (!MO2.getReg()) {
if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
@@ -460,8 +473,8 @@ void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
O << ", "
<< (char)ARM_AM::getAM2Op(MO3.getImm())
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
-
+ << getRegisterName(MO2.getReg());
+
if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
O << ", "
<< ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
@@ -483,8 +496,8 @@ void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){
}
O << (char)ARM_AM::getAM2Op(MO2.getImm())
- << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
-
+ << getRegisterName(MO1.getReg());
+
if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
O << ", "
<< ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
@@ -495,18 +508,18 @@ void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
const MachineOperand &MO3 = MI->getOperand(Op+2);
-
+
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (MO2.getReg()) {
O << ", "
<< (char)ARM_AM::getAM3Op(MO3.getImm())
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName
+ << getRegisterName(MO2.getReg())
<< "]";
return;
}
-
+
if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
O << ", #"
<< (char)ARM_AM::getAM3Op(MO3.getImm())
@@ -520,7 +533,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
if (MO1.getReg()) {
O << (char)ARM_AM::getAM3Op(MO2.getImm())
- << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ << getRegisterName(MO1.getReg());
return;
}
@@ -530,7 +543,7 @@ void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
<< (char)ARM_AM::getAM3Op(MO2.getImm())
<< ImmOffs;
}
-
+
void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
const char *Modifier) {
const MachineOperand &MO1 = MI->getOperand(Op);
@@ -538,11 +551,18 @@ void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
if (Modifier && strcmp(Modifier, "submode") == 0) {
if (MO1.getReg() == ARM::SP) {
+ // FIXME
bool isLDM = (MI->getOpcode() == ARM::LDM ||
- MI->getOpcode() == ARM::LDM_RET);
+ MI->getOpcode() == ARM::LDM_RET ||
+ MI->getOpcode() == ARM::t2LDM ||
+ MI->getOpcode() == ARM::t2LDM_RET);
O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
} else
O << ARM_AM::getAMSubModeStr(Mode);
+ } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+ ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+ if (Mode == ARM_AM::ia)
+ O << ".w";
} else {
printOperand(MI, Op);
if (ARM_AM::getAM4WBFlag(MO2.getImm()))
@@ -559,7 +579,7 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
printOperand(MI, Op);
return;
}
-
+
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
if (Modifier && strcmp(Modifier, "submode") == 0) {
@@ -573,14 +593,14 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
return;
} else if (Modifier && strcmp(Modifier, "base") == 0) {
// Used for FSTM{D|S} and LSTM{D|S} operations.
- O << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << getRegisterName(MO1.getReg());
if (ARM_AM::getAM5WBFlag(MO2.getImm()))
O << "!";
return;
}
-
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
-
+
+ O << "[" << getRegisterName(MO1.getReg());
+
if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
O << ", #"
<< (char)ARM_AM::getAM5Op(MO2.getImm())
@@ -595,13 +615,13 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op) {
const MachineOperand &MO3 = MI->getOperand(Op+2);
// FIXME: No support yet for specifying alignment.
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]";
+ O << "[" << getRegisterName(MO1.getReg()) << "]";
if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
if (MO2.getReg() == 0)
O << "!";
else
- O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ O << ", " << getRegisterName(MO2.getReg());
}
}
@@ -614,7 +634,7 @@ void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
const MachineOperand &MO1 = MI->getOperand(Op);
assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
- O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName << "]";
+ O << "[pc, +" << getRegisterName(MO1.getReg()) << "]";
}
void
@@ -630,11 +650,26 @@ ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op) {
//===--------------------------------------------------------------------===//
void
+ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op) {
+ // (3 - the number of trailing zeros) is the number of then / else.
+ unsigned Mask = MI->getOperand(Op).getImm();
+ unsigned NumTZ = CountTrailingZeros_32(Mask);
+ assert(NumTZ <= 3 && "Invalid IT mask!");
+ for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+ bool T = (Mask & (1 << Pos)) == 0;
+ if (T)
+ O << 't';
+ else
+ O << 'e';
+ }
+}
+
+void
ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
- O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).AsmName << "]";
+ O << "[" << getRegisterName(MO1.getReg());
+ O << ", " << getRegisterName(MO2.getReg()) << "]";
}
void
@@ -649,9 +684,9 @@ ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
return;
}
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (MO3.getReg())
- O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).AsmName;
+ O << ", " << getRegisterName(MO3.getReg());
else if (unsigned ImmOffs = MO2.getImm()) {
O << ", #" << ImmOffs;
if (Scale > 1)
@@ -676,7 +711,7 @@ ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) {
void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
const MachineOperand &MO1 = MI->getOperand(Op);
const MachineOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
if (unsigned ImmOffs = MO2.getImm())
O << ", #" << ImmOffs << " * 4";
O << "]";
@@ -684,20 +719,6 @@ void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
//===--------------------------------------------------------------------===//
-/// printT2SOImmOperand - T2SOImm is:
-/// 1. a 4-bit splat control value and 8 bit immediate value
-/// 2. a 5-bit rotate amount and a non-zero 8-bit immediate value
-/// represented by a normalizedin 7-bit value (msb is always 1)
-void ARMAsmPrinter::printT2SOImmOperand(const MachineInstr *MI, int OpNum) {
- const MachineOperand &MO = MI->getOperand(OpNum);
- assert(MO.isImm() && "Not a valid so_imm value!");
-
- unsigned Imm = ARM_AM::getT2SOImmValDecode(MO.getImm());
- // Always print the immediate directly, as the "rotate" form
- // is deprecated in some contexts.
- O << "#" << Imm;
-}
-
// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
// register with shift forms.
// REG 0 0 - e.g. R5
@@ -708,7 +729,7 @@ void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum) {
unsigned Reg = MO1.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(Reg));
- O << TM.getRegisterInfo()->getAsmName(Reg);
+ O << getRegisterName(Reg);
// Print the shift opc.
O << ", "
@@ -724,7 +745,7 @@ void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI,
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
unsigned OffImm = MO2.getImm();
if (OffImm) // Don't print +0.
@@ -737,7 +758,7 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
int32_t OffImm = (int32_t)MO2.getImm();
// Don't print +0.
@@ -748,6 +769,22 @@ void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
O << "]";
}
+void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI,
+ int OpNum) {
+ const MachineOperand &MO1 = MI->getOperand(OpNum);
+ const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+ O << "[" << getRegisterName(MO1.getReg());
+
+ int32_t OffImm = (int32_t)MO2.getImm() / 4;
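+  // The stored offset is in bytes and must be a multiple of 4; it is
+  // printed in the scaled "#imm * 4" form below.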
+ // Don't print +0.
+ if (OffImm < 0)
+ O << ", #-" << -OffImm << " * 4";
+ else if (OffImm > 0)
+ O << ", #+" << OffImm << " * 4";
+ O << "]";
+}
+
void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI,
int OpNum) {
const MachineOperand &MO1 = MI->getOperand(OpNum);
@@ -765,17 +802,15 @@ void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI,
const MachineOperand &MO2 = MI->getOperand(OpNum+1);
const MachineOperand &MO3 = MI->getOperand(OpNum+2);
- O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).AsmName;
+ O << "[" << getRegisterName(MO1.getReg());
- if (MO2.getReg()) {
- O << ", +"
- << TM.getRegisterInfo()->get(MO2.getReg()).AsmName;
+ assert(MO2.getReg() && "Invalid so_reg load / store address!");
+ O << ", " << getRegisterName(MO2.getReg());
- unsigned ShAmt = MO3.getImm();
- if (ShAmt) {
- assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
- O << ", lsl #" << ShAmt;
- }
+ unsigned ShAmt = MO3.getImm();
+ if (ShAmt) {
+ assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+ O << ", lsl #" << ShAmt;
}
O << "]";
}
@@ -799,14 +834,17 @@ void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum){
void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum) {
int Id = (int)MI->getOperand(OpNum).getImm();
- O << TAI->getPrivateGlobalPrefix() << "PC" << Id;
+ O << MAI->getPrivateGlobalPrefix() << "PC" << Id;
}
void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum) {
O << "{";
- for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+ // Always skip the first operand, it's the optional (and implicit writeback).
+ for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).isImplicit())
+ continue;
+ if ((int)i != OpNum+1) O << ", ";
printOperand(MI, i);
- if (i != e-1) O << ", ";
}
O << "}";
}
@@ -818,14 +856,14 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
// data itself.
if (!strcmp(Modifier, "label")) {
unsigned ID = MI->getOperand(OpNum).getImm();
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << ID << ":\n";
} else {
assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
unsigned CPI = MI->getOperand(OpNum).getIndex();
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
-
+
if (MCPE.isMachineConstantPoolEntry()) {
EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
} else {
@@ -835,57 +873,119 @@ void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
}
void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) {
+ assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!");
+
const MachineOperand &MO1 = MI->getOperand(OpNum);
const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
unsigned JTI = MO1.getIndex();
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << JTI << '_' << MO2.getImm() << ":\n";
- const char *JTEntryDirective = TAI->getJumpTableDirective();
- if (!JTEntryDirective)
- JTEntryDirective = TAI->getData32bitsDirective();
+ const char *JTEntryDirective = MAI->getData32bitsDirective();
const MachineFunction *MF = MI->getParent()->getParent();
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
- bool UseSet= TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
- std::set<MachineBasicBlock*> JTSets;
+ bool UseSet= MAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
+ SmallPtrSet<MachineBasicBlock*, 8> JTSets;
for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
MachineBasicBlock *MBB = JTBBs[i];
- if (UseSet && JTSets.insert(MBB).second)
+ bool isNew = JTSets.insert(MBB);
+
+ if (UseSet && isNew)
printPICJumpTableSetLabel(JTI, MO2.getImm(), MBB);
O << JTEntryDirective << ' ';
if (UseSet)
- O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << getFunctionNumber()
<< '_' << JTI << '_' << MO2.getImm()
<< "_set_" << MBB->getNumber();
else if (TM.getRelocationModel() == Reloc::PIC_) {
- printBasicBlockLabel(MBB, false, false, false);
- // If the arch uses custom Jump Table directives, don't calc relative to JT
- if (!TAI->getJumpTableDirective())
- O << '-' << TAI->getPrivateGlobalPrefix() << "JTI"
- << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm();
- } else
- printBasicBlockLabel(MBB, false, false, false);
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ O << '-' << MAI->getPrivateGlobalPrefix() << "JTI"
+ << getFunctionNumber() << '_' << JTI << '_' << MO2.getImm();
+ } else {
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ }
+ if (i != e-1)
+ O << '\n';
+ }
+}
+
+void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum) {
+ const MachineOperand &MO1 = MI->getOperand(OpNum);
+ const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+ unsigned JTI = MO1.getIndex();
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm() << ":\n";
+
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ bool ByteOffset = false, HalfWordOffset = false;
+ if (MI->getOpcode() == ARM::t2TBB)
+ ByteOffset = true;
+ else if (MI->getOpcode() == ARM::t2TBH)
+ HalfWordOffset = true;
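+  // t2TBB/t2TBH table entries hold (destination - table base) / 2, i.e. the
+  // offset in halfwords, since Thumb instructions are 2-byte aligned.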
+
+ for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = JTBBs[i];
+ if (ByteOffset)
+ O << MAI->getData8bitsDirective();
+ else if (HalfWordOffset)
+ O << MAI->getData16bitsDirective();
+ if (ByteOffset || HalfWordOffset) {
+ O << '(';
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ O << "-" << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << JTI << '_' << MO2.getImm() << ")/2";
+ } else {
+ O << "\tb.w ";
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ }
if (i != e-1)
O << '\n';
}
+
+ // Make sure the instruction that follows TBB is 2-byte aligned.
+ // FIXME: Constant island pass should insert an "ALIGN" instruction instead.
+ if (ByteOffset && (JTBBs.size() & 1)) {
+ O << '\n';
+ EmitAlignment(1);
+ }
+}
+
+void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum) {
+ O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg());
+ if (MI->getOpcode() == ARM::t2TBH)
+ O << ", lsl #1";
+ O << ']';
}
+void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum) {
+ O << MI->getOperand(OpNum).getImm();
+}
bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode){
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
-
+
switch (ExtraCode[0]) {
default: return true; // Unknown modifier.
- case 'a': // Don't print "#" before a global var name or constant.
- case 'c': // Don't print "$" before a global var name or constant.
- printOperand(MI, OpNum, "no_hash");
+ case 'a': // Print as a memory address.
+ if (MI->getOperand(OpNum).isReg()) {
+ O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]";
+ return false;
+ }
+ // Fallthrough
+ case 'c': // Don't print "#" before an immediate operand.
+ if (!MI->getOperand(OpNum).isImm())
+ return true;
+ printNoHashImmediate(MI, OpNum);
return false;
case 'P': // Print a VFP double precision register.
printOperand(MI, OpNum);
@@ -898,7 +998,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (TM.getTargetData()->isBigEndian())
break;
// Fallthrough
- case 'H': // Write second word of DI / DF reference.
+ case 'H': // Write second word of DI / DF reference.
// Verify that this operand has two consecutive registers.
if (!MI->getOperand(OpNum).isReg() ||
OpNum+1 == MI->getNumOperands() ||
@@ -907,7 +1007,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
++OpNum; // Return the high-part.
}
}
-
+
printOperand(MI, OpNum);
return false;
}
@@ -917,7 +1017,10 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
const char *ExtraCode) {
if (ExtraCode && ExtraCode[0])
return true; // Unknown modifier.
- printAddrMode2Operand(MI, OpNum);
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isReg() && "unexpected inline asm memory operand");
+ O << "[" << getRegisterName(MO.getReg()) << "]";
return false;
}
@@ -938,16 +1041,47 @@ void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
}}
// Call the autogenerated instruction printer routines.
+ processDebugLoc(MI, true);
printInstruction(MI);
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+ processDebugLoc(MI, false);
}
-bool ARMAsmPrinter::doInitialization(Module &M) {
-
- bool Result = AsmPrinter::doInitialization(M);
- DW = getAnalysisIfAvailable<DwarfWriter>();
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (Subtarget->isTargetDarwin()) {
+ Reloc::Model RelocM = TM.getRelocationModel();
+ if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+ // Declare all the text sections up front (before the DWARF sections
+ // emitted by AsmPrinter::doInitialization) so the assembler will keep
+ // them together at the beginning of the object file. This helps
+ // avoid out-of-range branches that are due a fundamental limitation of
+ // the way symbol offsets are encoded with the current Darwin ARM
+ // relocations.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ OutStreamer.SwitchSection(TLOFMacho.getTextSection());
+ OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+ OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection());
+ if (RelocM == Reloc::DynamicNoPIC) {
+ const MCSection *sect =
+ TLOFMacho.getMachOSection("__TEXT", "__symbol_stub4",
+ MCSectionMachO::S_SYMBOL_STUBS,
+ 12, SectionKind::getText());
+ OutStreamer.SwitchSection(sect);
+ } else {
+ const MCSection *sect =
+ TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub4",
+ MCSectionMachO::S_SYMBOL_STUBS,
+ 16, SectionKind::getText());
+ OutStreamer.SwitchSection(sect);
+ }
+ }
+ }
- // Thumb-2 instructions are supported only in unified assembler syntax mode.
- if (Subtarget->hasThumb2())
+ // Use unified assembler syntax mode for Thumb.
+ if (Subtarget->isThumb())
O << "\t.syntax unified\n";
// Emit ARM Build Attributes
@@ -975,22 +1109,16 @@ bool ARMAsmPrinter::doInitialization(Module &M) {
O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_needed << ", 1\n"
<< "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_preserved << ", 1\n";
+ // Hard float. Use both S and D registers and conform to AAPCS-VFP.
+ if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard)
+ O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_HardFP_use << ", 3\n"
+ << "\t.eabi_attribute " << ARMBuildAttrs::ABI_VFP_args << ", 1\n";
+
// FIXME: Should we signal R9 usage?
}
-
- return Result;
-}
-
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \\n or \\0.
-static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
- for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
- Name != E; ++Name)
- if (isprint(*Name))
- OS << *Name;
}
-void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void ARMAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer()) // External globals require no code
@@ -1009,10 +1137,8 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
return;
}
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
const Type *Type = C->getType();
unsigned Size = TD->getTypeAllocSize(Type);
unsigned Align = TD->getPreferredAlignmentLog(GVar);
@@ -1023,14 +1149,16 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (Subtarget->isTargetELF())
O << "\t.type " << name << ",%object\n";
- if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() &&
- !(isDarwin &&
- TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) {
- // FIXME: This seems to be pretty darwin-specific
+ const MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GVar, Mang, TM);
+ OutStreamer.SwitchSection(TheSection);
+ // FIXME: get this stuff from section kind flags.
+ if (C->isNullValue() && !GVar->hasSection() && !GVar->isThreadLocal() &&
+ // Don't put things that should go in the cstring section into "comm".
+ !TheSection->getKind().isMergeableCString()) {
if (GVar->hasExternalLinkage()) {
- SwitchToSection(TAI->SectionForGlobal(GVar));
- if (const char *Directive = TAI->getZeroFillDirective()) {
+ if (const char *Directive = MAI->getZeroFillDirective()) {
O << "\t.globl\t" << name << "\n";
O << Directive << "__DATA, __common, " << name << ", "
<< Size << ", " << Align << "\n";
@@ -1043,57 +1171,56 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (isDarwin) {
if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << "," << Size
+ O << MAI->getLCOMMDirective() << name << "," << Size
<< ',' << Align;
} else if (GVar->hasCommonLinkage()) {
- O << TAI->getCOMMDirective() << name << "," << Size
+ O << MAI->getCOMMDirective() << name << "," << Size
<< ',' << Align;
} else {
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(TheSection);
O << "\t.globl " << name << '\n'
- << TAI->getWeakDefDirective() << name << '\n';
+ << MAI->getWeakDefDirective() << name << '\n';
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << ' ';
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << '\n';
EmitGlobalConstant(C);
return;
}
- } else if (TAI->getLCOMMDirective() != NULL) {
+ } else if (MAI->getLCOMMDirective() != NULL) {
if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << "," << Size;
+ O << MAI->getLCOMMDirective() << name << "," << Size;
} else {
- O << TAI->getCOMMDirective() << name << "," << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
- O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ O << MAI->getCOMMDirective() << name << "," << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
}
} else {
- SwitchToSection(TAI->SectionForGlobal(GVar));
if (GVar->hasLocalLinkage())
O << "\t.local\t" << name << "\n";
- O << TAI->getCOMMDirective() << name << "," << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
- O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ O << MAI->getCOMMDirective() << name << "," << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << "," << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
}
if (VerboseAsm) {
- O << "\t\t" << TAI->getCommentString() << " ";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t" << MAI->getCommentString() << " ";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << "\n";
return;
}
}
- SwitchToSection(TAI->SectionForGlobal(GVar));
switch (GVar->getLinkage()) {
- case GlobalValue::CommonLinkage:
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::WeakAnyLinkage:
- case GlobalValue::WeakODRLinkage:
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
if (isDarwin) {
O << "\t.globl " << name << "\n"
<< "\t.weak_definition " << name << "\n";
@@ -1101,28 +1228,27 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << "\t.weak " << name << "\n";
}
break;
- case GlobalValue::AppendingLinkage:
- // FIXME: appending linkage variables should go into a section of
- // their name or something. For now, just emit them as external.
- case GlobalValue::ExternalLinkage:
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
O << "\t.globl " << name << "\n";
- // FALL THROUGH
- case GlobalValue::PrivateLinkage:
- case GlobalValue::InternalLinkage:
break;
- default:
- assert(0 && "Unknown linkage type!");
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
break;
+ default:
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << " ";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << " ";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << "\n";
- if (TAI->hasDotTypeDotSizeDirective())
+ if (MAI->hasDotTypeDotSizeDirective())
O << "\t.size " << name << ", " << Size << "\n";
EmitGlobalConstant(C);
@@ -1131,83 +1257,36 @@ void ARMAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
bool ARMAsmPrinter::doFinalization(Module &M) {
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
-
if (Subtarget->isTargetDarwin()) {
- SwitchToDataSection("");
-
- // Output stubs for dynamically-linked functions
- for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
- i != e; ++i) {
- if (TM.getRelocationModel() == Reloc::PIC_)
- SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs,"
- "none,16", 0);
- else
- SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs,"
- "none,12", 0);
+ // All Darwin targets use Mach-O.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
- EmitAlignment(2);
- O << "\t.code\t32\n";
-
- const char *p = i->getKeyData();
- printSuffixedName(p, "$stub");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
- O << "\tldr ip, ";
- printSuffixedName(p, "$slp");
- O << "\n";
- if (TM.getRelocationModel() == Reloc::PIC_) {
- printSuffixedName(p, "$scv");
- O << ":\n";
- O << "\tadd ip, pc, ip\n";
- }
- O << "\tldr pc, [ip, #0]\n";
- printSuffixedName(p, "$slp");
- O << ":\n";
- O << "\t.long\t";
- printSuffixedName(p, "$lazy_ptr");
- if (TM.getRelocationModel() == Reloc::PIC_) {
- O << "-(";
- printSuffixedName(p, "$scv");
- O << "+8)\n";
- } else
- O << "\n";
- SwitchToDataSection(".lazy_symbol_pointer", 0);
- printSuffixedName(p, "$lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
- O << "\t.long\tdyld_stub_binding_helper\n";
- }
- O << "\n";
+ O << '\n';
// Output non-lazy-pointers for external and common global variables.
if (!GVNonLazyPtrs.empty()) {
- SwitchToDataSection("\t.non_lazy_symbol_pointer", 0);
- for (StringSet<>::iterator i = GVNonLazyPtrs.begin(),
- e = GVNonLazyPtrs.end(); i != e; ++i) {
- const char *p = i->getKeyData();
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << "\n";
+ // Switch to the ".non_lazy_symbol_pointer" section.
+ OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+ EmitAlignment(2);
+ for (StringMap<std::string>::iterator I = GVNonLazyPtrs.begin(),
+ E = GVNonLazyPtrs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << "\n";
O << "\t.long\t0\n";
}
}
if (!HiddenGVNonLazyPtrs.empty()) {
- SwitchToSection(TAI->getDataSection());
- for (StringSet<>::iterator i = HiddenGVNonLazyPtrs.begin(),
- e = HiddenGVNonLazyPtrs.end(); i != e; ++i) {
- const char *p = i->getKeyData();
- EmitAlignment(2);
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- O << "\t.long " << p << "\n";
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(2);
+ for (StringMap<std::string>::iterator I = HiddenGVNonLazyPtrs.begin(),
+ E = HiddenGVNonLazyPtrs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.long " << I->getKeyData() << "\n";
}
}
-
// Funny Darwin hack: This flag tells the linker that no global symbols
// contain code that falls through to other global symbols (e.g. the obvious
// implementation of multiple entry points). If this doesn't occur, the
@@ -1219,24 +1298,8 @@ bool ARMAsmPrinter::doFinalization(Module &M) {
return AsmPrinter::doFinalization(M);
}
-/// createARMCodePrinterPass - Returns a pass that prints the ARM
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description. This should work
-/// regardless of whether the function is in SSA form.
-///
-FunctionPass *llvm::createARMCodePrinterPass(raw_ostream &o,
- ARMBaseTargetMachine &tm,
- bool verbose) {
- return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
-namespace {
- static struct Register {
- Register() {
- ARMBaseTargetMachine::registerAsmPrinter(createARMCodePrinterPass);
- }
- } Registrator;
-}
-
// Force static initialization.
-extern "C" void LLVMInitializeARMAsmPrinter() { }
+extern "C" void LLVMInitializeARMAsmPrinter() {
+ RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
+ RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+}
diff --git a/lib/Target/ARM/AsmPrinter/Makefile b/lib/Target/ARM/AsmPrinter/Makefile
index ce36cec..208becc 100644
--- a/lib/Target/ARM/AsmPrinter/Makefile
+++ b/lib/Target/ARM/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
+##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 9c46fe0..6e09eb2 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -12,6 +12,8 @@ tablegen(ARMGenCallingConv.inc -gen-callingconv)
tablegen(ARMGenSubtarget.inc -gen-subtarget)
add_llvm_target(ARMCodeGen
+ ARMBaseInstrInfo.cpp
+ ARMBaseRegisterInfo.cpp
ARMCodeEmitter.cpp
ARMConstantIslandPass.cpp
ARMConstantPoolValue.cpp
@@ -20,14 +22,17 @@ add_llvm_target(ARMCodeGen
ARMISelLowering.cpp
ARMJITInfo.cpp
ARMLoadStoreOptimizer.cpp
+ ARMMCAsmInfo.cpp
ARMRegisterInfo.cpp
ARMSubtarget.cpp
- ARMTargetAsmInfo.cpp
ARMTargetMachine.cpp
+ NEONPreAllocPass.cpp
Thumb1InstrInfo.cpp
Thumb1RegisterInfo.cpp
+ Thumb2ITBlockPass.cpp
Thumb2InstrInfo.cpp
Thumb2RegisterInfo.cpp
+ Thumb2SizeReduction.cpp
)
target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
index 9a3b9be..a8dd38c 100644
--- a/lib/Target/ARM/Makefile
+++ b/lib/Target/ARM/Makefile
@@ -18,6 +18,6 @@ BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
ARMGenDAGISel.inc ARMGenSubtarget.inc \
ARMGenCodeEmitter.inc ARMGenCallingConv.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter AsmParser TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
new file mode 100644
index 0000000..821b872
--- /dev/null
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -0,0 +1,394 @@
+//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "neon-prealloc"
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+namespace {
+ class VISIBILITY_HIDDEN NEONPreAllocPass : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+
+ public:
+ static char ID;
+ NEONPreAllocPass() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "NEON register pre-allocation pass";
+ }
+
+ private:
+ bool PreAllocNEONRegisters(MachineBasicBlock &MBB);
+ };
+
+ char NEONPreAllocPass::ID = 0;
+}
+
+static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
+ unsigned &Offset, unsigned &Stride) {
+ // Default to unit stride with no offset.
+ Stride = 1;
+ Offset = 0;
+
+ switch (Opcode) {
+ default:
+ break;
+
+ case ARM::VLD2d8:
+ case ARM::VLD2d16:
+ case ARM::VLD2d32:
+ case ARM::VLD2d64:
+ case ARM::VLD2LNd8:
+ case ARM::VLD2LNd16:
+ case ARM::VLD2LNd32:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VLD2LNq16a:
+ case ARM::VLD2LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD2LNq16b:
+ case ARM::VLD2LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 2;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD3d64:
+ case ARM::VLD3LNd8:
+ case ARM::VLD3LNd16:
+ case ARM::VLD3LNd32:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VLD3q8a:
+ case ARM::VLD3q16a:
+ case ARM::VLD3q32a:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3q8b:
+ case ARM::VLD3q16b:
+ case ARM::VLD3q32b:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3LNq16a:
+ case ARM::VLD3LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD3LNq16b:
+ case ARM::VLD3LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4d64:
+ case ARM::VLD4LNd8:
+ case ARM::VLD4LNd16:
+ case ARM::VLD4LNd32:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VLD4q8a:
+ case ARM::VLD4q16a:
+ case ARM::VLD4q32a:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4q8b:
+ case ARM::VLD4q16b:
+ case ARM::VLD4q32b:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4LNq16a:
+ case ARM::VLD4LNq32a:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VLD4LNq16b:
+ case ARM::VLD4LNq32b:
+ FirstOpnd = 0;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST2d8:
+ case ARM::VST2d16:
+ case ARM::VST2d32:
+ case ARM::VST2d64:
+ case ARM::VST2LNd8:
+ case ARM::VST2LNd16:
+ case ARM::VST2LNd32:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VST2q8:
+ case ARM::VST2q16:
+ case ARM::VST2q32:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VST2LNq16a:
+ case ARM::VST2LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST2LNq16b:
+ case ARM::VST2LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 2;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d64:
+ case ARM::VST3LNd8:
+ case ARM::VST3LNd16:
+ case ARM::VST3LNd32:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VST3q8a:
+ case ARM::VST3q16a:
+ case ARM::VST3q32a:
+ FirstOpnd = 4;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3q8b:
+ case ARM::VST3q16b:
+ case ARM::VST3q32b:
+ FirstOpnd = 4;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3LNq16a:
+ case ARM::VST3LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST3LNq16b:
+ case ARM::VST3LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 3;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d64:
+ case ARM::VST4LNd8:
+ case ARM::VST4LNd16:
+ case ARM::VST4LNd32:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VST4q8a:
+ case ARM::VST4q16a:
+ case ARM::VST4q32a:
+ FirstOpnd = 4;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4q8b:
+ case ARM::VST4q16b:
+ case ARM::VST4q32b:
+ FirstOpnd = 4;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4LNq16a:
+ case ARM::VST4LNq32a:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ Offset = 0;
+ Stride = 2;
+ return true;
+
+ case ARM::VST4LNq16b:
+ case ARM::VST4LNq32b:
+ FirstOpnd = 3;
+ NumRegs = 4;
+ Offset = 1;
+ Stride = 2;
+ return true;
+
+ case ARM::VTBL2:
+ FirstOpnd = 1;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VTBL3:
+ FirstOpnd = 1;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VTBL4:
+ FirstOpnd = 1;
+ NumRegs = 4;
+ return true;
+
+ case ARM::VTBX2:
+ FirstOpnd = 2;
+ NumRegs = 2;
+ return true;
+
+ case ARM::VTBX3:
+ FirstOpnd = 2;
+ NumRegs = 3;
+ return true;
+
+ case ARM::VTBX4:
+ FirstOpnd = 2;
+ NumRegs = 4;
+ return true;
+ }
+
+ return false;
+}
+
+bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ for (; MBBI != E; ++MBBI) {
+ MachineInstr *MI = &*MBBI;
+ unsigned FirstOpnd, NumRegs, Offset, Stride;
+ if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
+ continue;
+
+ MachineBasicBlock::iterator NextI = next(MBBI);
+ for (unsigned R = 0; R < NumRegs; ++R) {
+ MachineOperand &MO = MI->getOperand(FirstOpnd + R);
+ assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
+ unsigned VirtReg = MO.getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "expected a virtual register");
+
+ // For now, just assign a fixed set of adjacent registers.
+ // This leaves plenty of room for future improvements.
+ static const unsigned NEONDRegs[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7
+ };
+ MO.setReg(NEONDRegs[Offset + R * Stride]);
+
+ if (MO.isUse()) {
+ // Insert a copy from VirtReg.
+ TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg,
+ ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+ if (MO.isKill()) {
+ MachineInstr *CopyMI = prior(MBBI);
+ CopyMI->findRegisterUseOperand(VirtReg)->setIsKill();
+ }
+ MO.setIsKill();
+ } else if (MO.isDef() && !MO.isDead()) {
+ // Add a copy to VirtReg.
+ TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(),
+ ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+ }
+ }
+ }
+
+ return Modified;
+}
+
+bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ Modified |= PreAllocNEONRegisters(MBB);
+ }
+
+ return Modified;
+}
+
+/// createNEONPreAllocPass - returns an instance of the NEON register
+/// pre-allocation pass.
+FunctionPass *llvm::createNEONPreAllocPass() {
+ return new NEONPreAllocPass();
+}
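
A quick sketch of the assignment rule implemented above: member R of a
multi-register group lands on NEONDRegs[Offset + R * Stride], so the 'a'/'b'
variants of the q-register forms interleave over even and odd D registers:

    // VLD3q8a: NumRegs = 3, Offset = 0, Stride = 2  ->  D0, D2, D4
    // VLD3q8b: NumRegs = 3, Offset = 1, Stride = 2  ->  D1, D3, D5
    // VST2q8:  NumRegs = 4, Offset = 0, Stride = 1  ->  D0, D1, D2, D3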
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
index 4d3200b..a961a57 100644
--- a/lib/Target/ARM/README-Thumb.txt
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -226,3 +226,31 @@ etc. Almost all Thumb instructions clobber condition code.
//===---------------------------------------------------------------------===//
Add ldmia, stmia support.
+
+//===---------------------------------------------------------------------===//
+
+Thumb load / store address mode offsets are scaled. The values kept in the
+instruction operands are pre-scale values. This probably ought to be changed
+to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
+
+//===---------------------------------------------------------------------===//
+
+We need to make (some of the) Thumb1 instructions predicable. That will allow
+shrinking of predicated Thumb2 instructions. To allow this, we need to be able
+to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
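+
+For example (illustrative unified-syntax assembly): the same 16-bit add
+encoding sets CPSR outside an IT block but not inside one, so the printed
+mnemonic has to change:
+
+    adds  r0, r0, r1    @ outside an IT block: sets CPSR
+    it    eq
+    addeq r0, r0, r1    @ inside: same encoding, CPSR untouched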
+
+//===---------------------------------------------------------------------===//
+
+Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
+
+//===---------------------------------------------------------------------===//
+
+Thumb1 immediate fields sometimes keep pre-scaled values. See
+Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
+Thumb2.
+
+//===---------------------------------------------------------------------===//
+
+Rather than having tBR_JTr print a ".align 2" and having the constant island
+pass pad it, add a target-specific ALIGN instruction instead. That way,
+GetInstSizeInBytes won't have to over-estimate. It can also be used by the
+loop alignment pass.
diff --git a/lib/Target/ARM/README-Thumb2.txt b/lib/Target/ARM/README-Thumb2.txt
new file mode 100644
index 0000000..e7c2552
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb2.txt
@@ -0,0 +1,6 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb2 specific).
+//===---------------------------------------------------------------------===//
+
+Make sure jumptable destinations are below the jumptable in order to make use
+of tbb / tbh.
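+(tbb / tbh encode each jump-table entry as an unsigned byte / halfword offset
+added forward from the table, e.g. "tbb [pc, r0]", so targets placed before
+the table are unreachable.)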
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index f3377f9..8fb1da3 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -537,3 +537,66 @@ Split out LDR (literal) from normal ARM LDR instruction. Also consider splitting
LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g.
ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg)
while ARMConstantIslandPass only needs to worry about LDR (literal).
+
+//===---------------------------------------------------------------------===//
+
+We need to fix constant isel for ARMv6t2 to use MOVT.
+
+//===---------------------------------------------------------------------===//
+
+Constant island pass should make use of full range SoImm values for LEApcrel.
+Be careful though as the last attempt caused infinite looping on lencod.
+
+//===---------------------------------------------------------------------===//
+
+Predication issue. This function:
+
+extern unsigned array[ 128 ];
+int foo( int x ) {
+ int y;
+ y = array[ x & 127 ];
+ if ( x & 128 )
+ y = 123456789 & ( y >> 2 );
+ else
+ y = 123456789 & y;
+ return y;
+}
+
+compiles to:
+
+_foo:
+ and r1, r0, #127
+ ldr r2, LCPI1_0
+ ldr r2, [r2]
+ ldr r1, [r2, +r1, lsl #2]
+ mov r2, r1, lsr #2
+ tst r0, #128
+ moveq r2, r1
+ ldr r0, LCPI1_1
+ and r0, r2, r0
+ bx lr
+
+It would be better to do something like this, to fold the shift into the
+conditional move:
+
+ and r1, r0, #127
+ ldr r2, LCPI1_0
+ ldr r2, [r2]
+ ldr r1, [r2, +r1, lsl #2]
+ tst r0, #128
+ movne r1, r1, lsr #2
+ ldr r0, LCPI1_1
+ and r0, r1, r0
+ bx lr
+
+It saves an instruction and a register.
+
+//===---------------------------------------------------------------------===//
+
+add/sub/and/or + i32 imm can be simplified by folding part of the immediate
+into the operation.
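+
+For instance (illustrative; each chunk below is an encodable rotated 8-bit
+immediate), rather than loading the full constant into a register first:
+
+    add r0, r1, #0x12000000
+    add r0, r0, #0x00340000
+    add r0, r0, #0x00005600
+    add r0, r0, #0x00000078   @ r0 = r1 + 0x12345678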
+
+//===---------------------------------------------------------------------===//
+
+It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
+with the same bottom half.
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 0000000..163a0a9
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheARMTarget, llvm::TheThumbTarget;
+
+extern "C" void LLVMInitializeARMTargetInfo() {
+ RegisterTarget<Triple::arm, /*HasJIT=*/true>
+ X(TheARMTarget, "arm", "ARM");
+
+ RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+ Y(TheThumbTarget, "thumb", "Thumb");
+}
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..3910bb0
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMInfo
+ ARMTargetInfo.cpp
+ )
+
+add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile
new file mode 100644
index 0000000..6292ab1
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index e13a811..7eed30e 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -22,63 +22,29 @@
using namespace llvm;
-Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) {
}
-bool Thumb1InstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
- default:
- return false;
- case ARM::tMOVr:
- case ARM::tMOVhir2lor:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2hir:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid Thumb MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned Thumb1InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tRestore:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
return 0;
}
-unsigned Thumb1InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tSpill:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
+bool
+Thumb1InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case ARM::tBX_RET:
+ case ARM::tBX_RET_vararg:
+ case ARM::tPOP_RET:
+ case ARM::tB:
+ case ARM::tBR_JTr:
+ return true;
+ default:
break;
}
- return 0;
+
+ return false;
}
bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -91,15 +57,15 @@ bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
if (DestRC == ARM::GPRRegisterClass) {
if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
return true;
} else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
return true;
}
} else if (DestRC == ARM::tGPRRegisterClass) {
if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
return true;
} else if (SrcRC == ARM::tGPRRegisterClass) {
BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
@@ -120,17 +86,19 @@ canFoldMemoryOperand(const MachineInstr *MI,
switch (Opc) {
default: break;
case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVgpr2gpr: {
if (OpNum == 0) { // move -> store
unsigned SrcReg = MI->getOperand(1).getReg();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ !isARMLowRegister(SrcReg))
// tSpill cannot take a high register operand.
return false;
} else { // move -> load
unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ !isARMLowRegister(DstReg))
// tRestore cannot target a high register operand.
return false;
}
@@ -148,36 +116,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
+ assert((RC == ARM::tGPRRegisterClass ||
+ (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ isARMLowRegister(SrcReg))) && "Unknown regclass!");
if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
}
}
-void Thumb1InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- assert(RC == ARM::GPRRegisterClass && "Unknown regclass!");
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR;
- }
-
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
void Thumb1InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
@@ -185,33 +134,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
+ assert((RC == ARM::tGPRRegisterClass ||
+ (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+ isARMLowRegister(DestReg))) && "Unknown regclass!");
if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
- .addFrameIndex(FI).addImm(0);
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
+ .addFrameIndex(FI).addImm(0));
}
}
-void Thumb1InstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR;
- }
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
bool Thumb1InstrInfo::
spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -223,6 +155,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
if (MI != MBB.end()) DL = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
+ AddDefaultPred(MIB);
+ MIB.addReg(0); // No write back.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
// Add the callee-saved register as live-in. It's killed at the spill.
@@ -242,7 +176,12 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
- MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc());
+ DebugLoc DL = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP));
+ AddDefaultPred(MIB);
+ MIB.addReg(0); // No write back.
+
+ unsigned NumRegs = 0;
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (Reg == ARM::LR) {
@@ -250,15 +189,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (isVarArg)
continue;
Reg = ARM::PC;
- PopMI->setDesc(get(ARM::tPOP_RET));
+ (*MIB).setDesc(get(ARM::tPOP_RET));
MI = MBB.erase(MI);
}
- PopMI->addOperand(MachineOperand::CreateReg(Reg, true));
+ MIB.addReg(Reg, getDefRegState(true));
+ ++NumRegs;
}
// It's illegal to emit pop instruction without operands.
- if (PopMI->getNumOperands() > 0)
- MBB.insert(MI, PopMI);
+ if (NumRegs)
+ MBB.insert(MI, &*MIB);
return true;
}
@@ -274,27 +214,30 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
switch (Opc) {
default: break;
case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
+ case ARM::tMOVtgpr2gpr:
+ case ARM::tMOVgpr2tgpr:
+ case ARM::tMOVgpr2gpr: {
if (OpNum == 0) { // move -> store
unsigned SrcReg = MI->getOperand(1).getReg();
bool isKill = MI->getOperand(1).isKill();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ !isARMLowRegister(SrcReg))
// tSpill cannot take a high register operand.
break;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
+ NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
} else { // move -> load
unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ !isARMLowRegister(DstReg))
// tRestore cannot target a high register operand.
break;
bool isDead = MI->getOperand(0).isDead();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
- .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
- .addFrameIndex(FI).addImm(0);
+ NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(isDead))
+ .addFrameIndex(FI).addImm(0));
}
break;
}
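
The AddDefaultPred / AddDefaultT1CC wrappers used throughout this patch come
from ARMBaseInstrInfo.h; roughly (a sketch, not the exact headers):

    // Append the "always" predicate: condition code AL plus a null
    // predicate register.
    static inline const MachineInstrBuilder &
    AddDefaultPred(const MachineInstrBuilder &MIB) {
      return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
    }

    // Append the implicit CPSR def carried by flag-setting Thumb1
    // instructions.
    static inline const MachineInstrBuilder &
    AddDefaultT1CC(const MachineInstrBuilder &MIB) {
      return MIB.addReg(ARM::CPSR, getDefRegState(true));
    }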
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index 1bfa1d0..13cc578 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -27,6 +27,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
+
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
@@ -40,14 +47,6 @@ public:
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const;
- bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
-
bool copyRegToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SrcReg,
@@ -58,21 +57,11 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const;
@@ -80,7 +69,7 @@ public:
MachineInstr* MI,
const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const;
-
+
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
const SmallVectorImpl<unsigned> &Ops,
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 92f01d1..3c896da 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -13,12 +13,15 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
#include "Thumb1RegisterInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -30,14 +33,11 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static cl::opt<bool>
-ThumbRegScavenging("enable-thumb-reg-scavenging",
- cl::Hidden,
- cl::desc("Enable register scavenging on Thumb"));
-
-Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii,
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
@@ -46,20 +46,24 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const TargetInstrInfo &tii,
/// specified immediate.
void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const {
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
MachineFunction &MF = *MBB.getParent();
MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+ Constant *C = ConstantInt::get(
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg)
- .addConstantPoolIndex(Idx);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg);
}
const TargetRegisterClass*
-Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
+Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const {
if (isARMLowRegister(Reg))
return ARM::tGPRRegisterClass;
switch (Reg) {
@@ -75,9 +79,16 @@ Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
bool
Thumb1RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return ThumbRegScavenging;
+ return true;
+}
+
+bool
+Thumb1RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF)
+ const {
+ return true;
}
+
bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
const MachineFrameInfo *FFI = MF.getFrameInfo();
unsigned CFSize = FFI->getMaxCallFrameSize();
@@ -91,6 +102,7 @@ bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
return !MF.getFrameInfo()->hasVarSizedObjects();
}
+
/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
/// in a register using mov / mvn sequences or load the immediate from a
@@ -103,6 +115,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
const TargetInstrInfo &TII,
const Thumb1RegisterInfo& MRI,
DebugLoc dl) {
+ MachineFunction &MF = *MBB.getParent();
bool isHigh = !isARMLowRegister(DestReg) ||
(BaseReg != 0 && !isARMLowRegister(BaseReg));
bool isSub = false;
@@ -117,31 +130,31 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
unsigned LdReg = DestReg;
if (DestReg == ARM::SP) {
assert(BaseReg == ARM::SP && "Unexpected!");
- LdReg = ARM::R3;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
+ LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
}
if (NumBytes <= 255 && NumBytes >= 0)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes);
else if (NumBytes < 0 && NumBytes >= -255) {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg)
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
.addReg(LdReg, RegState::Kill);
} else
- MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl);
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes);
// Emit add / sub.
int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
- const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl,
- TII.get(Opc), DestReg);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr)
+ MIB = AddDefaultT1CC(MIB);
if (DestReg == ARM::SP || isSub)
MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
else
MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- if (DestReg == ARM::SP)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
+ AddDefaultPred(MIB);
}
/// calcNumMI - Returns the number of instructions required to materialize
@@ -187,6 +200,8 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned Scale = 1;
int Opc = 0;
int ExtraOpc = 0;
+ bool NeedCC = false;
+ bool NeedPred = false;
if (DestReg == BaseReg && BaseReg == ARM::SP) {
assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
@@ -213,7 +228,16 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
if (DestReg != BaseReg)
DstNotEqBase = true;
NumBits = 8;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ if (DestReg == ARM::SP) {
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+ NumBits = 7;
+ Scale = 4;
+ } else {
+ Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+ NumBits = 8;
+ NeedPred = NeedCC = true;
+ }
isTwoAddr = true;
}
@@ -233,8 +257,10 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
unsigned Chunk = (1 << 3) - 1;
unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
Bytes -= ThisVal;
- BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal);
+ const TargetInstrDesc &TID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
+ const MachineInstrBuilder MIB =
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg));
+ AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
} else {
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
.addReg(BaseReg, RegState::Kill);
@@ -248,13 +274,22 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Bytes -= ThisVal;
ThisVal /= Scale;
// Build the new tADD / tSUB.
- if (isTwoAddr)
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(DestReg).addImm(ThisVal);
+ if (isTwoAddr) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (NeedCC)
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(DestReg).addImm(ThisVal);
+ if (NeedPred)
+ MIB = AddDefaultPred(MIB);
+ }
else {
bool isKill = BaseReg != ARM::SP;
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (NeedCC)
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+ if (NeedPred)
+ MIB = AddDefaultPred(MIB);
BaseReg = DestReg;
if (Opc == ARM::tADDrSPi) {
@@ -265,15 +300,17 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Scale = 1;
Chunk = ((1 << NumBits) - 1) * Scale;
Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
+ NeedPred = NeedCC = isTwoAddr = true;
}
}
}
- if (ExtraOpc)
- BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg)
- .addReg(DestReg, RegState::Kill)
- .addImm(((unsigned)NumBytes) & 3);
+ if (ExtraOpc) {
+ const TargetInstrDesc &TID = TII.get(ExtraOpc);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+ .addReg(DestReg, RegState::Kill)
+ .addImm(((unsigned)NumBytes) & 3));
+ }
}
static void emitSPUpdate(MachineBasicBlock &MBB,
@@ -329,16 +366,64 @@ static void emitThumbConstant(MachineBasicBlock &MBB,
int Chunk = (1 << 8) - 1;
int ThisVal = (Imm > Chunk) ? Chunk : Imm;
Imm -= ThisVal;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
+ DestReg))
+ .addImm(ThisVal));
if (Imm > 0)
emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
- if (isSub)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg)
- .addReg(DestReg, RegState::Kill);
+ if (isSub) {
+ const TargetInstrDesc &TID = TII.get(ARM::tRSB);
+ AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+ .addReg(DestReg, RegState::Kill));
+ }
}
-void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
+static void removeOperands(MachineInstr &MI, unsigned i) {
+ unsigned Op = i;
+ for (unsigned e = MI.getNumOperands(); i != e; ++i)
+ MI.RemoveOperand(Op);
+}
+
+int Thumb1RegisterInfo::
+rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int Offset,
+ unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const
+{
+ // If/when eliminateFrameIndex() conforms with the ARMBaseRegisterInfo
+ // version, the Thumb1-specific parts can be pulled out here.
+ return 0;
+}
+
+/// saveScavengerRegister - Save the register so it can be used by the
+/// register scavenger. Return true.
+bool Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ // Thumb1 can't use the emergency spill slot on the stack because
+ // ldr/str immediate offsets must be positive, and if we're referencing
+ // off the frame pointer (if, for example, there are alloca() calls in
+ // the function), the offset will be negative. Use R12 instead since that's
+ // a call-clobbered register that we know won't be used in Thumb1 mode.
+
+ TII.copyRegToReg(MBB, I, ARM::R12, Reg, ARM::GPRRegisterClass, RC);
+ return true;
+}
+
+/// restoreScavengerRegister - restore a register saved by
+/// saveScavengerRegister().
+void Thumb1RegisterInfo::restoreScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ TII.copyRegToReg(MBB, I, Reg, ARM::R12, RC, ARM::GPRRegisterClass);
+}
+
+unsigned
+Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const{
+ unsigned VReg = 0;
unsigned i = 0;
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
@@ -380,7 +465,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Scale = 1;
if (FrameReg != ARM::SP) {
Opcode = ARM::tADDi3;
- MI.setDesc(TII.get(ARM::tADDi3));
+ MI.setDesc(TII.get(Opcode));
NumBits = 3;
} else {
NumBits = 8;
@@ -391,19 +476,26 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (Offset == 0) {
// Turn it into a move.
- MI.setDesc(TII.get(ARM::tMOVhir2lor));
+ MI.setDesc(TII.get(ARM::tMOVgpr2tgpr));
MI.getOperand(i).ChangeToRegister(FrameReg, false);
MI.RemoveOperand(i+1);
- return;
+ return 0;
}
// Common case: small offset, fits into instruction.
unsigned Mask = (1 << NumBits) - 1;
if (((Offset / Scale) & ~Mask) == 0) {
// Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
- return;
+ if (Opcode == ARM::tADDi3) {
+ removeOperands(MI, i);
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
+ .addImm(Offset / Scale));
+ } else {
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
+ }
+ return 0;
}
unsigned DestReg = MI.getOperand(0).getReg();
@@ -415,15 +507,21 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
*this, dl);
MBB.erase(II);
- return;
+ return 0;
}
if (Offset > 0) {
// Translate r0 = add sp, imm to
// r0 = add sp, 255*4
// r0 = add r0, (imm - 255*4)
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Mask);
+ if (Opcode == ARM::tADDi3) {
+ removeOperands(MI, i);
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
+ } else {
+ MI.getOperand(i).ChangeToRegister(FrameReg, false);
+ MI.getOperand(i+1).ChangeToImmediate(Mask);
+ }
Offset = (Offset - Mask * Scale);
MachineBasicBlock::iterator NII = next(II);
emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
@@ -433,11 +531,16 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// r0 = -imm (this is then translated into a series of instructons)
// r0 = add r0, sp
emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+
MI.setDesc(TII.get(ARM::tADDhirr));
MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
+ if (Opcode == ARM::tADDi3) {
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(MIB);
+ }
}
- return;
+ return 0;
} else {
unsigned ImmIdx = 0;
int InstrOffs = 0;
@@ -452,8 +555,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
break;
}
default:
- assert(0 && "Unsupported addressing mode!");
- abort();
+ llvm_unreachable("Unsupported addressing mode!");
break;
}
@@ -468,7 +570,7 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Replace the FrameIndex with sp
MI.getOperand(i).ChangeToRegister(FrameReg, false);
ImmOp.ChangeToImmediate(ImmedOffset);
- return;
+ return 0;
}
bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
@@ -495,6 +597,11 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// SP+LargeImm.
assert(Offset && "This code isn't needed if offset already handled!");
+ // Remove predicate first.
+ int PIdx = MI.findFirstPredOperandIdx();
+ if (PIdx != -1)
+ removeOperands(MI, PIdx);
+
if (Desc.mayLoad()) {
// Use the destination register to materialize sp + offset.
unsigned TmpReg = MI.getOperand(0).getReg();
@@ -504,12 +611,14 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
Offset, false, TII, *this, dl);
else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
+ emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
UseRR = true;
}
- } else
+ } else {
emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
*this, dl);
+ }
+
MI.setDesc(TII.get(ARM::tLDR));
MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
if (UseRR)
@@ -518,52 +627,37 @@ void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
else // tLDR has an extra register operand.
MI.addOperand(MachineOperand::CreateReg(0, false));
} else if (Desc.mayStore()) {
- // FIXME! This is horrific!!! We need register scavenging.
- // Our temporary workaround has marked r3 unavailable. Of course, r3 is
- // also a ABI register so it's possible that is is the register that is
- // being storing here. If that's the case, we do the following:
- // r12 = r2
- // Use r2 to materialize sp + offset
- // str r3, r2
- // r2 = r12
- unsigned ValReg = MI.getOperand(0).getReg();
- unsigned TmpReg = ARM::R3;
- bool UseRR = false;
- if (ValReg == ARM::R3) {
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R2, RegState::Kill);
- TmpReg = ARM::R2;
- }
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- if (Opcode == ARM::tSpill) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tSTR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR) // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tSTR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
-
- MachineBasicBlock::iterator NII = next(II);
- if (ValReg == ARM::R3)
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2)
- .addReg(ARM::R12, RegState::Kill);
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
+ VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+ assert(Value && "Frame index virtual allocated, but Value arg is NULL!");
+ *Value = Offset;
+ bool UseRR = false;
+
+ if (Opcode == ARM::tSpill) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg,
+ Offset, false, TII, *this, dl);
+ else {
+ emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
+ *this, dl);
+ MI.setDesc(TII.get(ARM::tSTR));
+ MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
+ if (UseRR) // Use [reg, reg] addrmode.
+ MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+ else // tSTR has an extra register operand.
+ MI.addOperand(MachineOperand::CreateReg(0, false));
} else
assert(false && "Unexpected opcode!");
+
+ // Add predicate back if it's needed.
+ if (MI.getDesc().isPredicable()) {
+ MachineInstrBuilder MIB(&MI);
+ AddDefaultPred(MIB);
+ }
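+ // (A sketch of why this works: the predicate operands were stripped above
+ // via removeOperands so the opcode could be rewritten; AddDefaultPred
+ // re-appends a default {ARMCC::AL, reg0} predicate pair here.)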
+ return VReg;
}
void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
@@ -577,15 +671,6 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
DebugLoc dl = (MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
- // Check if R3 is live in. It might have to be used as a scratch register.
- for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(),
- E = MF.getRegInfo().livein_end(); I != E; ++I) {
- if (I->first == ARM::R3) {
- AFI->setR3IsLiveIn(true);
- break;
- }
- }
-
// Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
NumBytes = (NumBytes + 3) & ~3;
MFI->setStackSize(NumBytes);
@@ -647,8 +732,7 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
// Darwin ABI requires FP to point to the stack slot that contains the
// previous FP.
if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
.addFrameIndex(FramePtrSpillFI).addImm(0);
}
@@ -729,7 +813,7 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
TII, *this, dl);
else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP)
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
.addReg(FramePtr);
} else {
if (MBBI->getOpcode() == ARM::tBX_RET &&
@@ -745,11 +829,14 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
if (VARegSaveSize) {
// Epilogue for vararg functions: pop LR to R3 and branch off it.
// FIXME: Verify this is still ok when R3 is no longer being reserved.
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ .addReg(0) // No write back.
+ .addReg(ARM::R3, RegState::Define);
emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+ .addReg(ARM::R3, RegState::Kill);
MBB.erase(MBBI);
}
}
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 6d4f1f0..bb7a619 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -20,28 +20,28 @@
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb1RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const;
+ void emitLoadConstPool(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
/// Code Generation virtual methods...
const TargetRegisterClass *
- getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const;
bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
bool hasReservedCallFrame(MachineFunction &MF) const;
@@ -49,8 +49,23 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ // Rewrite MI to access 'Offset' bytes from the FP. Return the offset that
+ // could not be handled directly in MI.
+ int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int Offset,
+ unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const;
+
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const;
+ void restoreScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 0000000..98b5cbd
--- /dev/null
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,158 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "thumb2-it"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumITs, "Number of IT blocks inserted");
+
+namespace {
+ struct VISIBILITY_HIDDEN Thumb2ITBlockPass : public MachineFunctionPass {
+ static char ID;
+ Thumb2ITBlockPass() : MachineFunctionPass(&ID) {}
+
+ const Thumb2InstrInfo *TII;
+ ARMFunctionInfo *AFI;
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "Thumb IT blocks insertion pass";
+ }
+
+ private:
+ MachineBasicBlock::iterator
+ SplitT2MOV32imm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineInstr *MI, DebugLoc dl,
+ unsigned PredReg, ARMCC::CondCodes CC);
+ bool InsertITBlocks(MachineBasicBlock &MBB);
+ };
+ char Thumb2ITBlockPass::ID = 0;
+}
+
+static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+ return ARMCC::AL;
+ return llvm::getInstrPredicate(MI, PredReg);
+}
+
+MachineBasicBlock::iterator
+Thumb2ITBlockPass::SplitT2MOV32imm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *MI,
+ DebugLoc dl, unsigned PredReg,
+ ARMCC::CondCodes CC) {
+ // Split t2MOVi32imm into a pair of t2MOVi16 + t2MOVTi16 here.
+ // The only reason it was a single instruction was so it could be
+ // re-materialized. We want to split it before this pass and the Thumb2
+ // size-reduction pass run, to make sure the IT mask is correct and to
+ // expose width-reduction opportunities. It doesn't make sense to do
+ // this in a separate pass, so here it is.
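+ // For example (an illustrative sketch), Imm = 0x12345678 becomes:
+ //   t2MOVi16  dst, #0x5678           ; dst = 0x00005678
+ //   t2MOVTi16 dst, dst, #0x1234      ; dst = 0x12345678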
+ unsigned DstReg = MI->getOperand(0).getReg();
+ bool DstDead = MI->getOperand(0).isDead(); // Is this possible?
+ unsigned Imm = MI->getOperand(1).getImm();
+ unsigned Lo16 = Imm & 0xffff;
+ unsigned Hi16 = (Imm >> 16) & 0xffff;
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVi16), DstReg)
+ .addImm(Lo16).addImm(CC).addReg(PredReg);
+ BuildMI(MBB, MBBI, dl, TII->get(ARM::t2MOVTi16))
+ .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead))
+ .addReg(DstReg).addImm(Hi16).addImm(CC).addReg(PredReg);
+ --MBBI;
+ --MBBI;
+ MI->eraseFromParent();
+ return MBBI;
+}
+
+bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineInstr *MI = &*MBBI;
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = getPredicate(MI, PredReg);
+
+ if (MI->getOpcode() == ARM::t2MOVi32imm) {
+ MBBI = SplitT2MOV32imm(MBB, MBBI, MI, dl, PredReg, CC);
+ continue;
+ }
+
+ if (CC == ARMCC::AL) {
+ ++MBBI;
+ continue;
+ }
+
+ // Insert an IT instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+ .addImm(CC);
+ ++MBBI;
+
+ // Finalize IT mask.
+ ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+ unsigned Mask = 0, Pos = 3;
+ while (MBBI != E && Pos) {
+ MachineInstr *NMI = &*MBBI;
+ DebugLoc ndl = NMI->getDebugLoc();
+ unsigned NPredReg = 0;
+ ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg);
+ if (NMI->getOpcode() == ARM::t2MOVi32imm) {
+ MBBI = SplitT2MOV32imm(MBB, MBBI, NMI, ndl, NPredReg, NCC);
+ continue;
+ }
+
+ if (NCC == OCC) {
+ Mask |= (1 << Pos);
+ } else if (NCC != CC)
+ break;
+ --Pos;
+ ++MBBI;
+ }
+ Mask |= (1 << Pos);
+ MIB.addImm(Mask);
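+ // Mask sketch, using this pass's convention: a lone predicated
+ // instruction leaves Pos == 3 and gives Mask = 0b1000 (plain IT); an EQ
+ // instruction followed by one NE instruction sets bit 3 and terminates
+ // at Pos == 2, giving Mask = 0b1100 (ITE).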
+ Modified = true;
+ ++NumITs;
+ }
+
+ return Modified;
+}
+
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetMachine &TM = Fn.getTarget();
+ AFI = Fn.getInfo<ARMFunctionInfo>();
+ TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+ if (!AFI->isThumbFunction())
+ return false;
+
+ bool Modified = false;
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ Modified |= InsertITBlocks(MBB);
+ }
+
+ return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+ return new Thumb2ITBlockPass();
+}
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 35d09fd..264601b 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -13,6 +13,7 @@
#include "ARMInstrInfo.h"
#include "ARM.h"
+#include "ARMAddressingModes.h"
#include "ARMGenInstrInfo.inc"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -22,127 +23,62 @@
using namespace llvm;
-Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : RI(*this, STI) {
}
-bool Thumb2InstrInfo::isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
- SrcSubIdx = DstSubIdx = 0; // No sub-registers.
-
- unsigned oc = MI.getOpcode();
- switch (oc) {
- default:
- return false;
- // FIXME: Thumb2
- case ARM::tMOVr:
- case ARM::tMOVhir2lor:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2hir:
- assert(MI.getDesc().getNumOperands() >= 2 &&
- MI.getOperand(0).isReg() &&
- MI.getOperand(1).isReg() &&
- "Invalid Thumb MOV instruction");
- SrcReg = MI.getOperand(1).getReg();
- DstReg = MI.getOperand(0).getReg();
- return true;
- }
-}
-
-unsigned Thumb2InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- // FIXME: Thumb2
- case ARM::tRestore:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+ // FIXME
return 0;
}
-unsigned Thumb2InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- // FIXME: Thumb2
- case ARM::tSpill:
- if (MI->getOperand(1).isFI() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(2).getImm() == 0) {
- FrameIndex = MI->getOperand(1).getIndex();
- return MI->getOperand(0).getReg();
- }
+bool
+Thumb2InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case ARM::t2LDM_RET:
+ case ARM::t2B: // Uncond branch.
+ case ARM::t2BR_JT: // Jumptable branch.
+ case ARM::t2TBB: // Table branch byte.
+ case ARM::t2TBH: // Table branch halfword.
+ case ARM::tBR_JTr: // Jumptable branch (16-bit version).
+ case ARM::tBX_RET:
+ case ARM::tBX_RET_vararg:
+ case ARM::tPOP_RET:
+ case ARM::tB:
+ return true;
+ default:
break;
}
- return 0;
+
+ return false;
}
-bool Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const {
+bool
+Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- // FIXME: Thumb2
- if (DestRC == ARM::GPRRegisterClass) {
- if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2hir), DestReg).addReg(SrcReg);
- return true;
- } else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVlor2hir), DestReg).addReg(SrcReg);
- return true;
- }
- } else if (DestRC == ARM::tGPRRegisterClass) {
- if (SrcRC == ARM::GPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVhir2lor), DestReg).addReg(SrcReg);
- return true;
- } else if (SrcRC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
- return true;
- }
- }
-
- return false;
-}
-
-bool Thumb2InstrInfo::
-canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const {
- if (Ops.size() != 1) return false;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default: break;
- case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
- // tSpill cannot take a high register operand.
- return false;
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
- // tRestore cannot target a high register operand.
- return false;
- }
+ if (DestRC == ARM::GPRRegisterClass &&
+ SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
+ return true;
+ } else if (DestRC == ARM::GPRRegisterClass &&
+ SrcRC == ARM::tGPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
+ return true;
+ } else if (DestRC == ARM::tGPRRegisterClass &&
+ SrcRC == ARM::GPRRegisterClass) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
return true;
- }
}
- return false;
+ // Handle SPR, DPR, and QPR copies.
+ return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC);
}
void Thumb2InstrInfo::
@@ -152,36 +88,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
-
- // FIXME: Thumb2
- if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- }
-}
-
-void Thumb2InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
-
- // FIXME: Thumb2. Is GPRRegClass here correct?
- assert(RC == ARM::GPRRegisterClass && "Unknown regclass!");
if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tSpill : ARM::tSTR;
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0));
+ return;
}
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
+ ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC);
}
void Thumb2InstrInfo::
@@ -191,122 +105,381 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- // FIXME: Thumb2
- assert(RC == ARM::tGPRRegisterClass && "Unknown regclass!");
-
- if (RC == ARM::tGPRRegisterClass) {
- BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
- .addFrameIndex(FI).addImm(0);
+ if (RC == ARM::GPRRegisterClass) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+ .addFrameIndex(FI).addImm(0));
+ return;
}
+
+ ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC);
}
-void Thumb2InstrInfo::
-loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- // FIXME: Thumb2. Is GPRRegClass ok here?
- if (RC == ARM::GPRRegisterClass) {
- Opc = Addr[0].isFI() ? ARM::tRestore : ARM::tLDR;
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, unsigned PredReg,
+ const ARMBaseInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ if (isSub) NumBytes = -NumBytes;
+
+ // If profitable, use a movw or movt to materialize the offset.
+ // FIXME: Use the scavenger to grab a scratch register.
+ if (DestReg != ARM::SP && DestReg != BaseReg &&
+ NumBytes >= 4096 &&
+ ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+ bool Fits = false;
+ if (NumBytes < 65536) {
+ // Use a movw to materialize the 16-bit constant.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+ .addImm(NumBytes)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ Fits = true;
+ } else if ((NumBytes & 0xffff) == 0) {
+ // Use a movt to materialize the 32-bit constant.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+ .addReg(DestReg)
+ .addImm(NumBytes >> 16)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ Fits = true;
+ }
+
+ if (Fits) {
+ if (isSub) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addReg(DestReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ } else {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+ .addReg(DestReg, RegState::Kill)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ }
+ return;
+ }
}
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
+ while (NumBytes) {
+ unsigned ThisVal = NumBytes;
+ unsigned Opc = 0;
+ if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+ // mov sp, rn. Note t2MOVr cannot be used.
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg);
+ BaseReg = ARM::SP;
+ continue;
+ }
-bool Thumb2InstrInfo::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const {
- if (CSI.empty())
- return false;
+ if (BaseReg == ARM::SP) {
+ // sub sp, sp, #imm7
+ if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+ assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ // FIXME: Fix Thumb1 immediate encoding.
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg).addImm(ThisVal/4);
+ NumBytes = 0;
+ continue;
+ }
+
+ // sub rd, sp, so_imm
+ Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
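+ // Sketch: ThisVal = 0x12345 gives RotAmt = 15, so the mask
+ // rotr32(0xff000000, 15) = 0x1fe00 keeps the top eight significant
+ // bits; 0x12200 is emitted now and NumBytes = 0x145 remains.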
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ } else {
+ assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+ Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+ if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+ NumBytes = 0;
+ } else if (ThisVal < 4096) {
+ Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+ NumBytes = 0;
+ } else {
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
+ }
+ }
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (MI != MBB.end()) DL = MI->getDebugLoc();
-
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(Reg);
- MIB.addReg(Reg, RegState::Kill);
+ // Build the new ADD / SUB.
+ AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)));
+
+ BaseReg = DestReg;
}
- return true;
}
-bool Thumb2InstrInfo::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const {
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- if (CSI.empty())
- return false;
-
- bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
- MachineInstr *PopMI = MF.CreateMachineInstr(get(ARM::tPOP),MI->getDebugLoc());
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- if (Reg == ARM::LR) {
- // Special epilogue for vararg functions. See emitEpilogue
- if (isVarArg)
- continue;
- Reg = ARM::PC;
- PopMI->setDesc(get(ARM::tPOP_RET));
- MI = MBB.erase(MI);
- }
- PopMI->addOperand(MachineOperand::CreateReg(Reg, true));
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRi12: return ARM::t2LDRi8;
+ case ARM::t2LDRHi12: return ARM::t2LDRHi8;
+ case ARM::t2LDRBi12: return ARM::t2LDRBi8;
+ case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+ case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+ case ARM::t2STRi12: return ARM::t2STRi8;
+ case ARM::t2STRBi12: return ARM::t2STRBi8;
+ case ARM::t2STRHi12: return ARM::t2STRHi8;
+
+ case ARM::t2LDRi8:
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSBi8:
+ case ARM::t2STRi8:
+ case ARM::t2STRBi8:
+ case ARM::t2STRHi8:
+ return opcode;
+
+ default:
+ break;
}
- // It's illegal to emit a pop instruction without operands.
- if (PopMI->getNumOperands() > 0)
- MBB.insert(MI, PopMI);
+ return 0;
+}
+
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRi8: return ARM::t2LDRi12;
+ case ARM::t2LDRHi8: return ARM::t2LDRHi12;
+ case ARM::t2LDRBi8: return ARM::t2LDRBi12;
+ case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+ case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+ case ARM::t2STRi8: return ARM::t2STRi12;
+ case ARM::t2STRBi8: return ARM::t2STRBi12;
+ case ARM::t2STRHi8: return ARM::t2STRHi12;
+
+ case ARM::t2LDRi12:
+ case ARM::t2LDRHi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSBi12:
+ case ARM::t2STRi12:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi12:
+ return opcode;
- return true;
+ default:
+ break;
+ }
+
+ return 0;
}
-MachineInstr *Thumb2InstrInfo::
-foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops, int FI) const {
- if (Ops.size() != 1) return NULL;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- MachineInstr *NewMI = NULL;
- switch (Opc) {
- default: break;
- case ARM::tMOVr:
- case ARM::tMOVlor2hir:
- case ARM::tMOVhir2lor:
- case ARM::tMOVhir2hir: {
- if (OpNum == 0) { // move -> store
- unsigned SrcReg = MI->getOperand(1).getReg();
- bool isKill = MI->getOperand(1).isKill();
- if (RI.isPhysicalRegister(SrcReg) && !isARMLowRegister(SrcReg))
- // tSpill cannot take a high register operand.
- break;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- } else { // move -> load
- unsigned DstReg = MI->getOperand(0).getReg();
- if (RI.isPhysicalRegister(DstReg) && !isARMLowRegister(DstReg))
- // tRestore cannot target a high register operand.
- break;
- bool isDead = MI->getOperand(0).isDead();
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
- .addReg(DstReg, RegState::Define | getDeadRegState(isDead))
- .addFrameIndex(FI).addImm(0);
- }
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+ switch (opcode) {
+ case ARM::t2LDRs: return ARM::t2LDRi12;
+ case ARM::t2LDRHs: return ARM::t2LDRHi12;
+ case ARM::t2LDRBs: return ARM::t2LDRBi12;
+ case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+ case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+ case ARM::t2STRs: return ARM::t2STRi12;
+ case ARM::t2STRBs: return ARM::t2STRBi12;
+ case ARM::t2STRHs: return ARM::t2STRHi12;
+
+ case ARM::t2LDRi12:
+ case ARM::t2LDRHi12:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSBi12:
+ case ARM::t2STRi12:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi12:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSBi8:
+ case ARM::t2STRi8:
+ case ARM::t2STRBi8:
+ case ARM::t2STRHi8:
+ return opcode;
+
+ default:
break;
}
+
+ return 0;
+}
+
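+// A sketch of the intended contract: rewriteT2FrameIndex folds as much of
+// 'Offset' into MI as the addressing mode allows and returns true when the
+// whole offset was folded; otherwise 'Offset' is left holding the residue
+// that a caller must materialize separately (e.g. via the scavenger).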
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) {
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = MI.getDesc();
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ bool isSub = false;
+
+ // Memory operands in inline assembly always use AddrModeT2_i12.
+ if (Opcode == ARM::INLINEASM)
+ AddrMode = ARMII::AddrModeT2_i12; // FIXME: correct mode for Thumb2?
+
+ if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+ bool isSP = FrameReg == ARM::SP;
+ if (Offset == 0) {
+ // Turn it into a move.
+ MI.setDesc(TII.get(ARM::tMOVgpr2gpr));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.RemoveOperand(FrameRegIdx+1);
+ Offset = 0;
+ return true;
+ }
+
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri));
+ } else {
+ MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri));
+ }
+
+ // Common case: small offset, fits into instruction.
+ if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+ // Another common case: imm12.
+ if (Offset < 4096) {
+ unsigned NewOpc = isSP
+ ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12)
+ : (isSub ? ARM::t2SUBri12 : ARM::t2ADDri12);
+ MI.setDesc(TII.get(NewOpc));
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, extract 8 adjacent bits from the immediate into this
+ // t2ADDri/t2SUBri.
+ unsigned RotAmt = CountLeadingZeros_32(Offset);
+ unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+ // We will handle these bits of the offset here, so clear them.
+ Offset &= ~ThisImmVal;
+
+ assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+ "Bit extraction didn't work?");
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
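+ // Sketch: Offset = 0x1234 fits neither a T2 SO-imm nor an imm12, so
+ // 0x1220 is encoded into this add/sub and Offset = 0x14 is handed back
+ // to the caller.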
+ } else {
+
+ // AddrMode4 cannot handle any offset.
+ if (AddrMode == ARMII::AddrMode4)
+ return false;
+
+ // AddrModeT2_so cannot handle any offset. If there is no offset
+ // register then we change to an immediate version.
+ unsigned NewOpc = Opcode;
+ if (AddrMode == ARMII::AddrModeT2_so) {
+ unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+ if (OffsetReg != 0) {
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ return Offset == 0;
+ }
+
+ MI.RemoveOperand(FrameRegIdx+1);
+ MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+ NewOpc = immediateOffsetOpcode(Opcode);
+ AddrMode = ARMII::AddrModeT2_i12;
+ }
+
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+ if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+ // The i8 form supports only negative offsets and the i12 form only
+ // positive ones, so convert Opcode to the appropriate instruction
+ // based on the sign of Offset.
+ Offset += MI.getOperand(FrameRegIdx+1).getImm();
+ if (Offset < 0) {
+ NewOpc = negativeOffsetOpcode(Opcode);
+ NumBits = 8;
+ isSub = true;
+ Offset = -Offset;
+ } else {
+ NewOpc = positiveOffsetOpcode(Opcode);
+ NumBits = 12;
+ }
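+ // Sketch: a t2LDRi12 whose folded offset comes to -4 switches to the
+ // 8-bit-negative t2LDRi8 form with magnitude 4 (isSub); an offset of
+ // +4 keeps the 12-bit t2LDRi12 form.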
+ } else {
+ // VFP and NEON address modes.
+ int InstrOffs = 0;
+ if (AddrMode == ARMII::AddrMode5) {
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+ if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ }
+ NumBits = 8;
+ Scale = 4;
+ Offset += InstrOffs * 4;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+ }
+
+ if (NewOpc != Opcode)
+ MI.setDesc(TII.get(NewOpc));
+
+ MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+ // Attempt to fold the address computation.
+ // Common case: small offset, fits into instruction.
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with fp/sp
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else
+ ImmedOffset = -ImmedOffset;
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, the offset doesn't fit. Pull in what we can to simplify it.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else {
+ ImmedOffset = -ImmedOffset;
+ if (ImmedOffset == 0)
+ // Change the opcode back if the encoded offset is zero.
+ MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+ }
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
}
- return NewMI;
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
}
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 84dcb49..f3688c0 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -27,66 +27,34 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb2InstrInfo(const ARMSubtarget &STI);
- /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
- /// such, whenever a client has an instance of instruction info, it should
- /// always be able to get register info as well (through this method).
- ///
- const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
-
- bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI) const;
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const;
- bool isMoveInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
- unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
+ // Return true if the block does not fall through.
+ bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC) const;
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC) const;
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- bool canFoldMemoryOperand(const MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
-
- MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const {
- return 0;
- }
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
};
}
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 0f0c0e4..6c4c15d 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -13,12 +13,15 @@
#include "ARM.h"
#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "Thumb2RegisterInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -30,14 +33,10 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-static cl::opt<bool>
-Thumb2RegScavenging("enable-thumb2-reg-scavenging",
- cl::Hidden,
- cl::desc("Enable register scavenging on Thumb-2"));
-
-Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii,
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
: ARMBaseRegisterInfo(tii, sti) {
}
@@ -46,710 +45,23 @@ Thumb2RegisterInfo::Thumb2RegisterInfo(const TargetInstrInfo &tii,
/// specified immediate.
void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const {
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx,
+ int Val,
+ ARMCC::CondCodes Pred,
+ unsigned PredReg) const {
MachineFunction &MF = *MBB.getParent();
MachineConstantPool *ConstantPool = MF.getConstantPool();
- Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+ Constant *C = ConstantInt::get(
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tLDRcp), DestReg)
- .addConstantPoolIndex(Idx);
-}
-
-const TargetRegisterClass*
-Thumb2RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, MVT VT) const {
- if (isARMLowRegister(Reg))
- return ARM::tGPRRegisterClass;
- switch (Reg) {
- default:
- break;
- case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
- case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC:
- return ARM::GPRRegisterClass;
- }
-
- return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT);
-}
-
-bool
-Thumb2RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- return Thumb2RegScavenging;
-}
-
-bool Thumb2RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
- const MachineFrameInfo *FFI = MF.getFrameInfo();
- unsigned CFSize = FFI->getMaxCallFrameSize();
- // It's not always a good idea to include the call frame as part of the
- // stack frame. ARM (especially Thumb) has only small immediate offsets
- // with which to address the stack frame, so a large call frame can cause
- // poor codegen and may even make it impossible to scavenge a register.
- if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
- return false;
-
- return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
-/// in a register using mov / mvn sequences or load the immediate from a
-/// constpool entry.
-static
-void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, bool CanChangeCC,
- const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isHigh = !isARMLowRegister(DestReg) ||
- (BaseReg != 0 && !isARMLowRegister(BaseReg));
- bool isSub = false;
- // Subtract doesn't have a high-register version. Load the negative value
- // if either the base or dest register is a high register. Also, do not
- // issue sub as part of the sequence if the condition register is to be
- // preserved.
- if (NumBytes < 0 && !isHigh && CanChangeCC) {
- isSub = true;
- NumBytes = -NumBytes;
- }
- unsigned LdReg = DestReg;
- if (DestReg == ARM::SP) {
- assert(BaseReg == ARM::SP && "Unexpected!");
- LdReg = ARM::R3;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- }
-
- if (NumBytes <= 255 && NumBytes >= 0)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- else if (NumBytes < 0 && NumBytes >= -255) {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), LdReg)
- .addReg(LdReg, RegState::Kill);
- } else
- MRI.emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, &TII, dl);
-
- // Emit add / sub.
- int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
- const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl,
- TII.get(Opc), DestReg);
- if (DestReg == ARM::SP || isSub)
- MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
- else
- MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- if (DestReg == ARM::SP)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
-}
-
-/// calcNumMI - Returns the number of instructions required to materialize
-/// the specific add / sub r, c instruction.
-static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
- unsigned NumBits, unsigned Scale) {
- unsigned NumMIs = 0;
- unsigned Chunk = ((1 << NumBits) - 1) * Scale;
-
- if (Opc == ARM::tADDrSPi) {
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- NumMIs++;
- NumBits = 8;
- Scale = 1; // Followed by a number of tADDi8.
- Chunk = ((1 << NumBits) - 1) * Scale;
- }
-
- NumMIs += Bytes / Chunk;
- if ((Bytes % Chunk) != 0)
- NumMIs++;
- if (ExtraOpc)
- NumMIs++;
- return NumMIs;
-}
-
-/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code.
-static
-void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, unsigned BaseReg,
- int NumBytes, const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isSub = NumBytes < 0;
- unsigned Bytes = (unsigned)NumBytes;
- if (isSub) Bytes = -NumBytes;
- bool isMul4 = (Bytes & 3) == 0;
- bool isTwoAddr = false;
- bool DstNotEqBase = false;
- unsigned NumBits = 1;
- unsigned Scale = 1;
- int Opc = 0;
- int ExtraOpc = 0;
-
- if (DestReg == BaseReg && BaseReg == ARM::SP) {
- assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
- NumBits = 7;
- Scale = 4;
- Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- isTwoAddr = true;
- } else if (!isSub && BaseReg == ARM::SP) {
- // r1 = add sp, 403
- // =>
- // r1 = add sp, 100 * 4
- // r1 = add r1, 3
- if (!isMul4) {
- Bytes &= ~3;
- ExtraOpc = ARM::tADDi3;
- }
- NumBits = 8;
- Scale = 4;
- Opc = ARM::tADDrSPi;
- } else {
- // sp = sub sp, c
- // r1 = sub sp, c
- // r8 = sub sp, c
- if (DestReg != BaseReg)
- DstNotEqBase = true;
- NumBits = 8;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
- }
-
- unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
- unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
- if (NumMIs > Threshold) {
- // This will expand into too many instructions. Load the immediate from a
- // constpool entry.
- emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII,
- MRI, dl);
- return;
- }
-
- if (DstNotEqBase) {
- if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
- // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
- unsigned Chunk = (1 << 3) - 1;
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- BuildMI(MBB, MBBI, dl,TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
- .addReg(BaseReg, RegState::Kill);
- }
- BaseReg = DestReg;
- }
-
- unsigned Chunk = ((1 << NumBits) - 1) * Scale;
- while (Bytes) {
- unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
- Bytes -= ThisVal;
- ThisVal /= Scale;
- // Build the new tADD / tSUB.
- if (isTwoAddr)
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(DestReg).addImm(ThisVal);
- else {
- bool isKill = BaseReg != ARM::SP;
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
- BaseReg = DestReg;
-
- if (Opc == ARM::tADDrSPi) {
- // r4 = add sp, imm
- // r4 = add r4, imm
- // ...
- NumBits = 8;
- Scale = 1;
- Chunk = ((1 << NumBits) - 1) * Scale;
- Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- isTwoAddr = true;
- }
- }
- }
-
- if (ExtraOpc)
- BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg)
- .addReg(DestReg, RegState::Kill)
- .addImm(((unsigned)NumBytes) & 3);
-}
-
-static void emitSPUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, DebugLoc dl,
- const Thumb2RegisterInfo &MRI,
- int NumBytes) {
- emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII,
- MRI, dl);
-}
-
-void Thumb2RegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- if (!hasReservedCallFrame(MF)) {
- // If we have alloca, convert as follows:
- // ADJCALLSTACKDOWN -> sub, sp, sp, amount
- // ADJCALLSTACKUP -> add, sp, sp, amount
- MachineInstr *Old = I;
- DebugLoc dl = Old->getDebugLoc();
- unsigned Amount = Old->getOperand(0).getImm();
- if (Amount != 0) {
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
-
- // Replace the pseudo instruction with a new instruction...
- unsigned Opc = Old->getOpcode();
- if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- emitSPUpdate(MBB, I, TII, dl, *this, -Amount);
- } else {
- assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
- emitSPUpdate(MBB, I, TII, dl, *this, Amount);
- }
- }
- }
- MBB.erase(I);
-}
-
-/// emitThumbConstant - Emit a series of instructions to materialize a
-/// constant.
-static void emitThumbConstant(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Imm,
- const TargetInstrInfo &TII,
- const Thumb2RegisterInfo& MRI,
- DebugLoc dl) {
- bool isSub = Imm < 0;
- if (isSub) Imm = -Imm;
-
- int Chunk = (1 << 8) - 1;
- int ThisVal = (Imm > Chunk) ? Chunk : Imm;
- Imm -= ThisVal;
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
- if (Imm > 0)
- emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
- if (isSub)
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tNEG), DestReg)
- .addReg(DestReg, RegState::Kill);
-}
-
-void Thumb2RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
- unsigned i = 0;
- MachineInstr &MI = *II;
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction &MF = *MBB.getParent();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc dl = MI.getDebugLoc();
-
- while (!MI.getOperand(i).isFI()) {
- ++i;
- assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
- }
-
- unsigned FrameReg = ARM::SP;
- int FrameIndex = MI.getOperand(i).getIndex();
- int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
- MF.getFrameInfo()->getStackSize() + SPAdj;
-
- if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea2Offset();
- else if (hasFP(MF)) {
- assert(SPAdj == 0 && "Unexpected");
- // There are alloca()'s in this function; we must reference off the frame
- // pointer instead.
- FrameReg = getFrameRegister(MF);
- Offset -= AFI->getFramePtrSpillOffset();
- }
-
- unsigned Opcode = MI.getOpcode();
- const TargetInstrDesc &Desc = MI.getDesc();
- unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
-
- if (Opcode == ARM::tADDrSPi) {
- Offset += MI.getOperand(i+1).getImm();
-
- // Can't use tADDrSPi if it's based off the frame pointer.
- unsigned NumBits = 0;
- unsigned Scale = 1;
- if (FrameReg != ARM::SP) {
- Opcode = ARM::tADDi3;
- MI.setDesc(TII.get(ARM::tADDi3));
- NumBits = 3;
- } else {
- NumBits = 8;
- Scale = 4;
- assert((Offset & 3) == 0 &&
- "Thumb add/sub sp, #imm immediate must be multiple of 4!");
- }
-
- if (Offset == 0) {
- // Turn it into a move.
- MI.setDesc(TII.get(ARM::tMOVhir2lor));
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.RemoveOperand(i+1);
- return;
- }
-
- // Common case: small offset, fits into instruction.
- unsigned Mask = (1 << NumBits) - 1;
- if (((Offset / Scale) & ~Mask) == 0) {
- // Replace the FrameIndex with sp / fp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
- return;
- }
-
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned Bytes = (Offset > 0) ? Offset : -Offset;
- unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
- // MI would expand into a large number of instructions. Don't try to
- // simplify the immediate.
- if (NumMIs > 2) {
- emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
- *this, dl);
- MBB.erase(II);
- return;
- }
-
- if (Offset > 0) {
- // Translate r0 = add sp, imm to
- // r0 = add sp, 255*4
- // r0 = add r0, (imm - 255*4)
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- MI.getOperand(i+1).ChangeToImmediate(Mask);
- Offset = (Offset - Mask * Scale);
- MachineBasicBlock::iterator NII = next(II);
- emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
- *this, dl);
- } else {
- // Translate r0 = add sp, -imm to
- // r0 = -imm (this is then translated into a series of instructions)
- // r0 = add r0, sp
- emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
- MI.setDesc(TII.get(ARM::tADDhirr));
- MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
- MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
- }
- return;
- } else {
- unsigned ImmIdx = 0;
- int InstrOffs = 0;
- unsigned NumBits = 0;
- unsigned Scale = 1;
- switch (AddrMode) {
- case ARMII::AddrModeT1_s: {
- ImmIdx = i+1;
- InstrOffs = MI.getOperand(ImmIdx).getImm();
- NumBits = (FrameReg == ARM::SP) ? 8 : 5;
- Scale = 4;
- break;
- }
- default:
- assert(0 && "Unsupported addressing mode!");
- abort();
- break;
- }
-
- Offset += InstrOffs * Scale;
- assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
-
- // Common case: small offset, fits into instruction.
- MachineOperand &ImmOp = MI.getOperand(ImmIdx);
- int ImmedOffset = Offset / Scale;
- unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
- // Replace the FrameIndex with sp
- MI.getOperand(i).ChangeToRegister(FrameReg, false);
- ImmOp.ChangeToImmediate(ImmedOffset);
- return;
- }
-
- bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
- if (AddrMode == ARMII::AddrModeT1_s) {
- // Thumb tLDRspi, tSTRspi. These will change to instructions that use
- // a different base register.
- NumBits = 5;
- Mask = (1 << NumBits) - 1;
- }
- // If this is a thumb spill / restore, we will be using a constpool load to
- // materialize the offset.
- if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore)
- ImmOp.ChangeToImmediate(0);
- else {
- // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
- ImmedOffset = ImmedOffset & Mask;
- ImmOp.ChangeToImmediate(ImmedOffset);
- Offset &= ~(Mask*Scale);
- }
- }
-
- // If we get here, the immediate doesn't fit into the instruction. We folded
- // as much as possible above, handle the rest, providing a register that is
- // SP+LargeImm.
- assert(Offset && "This code isn't needed if offset already handled!");
-
- if (Desc.mayLoad()) {
- // Use the destination register to materialize sp + offset.
- unsigned TmpReg = MI.getOperand(0).getReg();
- bool UseRR = false;
- if (Opcode == ARM::tRestore) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tLDR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR)
- // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tLDR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
- } else if (Desc.mayStore()) {
- // FIXME! This is horrific!!! We need register scavenging.
- // Our temporary workaround has marked r3 unavailable. Of course, r3 is
- // also an ABI register, so it's possible that it is the register being
- // stored here. If that's the case, we do the following:
- // r12 = r2
- // Use r2 to materialize sp + offset
- // str r3, r2
- // r2 = r12
- unsigned ValReg = MI.getOperand(0).getReg();
- unsigned TmpReg = ARM::R3;
- bool UseRR = false;
- if (ValReg == ARM::R3) {
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R2, RegState::Kill);
- TmpReg = ARM::R2;
- }
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, II, dl, TII.get(ARM::tMOVlor2hir), ARM::R12)
- .addReg(ARM::R3, RegState::Kill);
- if (Opcode == ARM::tSpill) {
- if (FrameReg == ARM::SP)
- emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
- Offset, false, TII, *this, dl);
- else {
- emitLoadConstPool(MBB, II, TmpReg, Offset, &TII, dl);
- UseRR = true;
- }
- } else
- emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
- *this, dl);
- MI.setDesc(TII.get(ARM::tSTR));
- MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
- if (UseRR) // Use [reg, reg] addrmode.
- MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
- else // tSTR has an extra register operand.
- MI.addOperand(MachineOperand::CreateReg(0, false));
-
- MachineBasicBlock::iterator NII = next(II);
- if (ValReg == ARM::R3)
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R2)
- .addReg(ARM::R12, RegState::Kill);
- if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
- BuildMI(MBB, NII, dl, TII.get(ARM::tMOVhir2lor), ARM::R3)
- .addReg(ARM::R12, RegState::Kill);
- } else
- assert(false && "Unexpected opcode!");
-}
-
-void Thumb2RegisterInfo::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- unsigned NumBytes = MFI->getStackSize();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = (MBBI != MBB.end() ?
- MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
-
- // Check if R3 is live in. It might have to be used as a scratch register.
- for (MachineRegisterInfo::livein_iterator I =MF.getRegInfo().livein_begin(),
- E = MF.getRegInfo().livein_end(); I != E; ++I) {
- if (I->first == ARM::R3) {
- AFI->setR3IsLiveIn(true);
- break;
- }
- }
-
- // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
- NumBytes = (NumBytes + 3) & ~3;
- MFI->setStackSize(NumBytes);
-
- // Determine the sizes of each callee-save spill areas and record which frame
- // belongs to which callee-save spill areas.
- unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
- int FramePtrSpillFI = 0;
-
- if (VARegSaveSize)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize);
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
- return;
- }
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- int FI = CSI[i].getFrameIdx();
- switch (Reg) {
- case ARM::R4:
- case ARM::R5:
- case ARM::R6:
- case ARM::R7:
- case ARM::LR:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetDarwin()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
- GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
- GPRCS1Size += 4;
- }
- break;
- default:
- AFI->addDPRCalleeSavedAreaFrame(FI);
- DPRCSSize += 8;
- }
- }
-
- if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
- ++MBBI;
- if (MBBI != MBB.end())
- dl = MBBI->getDebugLoc();
- }
-
- // Darwin ABI requires FP to point to the stack slot that contains the
- // previous FP.
- if (STI.isTargetDarwin() || hasFP(MF)) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0);
- }
-
- // Determine starting offsets of spill areas.
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
- unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
- unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
- AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
- AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
- AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
- AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
- NumBytes = DPRCSOffset;
- if (NumBytes) {
- // Insert it after all the callee-save spills.
- emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
- }
-
- if (STI.isTargetELF() && hasFP(MF)) {
- MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
- AFI->getFramePtrSpillOffset());
- }
-
- AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
- AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
- AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0);
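+ // Sketch of the result: a constant-pool entry holding Val is created and
+ //   t2LDRpci DestReg, <cp#Idx>
+ // loads it; the trailing ARMCC::AL / reg0 pair is the default predicate.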
}
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
- return (MI->getOpcode() == ARM::tRestore &&
- MI->getOperand(1).isFI() &&
- isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
-}
-
-void Thumb2RegisterInfo::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
- assert((MBBI->getOpcode() == ARM::tBX_RET ||
- MBBI->getOpcode() == ARM::tPOP_RET) &&
- "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
- int NumBytes = (int)MFI->getStackSize();
-
- if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
- } else {
- // Unwind MBBI to point to first LDR / FLDD.
- const unsigned *CSRegs = getCalleeSavedRegs();
- if (MBBI != MBB.begin()) {
- do
- --MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
- if (!isCSRestore(MBBI, CSRegs))
- ++MBBI;
- }
-
- // Move SP to start of FP callee save spill area.
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
- AFI->getGPRCalleeSavedArea2Size() +
- AFI->getDPRCalleeSavedAreaSize());
-
- if (hasFP(MF)) {
- NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
- // Reset SP based on frame pointer only if the stack frame extends beyond
- // frame pointer stack slot or target is ELF and the function has FP.
- if (NumBytes)
- emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
- TII, *this, dl);
- else
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVlor2hir), ARM::SP)
- .addReg(FramePtr);
- } else {
- if (MBBI->getOpcode() == ARM::tBX_RET &&
- &MBB.front() != MBBI &&
- prior(MBBI)->getOpcode() == ARM::tPOP) {
- MachineBasicBlock::iterator PMBBI = prior(MBBI);
- emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes);
- } else
- emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
- }
- }
-
- if (VARegSaveSize) {
- // Epilogue for vararg functions: pop LR to R3 and branch off it.
- // FIXME: Verify this is still ok when R3 is no longer being reserved.
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)).addReg(ARM::R3);
-
- emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
-
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
- MBB.erase(MBBI);
- }
+bool Thumb2RegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
}
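
Note on the hunk above: the Thumb2 emitLoadConstPool now funnels every immediate through a predicable t2LDRpci load from the constant pool. A minimal sketch of the full sequence, assuming the usual MachineConstantPool plumbing around the BuildMI call that survives in the diff (MF, MBB, MBBI, dl, DestReg, SubIdx and Val in scope):

    // Sketch only: materialize a 32-bit immediate in Thumb2 via the
    // constant pool. Not the verbatim function body from this patch.
    MachineConstantPool *ConstantPool = MF.getConstantPool();
    Constant *C = ConstantInt::get(
        Type::getInt32Ty(MF.getFunction()->getContext()), Val);
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); // 4-byte align
    BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
      .addReg(DestReg, getDefRegState(true), SubIdx)
      .addConstantPoolIndex(Idx)
      .addImm((int64_t)ARMCC::AL).addReg(0);                 // AL predicate
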
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index d379c31..a63c60b 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -20,40 +20,23 @@
namespace llvm {
class ARMSubtarget;
- class TargetInstrInfo;
+ class ARMBaseInstrInfo;
class Type;
struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb2RegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
void emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- unsigned DestReg, int Val,
- const TargetInstrInfo *TII,
- DebugLoc dl) const;
-
- /// Code Generation virtual methods...
- const TargetRegisterClass *
- getPhysicalRegisterRegClass(unsigned Reg, MVT VT = MVT::Other) const;
-
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) const;
bool requiresRegisterScavenging(const MachineFunction &MF) const;
-
- bool hasReservedCallFrame(MachineFunction &MF) const;
-
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
-
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 0000000..b8879d2
--- /dev/null
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,685 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "t2-reduce-size"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+ cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+ cl::init(-1), cl::Hidden);
+
+namespace {
+ /// ReduceTable - A static table with information on mapping from wide
+ /// opcodes to narrow ones.
+ struct ReduceEntry {
+ unsigned WideOpc; // Wide opcode
+ unsigned NarrowOpc1; // Narrow opcode to transform to
+ unsigned NarrowOpc2; // Narrow opcode when it's two-address
+ uint8_t Imm1Limit; // Limit of immediate field (bits)
+ uint8_t Imm2Limit; // Limit of immediate field when it's two-address
+ unsigned LowRegs1 : 1; // Only possible if low-registers are used
+ unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+ unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa.
+ // 1 - No cc field.
+ // 2 - Always set CPSR.
+ unsigned PredCC2 : 2;
+ unsigned Special : 1; // Needs to be dealt with specially
+ };
+
+ static const ReduceEntry ReduceTable[] = {
+ // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S
+ { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 },
+ { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 },
+ // Note: immediate scale is 4.
+ { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0 },
+ { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 },
+ { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 },
+ { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0 },
+ { ARM::t2CMPzri,ARM::tCMPzi8, 0, 8, 0, 1, 0, 2,0, 0 },
+ { ARM::t2CMPzrr,ARM::tCMPzhir,0, 0, 0, 0, 0, 2,0, 0 },
+ { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 },
+ // FIXME: adr.n immediate offset must be multiple of 4.
+ //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 },
+ { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
+ { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
+ // FIXME: Do we need the 16-bit 'S' variant?
+ { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 },
+ { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 },
+ { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 0, 0,1, 0 },
+ { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 },
+ { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 },
+ { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 },
+ { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 },
+ { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 },
+ { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 },
+ { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 },
+ { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 },
+ { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 },
+
+ // FIXME: Clean this up after splitting each Thumb load / store opcode
+ // into multiple ones.
+ { ARM::t2LDRi12,ARM::tLDR, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRs, ARM::tLDR, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRBi12,ARM::tLDRB, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRBs, ARM::tLDRB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRHi12,ARM::tLDRH, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRHs, ARM::tLDRH, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRi12,ARM::tSTR, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRs, ARM::tSTR, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRBi12,ARM::tSTRB, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRBs, ARM::tSTRB, 0, 0, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRHi12,ARM::tSTRH, 0, 5, 0, 1, 0, 0,0, 1 },
+ { ARM::t2STRHs, ARM::tSTRH, 0, 0, 0, 1, 0, 0,0, 1 },
+
+ { ARM::t2LDM_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 },
+ { ARM::t2LDM, ARM::tLDM, ARM::tPOP, 0, 0, 1, 1, 1,1, 1 },
+ { ARM::t2STM, ARM::tSTM, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 },
+ };
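+
+  // Reading the table: the row { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8,
+  // 3, 8, 1, 1, 0,0, 0 } says a 32-bit ADD-immediate narrows to tADDi3
+  // (3-bit immediate) in the three-operand form, or to tADDi8 (8-bit
+  // immediate) in the two-address form, and both require low registers.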
+
+ class VISIBILITY_HIDDEN Thumb2SizeReduce : public MachineFunctionPass {
+ public:
+ static char ID;
+ Thumb2SizeReduce();
+
+ const Thumb2InstrInfo *TII;
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "Thumb2 instruction size reduction pass";
+ }
+
+ private:
+ /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+ DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+ bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+ bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry);
+
+ bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry, bool LiveCPSR);
+
+ /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+ /// instruction.
+ bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR);
+
+ /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+ /// non-two-address instruction.
+ bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR);
+
+ /// ReduceMBB - Reduce width of instructions in the specified basic block.
+ bool ReduceMBB(MachineBasicBlock &MBB);
+ };
+ char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) {
+ for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+ unsigned FromOpc = ReduceTable[i].WideOpc;
+ if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
+ assert(false && "Duplicated entries?");
+ }
+}
+
+static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
+ for (const unsigned *Regs = TID.ImplicitDefs; *Regs; ++Regs)
+ if (*Regs == ARM::CPSR)
+ return true;
+ return false;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead) {
+ if ((is2Addr && Entry.PredCC2 == 0) ||
+ (!is2Addr && Entry.PredCC1 == 0)) {
+ if (Pred == ARMCC::AL) {
+ // Not predicated, must set CPSR.
+ if (!HasCC) {
+ // Original instruction was not setting CPSR, but CPSR is not
+ // currently live anyway. It's ok to set it. The CPSR def is
+ // dead though.
+ if (!LiveCPSR) {
+ HasCC = true;
+ CCDead = true;
+ return true;
+ }
+ return false;
+ }
+ } else {
+ // Predicated, must not set CPSR.
+ if (HasCC)
+ return false;
+ }
+ } else if ((is2Addr && Entry.PredCC2 == 2) ||
+ (!is2Addr && Entry.PredCC1 == 2)) {
+ // The old opcode has an optional def of CPSR.
+ if (HasCC)
+ return true;
+ // If the old opcode does not have an implicit CPSR def, it's not OK to
+ // narrow, since the new opcode's CPSR def is not meant to be thrown
+ // away. e.g. CMP.
+ if (!HasImplicitCPSRDef(MI->getDesc()))
+ return false;
+ HasCC = true;
+ } else {
+ // 16-bit instruction does not set CPSR.
+ if (HasCC)
+ return false;
+ }
+
+ return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ bool isPCOk = (Opc == ARM::t2LDM_RET) || (Opc == ARM::t2LDM);
+ bool isLROk = (Opc == ARM::t2STM);
+ bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi);
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || Reg == ARM::CPSR)
+ continue;
+ if (isPCOk && Reg == ARM::PC)
+ continue;
+ if (isLROk && Reg == ARM::LR)
+ continue;
+ if (isSPOk && Reg == ARM::SP)
+ continue;
+ if (!isARMLowRegister(Reg))
+ return false;
+ }
+ return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry) {
+ if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+ return false;
+
+ unsigned Scale = 1;
+ bool HasImmOffset = false;
+ bool HasShift = false;
+ bool isLdStMul = false;
+ unsigned Opc = Entry.NarrowOpc1;
+ unsigned OpNum = 3; // First 'rest' of operands.
+ switch (Entry.WideOpc) {
+ default:
+ llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+ case ARM::t2LDRi12:
+ case ARM::t2STRi12:
+ Scale = 4;
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRBi12:
+ case ARM::t2STRBi12:
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRHi12:
+ case ARM::t2STRHi12:
+ Scale = 2;
+ HasImmOffset = true;
+ break;
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRSHs:
+ case ARM::t2STRs:
+ case ARM::t2STRBs:
+ case ARM::t2STRHs:
+ HasShift = true;
+ OpNum = 4;
+ break;
+ case ARM::t2LDM_RET:
+ case ARM::t2LDM:
+ case ARM::t2STM: {
+ OpNum = 0;
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ unsigned Mode = MI->getOperand(1).getImm();
+ if (BaseReg == ARM::SP && ARM_AM::getAM4WBFlag(Mode)) {
+ Opc = Entry.NarrowOpc2;
+ OpNum = 2;
+ } else if (Entry.WideOpc == ARM::t2LDM_RET ||
+ !isARMLowRegister(BaseReg) ||
+ !ARM_AM::getAM4WBFlag(Mode) ||
+ ARM_AM::getAM4SubMode(Mode) != ARM_AM::ia) {
+ return false;
+ }
+ isLdStMul = true;
+ break;
+ }
+ }
+
+ unsigned OffsetReg = 0;
+ bool OffsetKill = false;
+ if (HasShift) {
+ OffsetReg = MI->getOperand(2).getReg();
+ OffsetKill = MI->getOperand(2).isKill();
+ if (MI->getOperand(3).getImm())
+ // Thumb1 addressing mode doesn't support shift.
+ return false;
+ }
+
+ unsigned OffsetImm = 0;
+ if (HasImmOffset) {
+ OffsetImm = MI->getOperand(2).getImm();
+ unsigned MaxOffset = ((1 << Entry.Imm1Limit) - 1) * Scale;
+ if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset)
+ // Make sure the immediate field fits.
+ return false;
+ }
+
+ // Add the 16-bit load / store instruction.
+ // FIXME: Thumb1 addressing mode encodes both immediate and register offset.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
+ if (!isLdStMul) {
+ MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1));
+ if (Entry.NarrowOpc1 != ARM::tLDRSB && Entry.NarrowOpc1 != ARM::tLDRSH) {
+ // tLDRSB and tLDRSH do not have an immediate offset field. On the other
+ // hand, they must have an offset register.
+ // FIXME: Remove this special case.
+ MIB.addImm(OffsetImm/Scale);
+ }
+ assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+ MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+ }
+
+ // Transfer the rest of operands.
+ for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+ MIB.addOperand(MI->getOperand(OpNum));
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++NumLdSts;
+ return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+ if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+ return false;
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (TID.mayLoad() || TID.mayStore())
+ return ReduceLoadStore(MBB, MI, Entry);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri:
+ case ARM::t2ADDSrr: {
+ unsigned PredReg = 0;
+ if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+ switch (Opc) {
+ default: break;
+ case ARM::t2ADDSri: {
+ if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
+ return true;
+ // fallthrough
+ }
+ case ARM::t2ADDSrr:
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+ }
+ }
+ break;
+ }
+ case ARM::t2RSBri:
+ case ARM::t2RSBSri:
+ if (MI->getOperand(2).getImm() == 0)
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+ break;
+ }
+ return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+
+ if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+ return false;
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned Reg0 = MI->getOperand(0).getReg();
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ if (Reg0 != Reg1)
+ return false;
+ if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+ return false;
+ if (Entry.Imm2Limit) {
+ unsigned Imm = MI->getOperand(2).getImm();
+ unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+ if (Imm > Limit)
+ return false;
+ } else {
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+ return false;
+ }
+
+ // Check if it's possible / necessary to transfer the predicate.
+ const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc2);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ bool SkipPred = false;
+ if (Pred != ARMCC::AL) {
+ if (!NewTID.isPredicable())
+ // Can't transfer predicate, fail.
+ return false;
+ } else {
+ SkipPred = !NewTID.isPredicable();
+ }
+
+ bool HasCC = false;
+ bool CCDead = false;
+ if (TID.hasOptionalDef()) {
+ unsigned NumOps = TID.getNumOperands();
+ HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+ if (HasCC && MI->getOperand(NumOps-1).isDead())
+ CCDead = true;
+ }
+ if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+ return false;
+
+ // Add the 16-bit instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+ MIB.addOperand(MI->getOperand(0));
+ if (NewTID.hasOptionalDef()) {
+ if (HasCC)
+ AddDefaultT1CC(MIB, CCDead);
+ else
+ AddNoT1CC(MIB);
+ }
+
+ // Transfer the rest of operands.
+ unsigned NumOps = TID.getNumOperands();
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+ continue;
+ if (SkipPred && TID.OpInfo[i].isPredicate())
+ continue;
+ MIB.addOperand(MI->getOperand(i));
+ }
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++Num2Addrs;
+ return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+ const ReduceEntry &Entry,
+ bool LiveCPSR) {
+ if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+ return false;
+
+ unsigned Limit = ~0U;
+ unsigned Scale = (Entry.WideOpc == ARM::t2ADDrSPi) ? 4 : 1;
+ if (Entry.Imm1Limit)
+ Limit = ((1 << Entry.Imm1Limit) - 1) * Scale;
+
+ const TargetInstrDesc &TID = MI->getDesc();
+ for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+ if (TID.OpInfo[i].isPredicate())
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (!Reg || Reg == ARM::CPSR)
+ continue;
+ if (Entry.WideOpc == ARM::t2ADDrSPi && Reg == ARM::SP)
+ continue;
+ if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+ return false;
+ } else if (MO.isImm() &&
+ !TID.OpInfo[i].isPredicate()) {
+ if (((unsigned)MO.getImm()) > Limit || (MO.getImm() & (Scale-1)) != 0)
+ return false;
+ }
+ }
+
+ // Check if it's possible / necessary to transfer the predicate.
+ const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc1);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+ bool SkipPred = false;
+ if (Pred != ARMCC::AL) {
+ if (!NewTID.isPredicable())
+ // Can't transfer predicate, fail.
+ return false;
+ } else {
+ SkipPred = !NewTID.isPredicable();
+ }
+
+ bool HasCC = false;
+ bool CCDead = false;
+ if (TID.hasOptionalDef()) {
+ unsigned NumOps = TID.getNumOperands();
+ HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+ if (HasCC && MI->getOperand(NumOps-1).isDead())
+ CCDead = true;
+ }
+ if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+ return false;
+
+ // Add the 16-bit instruction.
+ DebugLoc dl = MI->getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+ MIB.addOperand(MI->getOperand(0));
+ if (NewTID.hasOptionalDef()) {
+ if (HasCC)
+ AddDefaultT1CC(MIB, CCDead);
+ else
+ AddNoT1CC(MIB);
+ }
+
+ // Transfer the rest of operands.
+ unsigned NumOps = TID.getNumOperands();
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+ if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+ continue;
+ if ((TID.getOpcode() == ARM::t2RSBSri ||
+ TID.getOpcode() == ARM::t2RSBri) && i == 2)
+ // Skip the zero immediate operand, it's now implicit.
+ continue;
+ bool isPred = (i < NumOps && TID.OpInfo[i].isPredicate());
+ if (SkipPred && isPred)
+ continue;
+ const MachineOperand &MO = MI->getOperand(i);
+ if (Scale > 1 && !isPred && MO.isImm())
+ MIB.addImm(MO.getImm() / Scale);
+ else {
+ if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+ // Skip implicit def of CPSR. Either it's modeled as an optional
+ // def now or it's already an implicit def on the new instruction.
+ continue;
+ MIB.addOperand(MO);
+ }
+ }
+ if (!TID.isPredicable() && NewTID.isPredicable())
+ AddDefaultPred(MIB);
+
+ DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB);
+
+ MBB.erase(MI);
+ ++NumNarrows;
+ return true;
+}
+
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
+ bool HasDef = false;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isUndef() || MO.isUse())
+ continue;
+ if (MO.getReg() != ARM::CPSR)
+ continue;
+ if (!MO.isDead())
+ HasDef = true;
+ }
+
+ return HasDef || LiveCPSR;
+}
+
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isUndef() || MO.isDef())
+ continue;
+ if (MO.getReg() != ARM::CPSR)
+ continue;
+ assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+ if (MO.isKill()) {
+ LiveCPSR = false;
+ break;
+ }
+ }
+
+ return LiveCPSR;
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ bool LiveCPSR = false;
+ // Yes, CPSR could be livein.
+ for (MachineBasicBlock::const_livein_iterator I = MBB.livein_begin(),
+ E = MBB.livein_end(); I != E; ++I) {
+ if (*I == ARM::CPSR) {
+ LiveCPSR = true;
+ break;
+ }
+ }
+
+ MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator NextMII;
+ for (; MII != E; MII = NextMII) {
+ NextMII = next(MII);
+
+ MachineInstr *MI = &*MII;
+ LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+ if (OPI != ReduceOpcodeMap.end()) {
+ const ReduceEntry &Entry = ReduceTable[OPI->second];
+ // Ignore "special" cases for now.
+ if (Entry.Special) {
+ if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ }
+ goto ProcessNext;
+ }
+
+ // Try to transform to a 16-bit two-address instruction.
+ if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ goto ProcessNext;
+ }
+
+ // Try to transform to a 16-bit non-two-address instruction.
+ if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
+ Modified = true;
+ MachineBasicBlock::iterator I = prior(NextMII);
+ MI = &*I;
+ }
+ }
+
+ ProcessNext:
+ LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
+ }
+
+ return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+ bool Modified = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ Modified |= ReduceMBB(*I);
+ return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+ return new Thumb2SizeReduce();
+}
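
Everything in the new pass hangs off ReduceTable: a 32-bit instruction narrows only when its registers and immediate fit the 16-bit encoding. The immediate feasibility test is tiny; a standalone sketch of it, mirroring ReduceToNarrow above (Scale covers t2ADDrSPi's implicit multiply-by-4):

    // Sketch: does Imm fit the narrow encoding a ReduceEntry describes?
    static bool immFitsNarrow(unsigned Imm, unsigned Imm1Limit,
                              unsigned Scale) {
      unsigned Limit = ~0U;              // register forms carry no limit
      if (Imm1Limit)
        Limit = ((1u << Imm1Limit) - 1) * Scale;
      // Must be Scale-aligned and within the narrow field's range.
      return (Imm & (Scale - 1)) == 0 && Imm <= Limit;
    }
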
diff --git a/lib/Target/Alpha/Alpha.h b/lib/Target/Alpha/Alpha.h
index 0818e25..b8a0645 100644
--- a/lib/Target/Alpha/Alpha.h
+++ b/lib/Target/Alpha/Alpha.h
@@ -22,20 +22,22 @@ namespace llvm {
class AlphaTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
- class raw_ostream;
+ class ObjectCodeEmitter;
+ class formatted_raw_ostream;
FunctionPass *createAlphaISelDag(AlphaTargetMachine &TM);
- FunctionPass *createAlphaCodePrinterPass(raw_ostream &OS,
- TargetMachine &TM,
- bool Verbose);
FunctionPass *createAlphaPatternInstructionSelector(TargetMachine &TM);
FunctionPass *createAlphaCodeEmitterPass(AlphaTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
- JITCodeEmitter &JCE);
+ JITCodeEmitter &JCE);
+ FunctionPass *createAlphaObjectCodeEmitterPass(AlphaTargetMachine &TM,
+ ObjectCodeEmitter &OCE);
FunctionPass *createAlphaLLRPPass(AlphaTargetMachine &tm);
FunctionPass *createAlphaBranchSelectionPass();
+ extern Target TheAlphaTarget;
+
} // end namespace llvm;
// Defines symbolic names for Alpha registers. This defines a mapping from
diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td
index e3748c6..6efdf55 100644
--- a/lib/Target/Alpha/Alpha.td
+++ b/lib/Target/Alpha/Alpha.td
@@ -30,6 +30,12 @@ def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true",
include "AlphaRegisterInfo.td"
//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaCallingConv.td"
+
+//===----------------------------------------------------------------------===//
// Schedule Description
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp
index aca8ca7..719ffae 100644
--- a/lib/Target/Alpha/AlphaBranchSelector.cpp
+++ b/lib/Target/Alpha/AlphaBranchSelector.cpp
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
namespace {
diff --git a/lib/Target/Alpha/AlphaCallingConv.td b/lib/Target/Alpha/AlphaCallingConv.td
new file mode 100644
index 0000000..38ada69
--- /dev/null
+++ b/lib/Target/Alpha/AlphaCallingConv.td
@@ -0,0 +1,37 @@
+//===- AlphaCallingConv.td - Calling Conventions for Alpha -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the Alpha architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Alpha Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_Alpha : CallingConv<[
+ // i64 is returned in register R0
+ CCIfType<[i64], CCAssignToReg<[R0]>>,
+
+ // f32 / f64 are returned in F0/F1
+ CCIfType<[f32, f64], CCAssignToReg<[F0, F1]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Alpha Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_Alpha : CallingConv<[
+ // The first 6 arguments are passed in registers, whether integer or
+ // floating-point
+ CCIfType<[i64], CCAssignToRegWithShadow<[R16, R17, R18, R19, R20, R21],
+ [F16, F17, F18, F19, F20, F21]>>,
+
+ CCIfType<[f32, f64], CCAssignToRegWithShadow<[F16, F17, F18, F19, F20, F21],
+ [R16, R17, R18, R19, R20, R21]>>,
+
+ // Stack slots are 8 bytes in size and 8-byte aligned.
+ CCIfType<[i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
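
The WithShadow pairs above keep the integer and floating-point argument registers in lockstep: claiming R18 for argument 2 also burns F18, so slot i always maps to R16+i or F16+i. A hand-rolled sketch of the rule the TableGen encodes (assignAlphaArg is a hypothetical helper, not generated code):

    // Sketch: Alpha argument assignment as described above -- six
    // by-position register slots, then 8-byte aligned stack slots.
    struct AlphaArgLoc { bool InReg; unsigned Slot; };

    static AlphaArgLoc assignAlphaArg(unsigned ArgNo) {
      AlphaArgLoc Loc;
      if (ArgNo < 6) {
        Loc.InReg = true;
        Loc.Slot = ArgNo;             // index into R16..R21 or F16..F21
      } else {
        Loc.InReg = false;
        Loc.Slot = (ArgNo - 6) * 8;   // byte offset of the stack slot
      }
      return Loc;
    }
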
diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp
index f50f007..8023add 100644
--- a/lib/Target/Alpha/AlphaCodeEmitter.cpp
+++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp
@@ -19,16 +19,19 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Function.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
-
+
class AlphaCodeEmitter {
MachineCodeEmitter &MCE;
public:
@@ -57,7 +60,7 @@ namespace {
public:
static char ID;
explicit Emitter(TargetMachine &tm, CodeEmitter &mce)
- : MachineFunctionPass(&ID), AlphaCodeEmitter(mce),
+ : MachineFunctionPass(&ID), AlphaCodeEmitter(mce),
II(0), TM(tm), MCE(mce) {}
Emitter(TargetMachine &tm, CodeEmitter &mce, const AlphaInstrInfo& ii)
: MachineFunctionPass(&ID), AlphaCodeEmitter(mce),
@@ -69,8 +72,6 @@ namespace {
return "Alpha Machine Code Emitter";
}
- void emitInstruction(const MachineInstr &MI);
-
private:
void emitBasicBlock(MachineBasicBlock &MBB);
};
@@ -91,6 +92,10 @@ FunctionPass *llvm::createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
JITCodeEmitter &JCE) {
return new Emitter<JITCodeEmitter>(TM, JCE);
}
+FunctionPass *llvm::createAlphaObjectCodeEmitterPass(AlphaTargetMachine &TM,
+ ObjectCodeEmitter &OCE) {
+ return new Emitter<ObjectCodeEmitter>(TM, OCE);
+}
template <class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
@@ -111,6 +116,7 @@ void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
const MachineInstr &MI = *I;
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
switch(MI.getOpcode()) {
default:
MCE.emitWordLE(getBinaryCodeForInstr(*I));
@@ -119,8 +125,10 @@ void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
case Alpha::PCLABEL:
case Alpha::MEMLABEL:
case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::KILL:
break; //skip these
}
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
}
}
@@ -159,13 +167,12 @@ static unsigned getAlphaRegNumber(unsigned Reg) {
case Alpha::R30 : case Alpha::F30 : return 30;
case Alpha::R31 : case Alpha::F31 : return 31;
default:
- assert(0 && "Unhandled reg");
- abort();
+ llvm_unreachable("Unhandled reg");
}
}
unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
- const MachineOperand &MO) {
+ const MachineOperand &MO) {
unsigned rv = 0; // Return value; defaults to 0 for unhandled cases
// or things that get fixed up later by the JIT.
@@ -175,7 +182,7 @@ unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
} else if (MO.isImm()) {
rv = MO.getImm();
} else if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
- DOUT << MO << " is a relocated op for " << MI << "\n";
+ DEBUG(errs() << MO << " is a relocated op for " << MI << "\n");
unsigned Reloc = 0;
int Offset = 0;
bool useGOT = false;
@@ -211,8 +218,7 @@ unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
Offset = MI.getOperand(3).getImm();
break;
default:
- assert(0 && "unknown relocatable instruction");
- abort();
+ llvm_unreachable("unknown relocatable instruction");
}
if (MO.isGlobal())
MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
@@ -229,14 +235,14 @@ unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
} else if (MO.isMBB()) {
MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
Alpha::reloc_bsr, MO.getMBB()));
- }else {
- cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
- abort();
+ } else {
+#ifndef NDEBUG
+ errs() << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+#endif
+ llvm_unreachable(0);
}
return rv;
}
#include "AlphaGenCodeEmitter.inc"
-
-
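
Besides the error-handling churn, the functional change to the Alpha emitter is debug-location bookkeeping: every instruction is now bracketed by processDebugLoc calls, with the boolean marking entry versus exit. In miniature, the per-instruction loop body becomes:

    // Sketch of the emission loop after this patch; Alpha encodings are a
    // fixed 32 bits, hence the single little-endian word per instruction.
    MCE.processDebugLoc(MI.getDebugLoc(), true);   // entering MI's location
    MCE.emitWordLE(getBinaryCodeForInstr(MI));     // emit the encoded word
    MCE.processDebugLoc(MI.getDebugLoc(), false);  // leaving MI's location
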
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
index e3f631a..e3587fb 100644
--- a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -26,9 +26,12 @@
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalValue.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
@@ -114,7 +117,7 @@ namespace {
uint64_t complow = 1 << (63 - at);
uint64_t comphigh = 1 << (64 - at);
//cerr << x << ":" << complow << ":" << comphigh << "\n";
- if (abs(complow - x) <= abs(comphigh - x))
+ if (abs64(complow - x) <= abs64(comphigh - x))
return complow;
else
return comphigh;
@@ -208,7 +211,6 @@ private:
/// GOT address into a register.
///
SDNode *AlphaDAGToDAGISel::getGlobalBaseReg() {
- MachineFunction *MF = BB->getParent();
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
}
@@ -216,7 +218,6 @@ SDNode *AlphaDAGToDAGISel::getGlobalBaseReg() {
/// getGlobalRetAddr - Grab the return address.
///
SDNode *AlphaDAGToDAGISel::getGlobalRetAddr() {
- MachineFunction *MF = BB->getParent();
unsigned GlobalRetAddr = getInstrInfo()->getGlobalRetAddr(MF);
return CurDAG->getRegister(GlobalRetAddr, TLI.getPointerTy()).getNode();
}
@@ -269,8 +270,8 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, N0,
Chain.getValue(1));
SDNode *CNode =
- CurDAG->getTargetNode(Alpha::JSRs, dl, MVT::Other, MVT::Flag,
- Chain, Chain.getValue(1));
+ CurDAG->getMachineNode(Alpha::JSRs, dl, MVT::Other, MVT::Flag,
+ Chain, Chain.getValue(1));
Chain = CurDAG->getCopyFromReg(Chain, dl, Alpha::R27, MVT::i64,
SDValue(CNode, 1));
return CurDAG->SelectNodeTo(N, Alpha::BISr, MVT::i64, Chain, Chain);
@@ -278,8 +279,8 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
case ISD::READCYCLECOUNTER: {
SDValue Chain = N->getOperand(0);
- return CurDAG->getTargetNode(Alpha::RPCC, dl, MVT::i64, MVT::Other,
- Chain);
+ return CurDAG->getMachineNode(Alpha::RPCC, dl, MVT::i64, MVT::Other,
+ Chain);
}
case ISD::Constant: {
@@ -302,10 +303,11 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
// val32 >= IMM_LOW + IMM_LOW * IMM_MULT) //always true
break; //(zext (LDAH (LDA)))
//Else use the constant pool
- ConstantInt *C = ConstantInt::get(Type::Int64Ty, uval);
+ ConstantInt *C = ConstantInt::get(
+ Type::getInt64Ty(*CurDAG->getContext()), uval);
SDValue CPI = CurDAG->getTargetConstantPool(C, MVT::i64);
- SDNode *Tmp = CurDAG->getTargetNode(Alpha::LDAHr, dl, MVT::i64, CPI,
- SDValue(getGlobalBaseReg(), 0));
+ SDNode *Tmp = CurDAG->getMachineNode(Alpha::LDAHr, dl, MVT::i64, CPI,
+ SDValue(getGlobalBaseReg(), 0));
return CurDAG->SelectNodeTo(N, Alpha::LDQr, MVT::i64, MVT::Other,
CPI, SDValue(Tmp, 0), CurDAG->getEntryNode());
}
@@ -313,7 +315,7 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
case ISD::ConstantFP: {
ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
bool isDouble = N->getValueType(0) == MVT::f64;
- MVT T = isDouble ? MVT::f64 : MVT::f32;
+ EVT T = isDouble ? MVT::f64 : MVT::f32;
if (CN->getValueAPF().isPosZero()) {
return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYST : Alpha::CPYSS,
T, CurDAG->getRegister(Alpha::F31, T),
@@ -323,7 +325,7 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
T, CurDAG->getRegister(Alpha::F31, T),
CurDAG->getRegister(Alpha::F31, T));
} else {
- abort();
+ llvm_report_error("Unhandled FP constant type");
}
break;
}
@@ -336,7 +338,7 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
bool rev = false;
bool inv = false;
switch(CC) {
- default: DEBUG(N->dump(CurDAG)); assert(0 && "Unknown FP comparison!");
+ default: DEBUG(N->dump(CurDAG)); llvm_unreachable("Unknown FP comparison!");
case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ:
Opc = Alpha::CMPTEQ; break;
case ISD::SETLT: case ISD::SETOLT: case ISD::SETULT:
@@ -356,48 +358,29 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
};
SDValue tmp1 = N->getOperand(rev?1:0);
SDValue tmp2 = N->getOperand(rev?0:1);
- SDNode *cmp = CurDAG->getTargetNode(Opc, dl, MVT::f64, tmp1, tmp2);
+ SDNode *cmp = CurDAG->getMachineNode(Opc, dl, MVT::f64, tmp1, tmp2);
if (inv)
- cmp = CurDAG->getTargetNode(Alpha::CMPTEQ, dl,
- MVT::f64, SDValue(cmp, 0),
- CurDAG->getRegister(Alpha::F31, MVT::f64));
+ cmp = CurDAG->getMachineNode(Alpha::CMPTEQ, dl,
+ MVT::f64, SDValue(cmp, 0),
+ CurDAG->getRegister(Alpha::F31, MVT::f64));
switch(CC) {
case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE:
case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE:
{
- SDNode* cmp2 = CurDAG->getTargetNode(Alpha::CMPTUN, dl, MVT::f64,
- tmp1, tmp2);
- cmp = CurDAG->getTargetNode(Alpha::ADDT, dl, MVT::f64,
- SDValue(cmp2, 0), SDValue(cmp, 0));
+ SDNode* cmp2 = CurDAG->getMachineNode(Alpha::CMPTUN, dl, MVT::f64,
+ tmp1, tmp2);
+ cmp = CurDAG->getMachineNode(Alpha::ADDT, dl, MVT::f64,
+ SDValue(cmp2, 0), SDValue(cmp, 0));
break;
}
default: break;
}
- SDNode* LD = CurDAG->getTargetNode(Alpha::FTOIT, dl,
- MVT::i64, SDValue(cmp, 0));
- return CurDAG->getTargetNode(Alpha::CMPULT, dl, MVT::i64,
- CurDAG->getRegister(Alpha::R31, MVT::i64),
- SDValue(LD,0));
- }
- break;
-
- case ISD::SELECT:
- if (N->getValueType(0).isFloatingPoint() &&
- (N->getOperand(0).getOpcode() != ISD::SETCC ||
- !N->getOperand(0).getOperand(1).getValueType().isFloatingPoint())) {
- //This should be the condition not covered by the Patterns
- //FIXME: Don't have SelectCode die, but rather return something testable
- // so that things like this can be caught in fall though code
- //move int to fp
- bool isDouble = N->getValueType(0) == MVT::f64;
- SDValue cond = N->getOperand(0);
- SDValue TV = N->getOperand(1);
- SDValue FV = N->getOperand(2);
-
- SDNode* LD = CurDAG->getTargetNode(Alpha::ITOFT, dl, MVT::f64, cond);
- return CurDAG->getTargetNode(isDouble?Alpha::FCMOVNET:Alpha::FCMOVNES,
- dl, MVT::f64, FV, TV, SDValue(LD,0));
+ SDNode* LD = CurDAG->getMachineNode(Alpha::FTOIT, dl,
+ MVT::i64, SDValue(cmp, 0));
+ return CurDAG->getMachineNode(Alpha::CMPULT, dl, MVT::i64,
+ CurDAG->getRegister(Alpha::R31, MVT::i64),
+ SDValue(LD,0));
}
break;
@@ -422,11 +405,11 @@ SDNode *AlphaDAGToDAGISel::Select(SDValue Op) {
if (get_zapImm(mask)) {
SDValue Z =
- SDValue(CurDAG->getTargetNode(Alpha::ZAPNOTi, dl, MVT::i64,
- N->getOperand(0).getOperand(0),
- getI64Imm(get_zapImm(mask))), 0);
- return CurDAG->getTargetNode(Alpha::SRLr, dl, MVT::i64, Z,
- getI64Imm(sval));
+ SDValue(CurDAG->getMachineNode(Alpha::ZAPNOTi, dl, MVT::i64,
+ N->getOperand(0).getOperand(0),
+ getI64Imm(get_zapImm(mask))), 0);
+ return CurDAG->getMachineNode(Alpha::SRLr, dl, MVT::i64, Z,
+ getI64Imm(sval));
}
}
break;
@@ -443,95 +426,26 @@ void AlphaDAGToDAGISel::SelectCALL(SDValue Op) {
SDNode *N = Op.getNode();
SDValue Chain = N->getOperand(0);
SDValue Addr = N->getOperand(1);
- SDValue InFlag(0,0); // Null incoming flag value.
+ SDValue InFlag = N->getOperand(N->getNumOperands() - 1);
DebugLoc dl = N->getDebugLoc();
- std::vector<SDValue> CallOperands;
- std::vector<MVT> TypeOperands;
-
- //grab the arguments
- for(int i = 2, e = N->getNumOperands(); i < e; ++i) {
- TypeOperands.push_back(N->getOperand(i).getValueType());
- CallOperands.push_back(N->getOperand(i));
- }
- int count = N->getNumOperands() - 2;
-
- static const unsigned args_int[] = {Alpha::R16, Alpha::R17, Alpha::R18,
- Alpha::R19, Alpha::R20, Alpha::R21};
- static const unsigned args_float[] = {Alpha::F16, Alpha::F17, Alpha::F18,
- Alpha::F19, Alpha::F20, Alpha::F21};
-
- for (int i = 6; i < count; ++i) {
- unsigned Opc = Alpha::WTF;
- if (TypeOperands[i].isInteger()) {
- Opc = Alpha::STQ;
- } else if (TypeOperands[i] == MVT::f32) {
- Opc = Alpha::STS;
- } else if (TypeOperands[i] == MVT::f64) {
- Opc = Alpha::STT;
- } else
- assert(0 && "Unknown operand");
-
- SDValue Ops[] = { CallOperands[i], getI64Imm((i - 6) * 8),
- CurDAG->getCopyFromReg(Chain, dl, Alpha::R30, MVT::i64),
- Chain };
- Chain = SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 4), 0);
- }
- for (int i = 0; i < std::min(6, count); ++i) {
- if (TypeOperands[i].isInteger()) {
- Chain = CurDAG->getCopyToReg(Chain, dl, args_int[i],
- CallOperands[i], InFlag);
- InFlag = Chain.getValue(1);
- } else if (TypeOperands[i] == MVT::f32 || TypeOperands[i] == MVT::f64) {
- Chain = CurDAG->getCopyToReg(Chain, dl, args_float[i],
- CallOperands[i], InFlag);
- InFlag = Chain.getValue(1);
- } else
- assert(0 && "Unknown operand");
- }
-
- // Finally, once everything is in registers to pass to the call, emit the
- // call itself.
if (Addr.getOpcode() == AlphaISD::GPRelLo) {
SDValue GOT = SDValue(getGlobalBaseReg(), 0);
Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R29, GOT, InFlag);
InFlag = Chain.getValue(1);
- Chain = SDValue(CurDAG->getTargetNode(Alpha::BSR, dl, MVT::Other,
- MVT::Flag, Addr.getOperand(0),
- Chain, InFlag), 0);
+ Chain = SDValue(CurDAG->getMachineNode(Alpha::BSR, dl, MVT::Other,
+ MVT::Flag, Addr.getOperand(0),
+ Chain, InFlag), 0);
} else {
Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, Addr, InFlag);
InFlag = Chain.getValue(1);
- Chain = SDValue(CurDAG->getTargetNode(Alpha::JSR, dl, MVT::Other,
- MVT::Flag, Chain, InFlag), 0);
+ Chain = SDValue(CurDAG->getMachineNode(Alpha::JSR, dl, MVT::Other,
+ MVT::Flag, Chain, InFlag), 0);
}
InFlag = Chain.getValue(1);
- std::vector<SDValue> CallResults;
-
- switch (N->getValueType(0).getSimpleVT()) {
- default: assert(0 && "Unexpected ret value!");
- case MVT::Other: break;
- case MVT::i64:
- Chain = CurDAG->getCopyFromReg(Chain, dl,
- Alpha::R0, MVT::i64, InFlag).getValue(1);
- CallResults.push_back(Chain.getValue(0));
- break;
- case MVT::f32:
- Chain = CurDAG->getCopyFromReg(Chain, dl,
- Alpha::F0, MVT::f32, InFlag).getValue(1);
- CallResults.push_back(Chain.getValue(0));
- break;
- case MVT::f64:
- Chain = CurDAG->getCopyFromReg(Chain, dl,
- Alpha::F0, MVT::f64, InFlag).getValue(1);
- CallResults.push_back(Chain.getValue(0));
- break;
- }
-
- CallResults.push_back(Chain);
- for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
- ReplaceUses(Op.getValue(i), CallResults[i]);
+ ReplaceUses(Op.getValue(0), Chain);
+ ReplaceUses(Op.getValue(1), InFlag);
}
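
With the argument and result copies moved into LowerCall/LowerCallResult, SelectCALL shrinks to picking the branch form: GP-relative callees get the GOT pointer in R29 and a BSR, everything else goes through R27 and a JSR. The remaining decision is essentially one line; a sketch under that reading (selectAlphaCallOpc is illustrative, not in the patch):

    // Sketch: the only choice left to SelectCALL after this patch.
    static unsigned selectAlphaCallOpc(bool IsGPRelDirect) {
      return IsGPRelDirect ? Alpha::BSR   // direct call, GOT in R29
                           : Alpha::JSR;  // indirect call through R27
    }
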
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index fa0b656..b3f865c 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -13,17 +13,22 @@
#include "AlphaISelLowering.h"
#include "AlphaTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
/// AddLiveIn - This helper function adds the specified physical register to the
@@ -37,14 +42,15 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
return VReg;
}
-AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM) {
+AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
+ : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
// Set up the TargetLowering object.
//I am having problems with shr n i8 1
setShiftAmountType(MVT::i64);
setBooleanContents(ZeroOrOneBooleanContent);
-
+
setUsesGlobalOffsetTable(true);
-
+
addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
addRegisterClass(MVT::f32, Alpha::F4RCRegisterClass);
@@ -54,24 +60,26 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-
+
setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
-
+
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
// setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::Other, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
-
+
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
@@ -85,7 +93,7 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
setOperationAction(ISD::BSWAP , MVT::i64, Expand);
setOperationAction(ISD::ROTL , MVT::i64, Expand);
setOperationAction(ISD::ROTR , MVT::i64, Expand);
-
+
setOperationAction(ISD::SREM , MVT::i64, Custom);
setOperationAction(ISD::UREM , MVT::i64, Custom);
setOperationAction(ISD::SDIV , MVT::i64, Custom);
@@ -99,6 +107,9 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
// We don't support sin/cos/sqrt/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
@@ -123,7 +134,7 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
// Not implemented yet.
- setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
@@ -141,8 +152,6 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::i32, Custom);
- setOperationAction(ISD::RET, MVT::Other, Custom);
-
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
@@ -159,7 +168,7 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
computeRegisterProperties();
}
-MVT AlphaTargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType AlphaTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i64;
}
@@ -187,13 +196,13 @@ unsigned AlphaTargetLowering::getFunctionAlignment(const Function *F) const {
}
static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
SDValue Zero = DAG.getConstant(0, PtrVT);
// FIXME there isn't really any debug info here
DebugLoc dl = Op.getDebugLoc();
-
+
SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, JTI,
DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, JTI, Hi);
@@ -219,43 +228,205 @@ static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
// //#define GP $29
// //#define SP $30
-static SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG,
- int &VarArgsBase,
- int &VarArgsOffset) {
+#include "AlphaGenCallingConv.inc"
+
+SDValue
+AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_Alpha);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+ getPointerTy(), true));
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ SDValue Arg = Outs[i].Val;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ // Arguments that can be passed in a register are kept in the RegsToPass
+ // vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, Alpha::R30, MVT::i64);
+
+ SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), 0));
+ }
+ }
+
+ // Transform all store nodes into a single token factor node, since the
+ // stores are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(AlphaISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, getPointerTy(), true),
+ DAG.getConstant(0, getPointerTy(), true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+AlphaTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Alpha);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ VA.getLocVT(), InFlag).getValue(1);
+ SDValue RetValue = Chain.getValue(0);
+ InFlag = Chain.getValue(2);
+
+ // If this is an 8/16/32-bit value, it is really passed promoted to 64
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ RetValue = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), RetValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ RetValue = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), RetValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ RetValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), RetValue);
+
+ InVals.push_back(RetValue);
+ }
+
+ return Chain;
+}
+
+SDValue
+AlphaTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- std::vector<SDValue> ArgValues;
- SDValue Root = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
unsigned args_int[] = {
Alpha::R16, Alpha::R17, Alpha::R18, Alpha::R19, Alpha::R20, Alpha::R21};
unsigned args_float[] = {
Alpha::F16, Alpha::F17, Alpha::F18, Alpha::F19, Alpha::F20, Alpha::F21};
-
- for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues()-1; ArgNo != e; ++ArgNo) {
+
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue argt;
- MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ EVT ObjectVT = Ins[ArgNo].VT;
SDValue ArgVal;
if (ArgNo < 6) {
- switch (ObjectVT.getSimpleVT()) {
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
default:
assert(false && "Invalid value type!");
case MVT::f64:
- args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+ args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
&Alpha::F8RCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, args_float[ArgNo], ObjectVT);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, args_float[ArgNo], ObjectVT);
break;
case MVT::f32:
- args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+ args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
&Alpha::F4RCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, args_float[ArgNo], ObjectVT);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, args_float[ArgNo], ObjectVT);
break;
case MVT::i64:
- args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo],
+ args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo],
&Alpha::GPRCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, args_int[ArgNo], MVT::i64);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, args_int[ArgNo], MVT::i64);
break;
}
} else { // Remaining arguments are passed on the stack.
@@ -265,60 +436,58 @@ static SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG,
// Create the SelectionDAG nodes corresponding to a load
// from this parameter.
SDValue FIN = DAG.getFrameIndex(FI, MVT::i64);
- ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
}
- ArgValues.push_back(ArgVal);
+ InVals.push_back(ArgVal);
}
// If the function takes a variable number of arguments, copy all regs to the stack
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
if (isVarArg) {
- VarArgsOffset = (Op.getNode()->getNumValues()-1) * 8;
+ VarArgsOffset = Ins.size() * 8;
std::vector<SDValue> LS;
for (int i = 0; i < 6; ++i) {
if (TargetRegisterInfo::isPhysicalRegister(args_int[i]))
args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass);
- SDValue argt = DAG.getCopyFromReg(Root, dl, args_int[i], MVT::i64);
+ SDValue argt = DAG.getCopyFromReg(Chain, dl, args_int[i], MVT::i64);
int FI = MFI->CreateFixedObject(8, -8 * (6 - i));
if (i == 0) VarArgsBase = FI;
SDValue SDFI = DAG.getFrameIndex(FI, MVT::i64);
- LS.push_back(DAG.getStore(Root, dl, argt, SDFI, NULL, 0));
+ LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0));
if (TargetRegisterInfo::isPhysicalRegister(args_float[i]))
args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass);
- argt = DAG.getCopyFromReg(Root, dl, args_float[i], MVT::f64);
+ argt = DAG.getCopyFromReg(Chain, dl, args_float[i], MVT::f64);
FI = MFI->CreateFixedObject(8, - 8 * (12 - i));
SDFI = DAG.getFrameIndex(FI, MVT::i64);
- LS.push_back(DAG.getStore(Root, dl, argt, SDFI, NULL, 0));
+ LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0));
}
//Set up a token factor with all the stack traffic
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LS[0], LS.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LS[0], LS.size());
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size());
+ return Chain;
}
-static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
- SDValue Copy = DAG.getCopyToReg(Op.getOperand(0), dl, Alpha::R26,
- DAG.getNode(AlphaISD::GlobalRetAddr,
- DebugLoc::getUnknownLoc(),
- MVT::i64),
- SDValue());
- switch (Op.getNumOperands()) {
+SDValue
+AlphaTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
+ SDValue Copy = DAG.getCopyToReg(Chain, dl, Alpha::R26,
+ DAG.getNode(AlphaISD::GlobalRetAddr,
+ DebugLoc::getUnknownLoc(),
+ MVT::i64),
+ SDValue());
+ switch (Outs.size()) {
default:
- assert(0 && "Do not know how to return this many arguments!");
- abort();
- case 1:
+ llvm_unreachable("Do not know how to return this many arguments!");
+ case 0:
break;
//return SDValue(); // ret void is legal
- case 3: {
- MVT ArgVT = Op.getOperand(1).getValueType();
+ case 1: {
+ EVT ArgVT = Outs[0].Val.getValueType();
unsigned ArgReg;
if (ArgVT.isInteger())
ArgReg = Alpha::R0;
@@ -326,14 +495,14 @@ static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
assert(ArgVT.isFloatingPoint());
ArgReg = Alpha::F0;
}
- Copy = DAG.getCopyToReg(Copy, dl, ArgReg,
- Op.getOperand(1), Copy.getValue(1));
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg,
+ Outs[0].Val, Copy.getValue(1));
if (DAG.getMachineFunction().getRegInfo().liveout_empty())
DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg);
break;
}
- case 5: {
- MVT ArgVT = Op.getOperand(1).getValueType();
+ case 2: {
+ EVT ArgVT = Outs[0].Val.getValueType();
unsigned ArgReg1, ArgReg2;
if (ArgVT.isInteger()) {
ArgReg1 = Alpha::R0;
@@ -343,104 +512,25 @@ static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
ArgReg1 = Alpha::F0;
ArgReg2 = Alpha::F1;
}
- Copy = DAG.getCopyToReg(Copy, dl, ArgReg1,
- Op.getOperand(1), Copy.getValue(1));
- if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg1,
+ Outs[0].Val, Copy.getValue(1));
+ if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg1)
== DAG.getMachineFunction().getRegInfo().liveout_end())
DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg1);
- Copy = DAG.getCopyToReg(Copy, dl, ArgReg2,
- Op.getOperand(3), Copy.getValue(1));
- if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+ Copy = DAG.getCopyToReg(Copy, dl, ArgReg2,
+ Outs[1].Val, Copy.getValue(1));
+ if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg2)
== DAG.getMachineFunction().getRegInfo().liveout_end())
DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg2);
break;
}
}
- return DAG.getNode(AlphaISD::RET_FLAG, dl,
+ return DAG.getNode(AlphaISD::RET_FLAG, dl,
MVT::Other, Copy, Copy.getValue(1));
}
-std::pair<SDValue, SDValue>
-AlphaTargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
- bool RetSExt, bool RetZExt, bool isVarArg,
- bool isInreg, unsigned NumFixedArgs,
- unsigned CallingConv,
- bool isTailCall, SDValue Callee,
- ArgListTy &Args, SelectionDAG &DAG,
- DebugLoc dl) {
- int NumBytes = 0;
- if (Args.size() > 6)
- NumBytes = (Args.size() - 6) * 8;
-
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
- std::vector<SDValue> args_to_use;
- for (unsigned i = 0, e = Args.size(); i != e; ++i)
- {
- switch (getValueType(Args[i].Ty).getSimpleVT()) {
- default: assert(0 && "Unexpected ValueType for argument!");
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- // Promote the integer to 64 bits. If the input type is signed use a
- // sign extend, otherwise use a zero extend.
- if (Args[i].isSExt)
- Args[i].Node = DAG.getNode(ISD::SIGN_EXTEND, dl,
- MVT::i64, Args[i].Node);
- else if (Args[i].isZExt)
- Args[i].Node = DAG.getNode(ISD::ZERO_EXTEND, dl,
- MVT::i64, Args[i].Node);
- else
- Args[i].Node = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Args[i].Node);
- break;
- case MVT::i64:
- case MVT::f64:
- case MVT::f32:
- break;
- }
- args_to_use.push_back(Args[i].Node);
- }
-
- std::vector<MVT> RetVals;
- MVT RetTyVT = getValueType(RetTy);
- MVT ActualRetTyVT = RetTyVT;
- if (RetTyVT.getSimpleVT() >= MVT::i1 && RetTyVT.getSimpleVT() <= MVT::i32)
- ActualRetTyVT = MVT::i64;
-
- if (RetTyVT != MVT::isVoid)
- RetVals.push_back(ActualRetTyVT);
- RetVals.push_back(MVT::Other);
-
- std::vector<SDValue> Ops;
- Ops.push_back(Chain);
- Ops.push_back(Callee);
- Ops.insert(Ops.end(), args_to_use.begin(), args_to_use.end());
- SDValue TheCall = DAG.getNode(AlphaISD::CALL, dl,
- RetVals, &Ops[0], Ops.size());
- Chain = TheCall.getValue(RetTyVT != MVT::isVoid);
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), SDValue());
- SDValue RetVal = TheCall;
-
- if (RetTyVT != ActualRetTyVT) {
- ISD::NodeType AssertKind = ISD::DELETED_NODE;
- if (RetSExt)
- AssertKind = ISD::AssertSext;
- else if (RetZExt)
- AssertKind = ISD::AssertZext;
-
- if (AssertKind != ISD::DELETED_NODE)
- RetVal = DAG.getNode(AssertKind, dl, MVT::i64, RetVal,
- DAG.getValueType(RetTyVT));
-
- RetVal = DAG.getNode(ISD::TRUNCATE, dl, RetTyVT, RetVal);
- }
-
- return std::make_pair(RetVal, Chain);
-}
-
void AlphaTargetLowering::LowerVAARG(SDNode *N, SDValue &Chain,
SDValue &DataPtr, SelectionDAG &DAG) {
Chain = N->getOperand(0);
@@ -475,12 +565,7 @@ void AlphaTargetLowering::LowerVAARG(SDNode *N, SDValue &Chain,
SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
switch (Op.getOpcode()) {
- default: assert(0 && "Wasn't expecting to be able to lower this!");
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG,
- VarArgsBase,
- VarArgsOffset);
-
- case ISD::RET: return LowerRET(Op,DAG);
+ default: llvm_unreachable("Wasn't expecting to be able to lower this!");
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: {
@@ -488,11 +573,40 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (IntNo) {
default: break; // Don't custom lower most intrinsics.
case Intrinsic::alpha_umulh:
- return DAG.getNode(ISD::MULHU, dl, MVT::i64,
+ return DAG.getNode(ISD::MULHU, dl, MVT::i64,
Op.getOperand(1), Op.getOperand(2));
}
}
+ case ISD::SRL_PARTS: {
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue bm = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(64, MVT::i64), ShAmt);
+ SDValue BMCC = DAG.getSetCC(dl, MVT::i64, bm,
+ DAG.getConstant(0, MVT::i64), ISD::SETLE);
+  // Case where 64 - ShAmt <= 0, i.e. the shift amount is 64 or more.
+ SDValue Hi_Neg = DAG.getConstant(0, MVT::i64);
+ SDValue ShAmt_Neg = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(0, MVT::i64), bm);
+ SDValue Lo_Neg = DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpHi, ShAmt_Neg);
+  // Case where the shift amount is less than 64.
+ SDValue carries = DAG.getNode(ISD::SHL, dl, MVT::i64, ShOpHi, bm);
+ SDValue Hi_Pos = DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpHi, ShAmt);
+ SDValue Lo_Pos = DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpLo, ShAmt);
+ Lo_Pos = DAG.getNode(ISD::OR, dl, MVT::i64, Lo_Pos, carries);
+ // Merge
+ SDValue Hi = DAG.getNode(ISD::SELECT, dl, MVT::i64, BMCC, Hi_Neg, Hi_Pos);
+ SDValue Lo = DAG.getNode(ISD::SELECT, dl, MVT::i64, BMCC, Lo_Neg, Lo_Pos);
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+  // FIXME: ISD::SRA_PARTS and ISD::SHL_PARTS could be custom lowered
+  // along the same lines.
+
case ISD::SINT_TO_FP: {
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"Unhandled SINT_TO_FP type in custom expander!");
@@ -509,7 +623,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
if (!isDouble) //Promote
src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, src);
-
+
src = DAG.getNode(AlphaISD::CVTTQ_, dl, MVT::f64, src);
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, src);
@@ -519,14 +633,14 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
Constant *C = CP->getConstVal();
SDValue CPI = DAG.getTargetConstantPool(C, MVT::i64, CP->getAlignment());
// FIXME there isn't really any debug info here
-
+
SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, CPI,
DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, CPI, Hi);
return Lo;
}
case ISD::GlobalTLSAddress:
- assert(0 && "TLS not implemented for Alpha.");
+ llvm_unreachable("TLS not implemented for Alpha.");
case ISD::GlobalAddress: {
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
GlobalValue *GV = GSDN->getGlobal();
@@ -540,11 +654,11 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, GA, Hi);
return Lo;
} else
- return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64, GA,
+ return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64, GA,
DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
}
case ISD::ExternalSymbol: {
- return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64,
+ return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64,
DAG.getTargetExternalSymbol(cast<ExternalSymbolSDNode>(Op)
->getSymbol(), MVT::i64),
DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
@@ -554,7 +668,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::SREM:
// Expand only in the constant case.
if (Op.getOperand(1).getOpcode() == ISD::Constant) {
- MVT VT = Op.getNode()->getValueType(0);
+ EVT VT = Op.getNode()->getValueType(0);
SDValue Tmp1 = Op.getNode()->getOpcode() == ISD::UREM ?
BuildUDIV(Op.getNode(), DAG, NULL) :
BuildSDIV(Op.getNode(), DAG, NULL);
@@ -567,7 +681,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::UDIV:
if (Op.getValueType().isInteger()) {
if (Op.getOperand(1).getOpcode() == ISD::Constant)
- return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.getNode(), DAG, NULL)
+ return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.getNode(), DAG, NULL)
: BuildUDIV(Op.getNode(), DAG, NULL);
const char* opstr = 0;
switch (Op.getOpcode()) {
@@ -601,12 +715,12 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
SDValue SrcP = Op.getOperand(2);
const Value *DestS = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcS = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-
+
SDValue Val = DAG.getLoad(getPointerTy(), dl, Chain, SrcP, SrcS, 0);
SDValue Result = DAG.getStore(Val.getValue(1), dl, Val, DestP, DestS, 0);
- SDValue NP = DAG.getNode(ISD::ADD, dl, MVT::i64, SrcP,
+ SDValue NP = DAG.getNode(ISD::ADD, dl, MVT::i64, SrcP,
DAG.getConstant(8, MVT::i64));
- Val = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Result,
+ Val = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Result,
NP, NULL,0, MVT::i32);
SDValue NPD = DAG.getNode(ISD::ADD, dl, MVT::i64, DestP,
DAG.getConstant(8, MVT::i64));
@@ -616,7 +730,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
SDValue Chain = Op.getOperand(0);
SDValue VAListP = Op.getOperand(1);
const Value *VAListS = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-
+
// vastart stores the address of the VarArgsBase and VarArgsOffset
SDValue FR = DAG.getFrameIndex(VarArgsBase, MVT::i64);
SDValue S1 = DAG.getStore(Chain, dl, FR, VAListP, VAListS, 0);
@@ -625,13 +739,13 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
return DAG.getTruncStore(S1, dl, DAG.getConstant(VarArgsOffset, MVT::i64),
SA2, NULL, 0, MVT::i32);
}
- case ISD::RETURNADDR:
+ case ISD::RETURNADDR:
return DAG.getNode(AlphaISD::GlobalRetAddr, DebugLoc::getUnknownLoc(),
MVT::i64);
//FIXME: implement
case ISD::FRAMEADDR: break;
}
-
+
return SDValue();
}
@@ -655,7 +769,7 @@ void AlphaTargetLowering::ReplaceNodeResults(SDNode *N,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
-AlphaTargetLowering::ConstraintType
+AlphaTargetLowering::ConstraintType
AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -670,37 +784,37 @@ AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
std::vector<unsigned> AlphaTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break; // Unknown constraint letter.
- case 'f':
+ case 'f':
return make_vector<unsigned>(Alpha::F0 , Alpha::F1 , Alpha::F2 ,
Alpha::F3 , Alpha::F4 , Alpha::F5 ,
- Alpha::F6 , Alpha::F7 , Alpha::F8 ,
- Alpha::F9 , Alpha::F10, Alpha::F11,
- Alpha::F12, Alpha::F13, Alpha::F14,
- Alpha::F15, Alpha::F16, Alpha::F17,
- Alpha::F18, Alpha::F19, Alpha::F20,
- Alpha::F21, Alpha::F22, Alpha::F23,
- Alpha::F24, Alpha::F25, Alpha::F26,
- Alpha::F27, Alpha::F28, Alpha::F29,
+ Alpha::F6 , Alpha::F7 , Alpha::F8 ,
+ Alpha::F9 , Alpha::F10, Alpha::F11,
+ Alpha::F12, Alpha::F13, Alpha::F14,
+ Alpha::F15, Alpha::F16, Alpha::F17,
+ Alpha::F18, Alpha::F19, Alpha::F20,
+ Alpha::F21, Alpha::F22, Alpha::F23,
+ Alpha::F24, Alpha::F25, Alpha::F26,
+ Alpha::F27, Alpha::F28, Alpha::F29,
Alpha::F30, Alpha::F31, 0);
- case 'r':
- return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 ,
- Alpha::R3 , Alpha::R4 , Alpha::R5 ,
- Alpha::R6 , Alpha::R7 , Alpha::R8 ,
- Alpha::R9 , Alpha::R10, Alpha::R11,
- Alpha::R12, Alpha::R13, Alpha::R14,
- Alpha::R15, Alpha::R16, Alpha::R17,
- Alpha::R18, Alpha::R19, Alpha::R20,
- Alpha::R21, Alpha::R22, Alpha::R23,
- Alpha::R24, Alpha::R25, Alpha::R26,
- Alpha::R27, Alpha::R28, Alpha::R29,
+ case 'r':
+ return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 ,
+ Alpha::R3 , Alpha::R4 , Alpha::R5 ,
+ Alpha::R6 , Alpha::R7 , Alpha::R8 ,
+ Alpha::R9 , Alpha::R10, Alpha::R11,
+ Alpha::R12, Alpha::R13, Alpha::R14,
+ Alpha::R15, Alpha::R16, Alpha::R17,
+ Alpha::R18, Alpha::R19, Alpha::R20,
+ Alpha::R21, Alpha::R22, Alpha::R23,
+ Alpha::R24, Alpha::R25, Alpha::R26,
+ Alpha::R27, Alpha::R28, Alpha::R29,
Alpha::R30, Alpha::R31, 0);
}
}
-
+
return std::vector<unsigned>();
}
//===----------------------------------------------------------------------===//
@@ -709,7 +823,8 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
MachineBasicBlock *
AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
assert((MI->getOpcode() == Alpha::CAS32 ||
MI->getOpcode() == Alpha::CAS64 ||
@@ -719,10 +834,10 @@ AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MI->getOpcode() == Alpha::SWAP64) &&
"Unexpected instr type to insert");
- bool is32 = MI->getOpcode() == Alpha::CAS32 ||
+ bool is32 = MI->getOpcode() == Alpha::CAS32 ||
MI->getOpcode() == Alpha::LAS32 ||
MI->getOpcode() == Alpha::SWAP32;
-
+
//Load-locked/store-conditional loops for atomic ops all take the same form:
//start:
//ll
@@ -734,30 +849,35 @@ AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
DebugLoc dl = MI->getDebugLoc();
MachineFunction::iterator It = BB;
++It;
-
+
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *llscMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ // Inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I)
+ EM->insert(std::make_pair(*I, sinkMBB));
+
sinkMBB->transferSuccessors(thisMBB);
F->insert(It, llscMBB);
F->insert(It, sinkMBB);
BuildMI(thisMBB, dl, TII->get(Alpha::BR)).addMBB(llscMBB);
-
+
unsigned reg_res = MI->getOperand(0).getReg(),
reg_ptr = MI->getOperand(1).getReg(),
reg_v2 = MI->getOperand(2).getReg(),
reg_store = F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
- BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::LDL_L : Alpha::LDQ_L),
+ BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::LDL_L : Alpha::LDQ_L),
reg_res).addImm(0).addReg(reg_ptr);
switch (MI->getOpcode()) {
case Alpha::CAS32:
case Alpha::CAS64: {
- unsigned reg_cmp
+ unsigned reg_cmp
= F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
BuildMI(llscMBB, dl, TII->get(Alpha::CMPEQ), reg_cmp)
.addReg(reg_v2).addReg(reg_res);
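
Note on the SRL_PARTS hunk above: the lowering splits a 128-bit logical right
shift into i64 operations, then uses the BMCC comparison to select between the
"shift amount >= 64" and "shift amount < 64" results. A minimal standalone
sketch of the same arithmetic (hypothetical checking code, not part of the
patch; assumes 0 < ShAmt < 128):

  #include <cassert>
  #include <cstdint>

  // Scalar model of the SRL_PARTS lowering: shift the 128-bit value Hi:Lo
  // right by ShAmt using only 64-bit operations.
  static void srl128(uint64_t Hi, uint64_t Lo, unsigned ShAmt,
                     uint64_t &OutHi, uint64_t &OutLo) {
    int bm = 64 - (int)ShAmt;           // "bm" in the DAG code
    if (bm <= 0) {                      // shift amount is 64 or more
      OutHi = 0;                        // Hi_Neg
      OutLo = Hi >> (unsigned)-bm;      // Lo_Neg, shifted by ShAmt_Neg = 0 - bm
    } else {                            // shift amount is less than 64
      uint64_t carries = Hi << bm;      // bits crossing into the low word
      OutHi = Hi >> ShAmt;              // Hi_Pos
      OutLo = (Lo >> ShAmt) | carries;  // Lo_Pos
    }
  }

  int main() {
    uint64_t Hi, Lo;
    srl128(0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL, 4, Hi, Lo);
    assert(Hi == 0x00123456789ABCDEULL && Lo == 0xFFEDCBA987654321ULL);
    srl128(0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL, 68, Hi, Lo);
    assert(Hi == 0 && Lo == 0x00123456789ABCDEULL);
    return 0;
  }
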
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index 4925367..b580c9d 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -62,12 +62,11 @@ namespace llvm {
class AlphaTargetLowering : public TargetLowering {
int VarArgsOffset; // What is the offset to the first vaarg
int VarArgsBase; // What is the base FrameIndex
- bool useITOF;
public:
explicit AlphaTargetLowering(TargetMachine &TM);
/// getSetCCResultType - Get the SETCC result ValueType
- virtual MVT getSetCCResultType(MVT VT) const;
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
/// LowerOperation - Provide custom lowering hooks for some operations.
///
@@ -82,24 +81,21 @@ namespace llvm {
// Friendly names for dumps
const char *getTargetNodeName(unsigned Opcode) const;
- /// LowerCallTo - This hook lowers an abstract call to a function into an
- /// actual call.
- virtual std::pair<SDValue, SDValue>
- LowerCallTo(SDValue Chain, const Type *RetTy, bool RetSExt, bool RetZExt,
- bool isVarArg, bool isInreg, unsigned NumFixedArgs, unsigned CC,
- bool isTailCall, SDValue Callee, ArgListTy &Args,
- SelectionDAG &DAG, DebugLoc dl);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
ConstraintType getConstraintType(const std::string &Constraint) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
-
- bool hasITOF() { return useITOF; }
+ EVT VT) const;
MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
@@ -111,6 +107,26 @@ namespace llvm {
void LowerVAARG(SDNode *N, SDValue &Chain, SDValue &DataPtr,
SelectionDAG &DAG);
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
};
}
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
index 76a594f..86173ff 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.cpp
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
AlphaInstrInfo::AlphaInstrInfo()
@@ -200,29 +201,7 @@ AlphaInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIdx).addReg(Alpha::F31);
else
- abort();
-}
-
-void AlphaInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Opc = 0;
- if (RC == Alpha::F4RCRegisterClass)
- Opc = Alpha::STS;
- else if (RC == Alpha::F8RCRegisterClass)
- Opc = Alpha::STT;
- else if (RC == Alpha::GPRCRegisterClass)
- Opc = Alpha::STQ;
- else
- abort();
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
+ llvm_unreachable("Unhandled register class");
}
void
@@ -245,28 +224,7 @@ AlphaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Alpha::LDQ), DestReg)
.addFrameIndex(FrameIdx).addReg(Alpha::F31);
else
- abort();
-}
-
-void AlphaInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Opc = 0;
- if (RC == Alpha::F4RCRegisterClass)
- Opc = Alpha::LDS;
- else if (RC == Alpha::F8RCRegisterClass)
- Opc = Alpha::LDT;
- else if (RC == Alpha::GPRCRegisterClass)
- Opc = Alpha::LDQ;
- else
- abort();
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB =
- BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
+ llvm_unreachable("Unhandled register class");
}
MachineInstr *AlphaInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -331,7 +289,7 @@ static unsigned AlphaRevCondCode(unsigned Opcode) {
case Alpha::FBLE: return Alpha::FBGT;
case Alpha::FBLT: return Alpha::FBGE;
default:
- assert(0 && "Unknown opcode");
+ llvm_unreachable("Unknown opcode");
}
return 0; // Not reached
}
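
Note: the deleted storeRegToAddr/loadRegFromAddr helpers used the same
register-class-to-opcode mapping that the surviving stack-slot spill code
keeps: F4RC uses STS/LDS, F8RC uses STT/LDT, GPRC uses STQ/LDQ. A standalone
sketch of that mapping, with plain enums standing in for the LLVM types
(illustration only):

  // Register classes and opcodes modeled as plain enums.
  enum RegClass { F4RC, F8RC, GPRC };
  enum Opcode   { STS, STT, STQ, LDS, LDT, LDQ };

  // Store opcode per class: single float, double float, 64-bit integer.
  static Opcode getStoreOpcode(RegClass RC) {
    switch (RC) {
    case F4RC: return STS;
    case F8RC: return STT;
    default:   return STQ; // GPRC
    }
  }

  // Matching load opcodes.
  static Opcode getLoadOpcode(RegClass RC) {
    switch (RC) {
    case F4RC: return LDS;
    case F8RC: return LDT;
    default:   return LDQ; // GPRC
    }
  }
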
diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h
index ea09885..274f452 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.h
+++ b/lib/Target/Alpha/AlphaInstrInfo.h
@@ -54,20 +54,10 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index e73bdf9..3b98206 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -702,7 +702,7 @@ def FCMOVNET : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
//misc FP selects
//Select double
-
+
def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
(FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
@@ -791,12 +791,14 @@ def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F4RC:$RA), Fb = 31 in
-def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",[], s_ftoi>; //Floating to integer move, S_floating
+def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",
+ [(set GPRC:$RC, (bitconvert F4RC:$RA))], s_ftoi>; //Floating to integer move, S_floating
let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F8RC:$RA), Fb = 31 in
def FTOIT : FPForm<0x1C, 0x070, "ftoit $RA,$RC",
[(set GPRC:$RC, (bitconvert F8RC:$RA))], s_ftoi>; //Floating to integer move
let OutOperandList = (ops F4RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in
-def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",[], s_itof>; //Integer to floating move, S_floating
+def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",
+ [(set F4RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move, S_floating
let OutOperandList = (ops F8RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in
def ITOFT : FPForm<0x14, 0x024, "itoft $RA,$RC",
[(set F8RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move
@@ -818,6 +820,10 @@ let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in
def CVTTS : FPForm<0x16, 0x7AC, "cvtts/sui $RB,$RC",
[(set F4RC:$RC, (fround F8RC:$RB))], s_fadd>;
+def : Pat<(select GPRC:$RC, F8RC:$st, F8RC:$sf),
+ (f64 (FCMOVEQT F8RC:$st, F8RC:$sf, (ITOFT GPRC:$RC)))>;
+def : Pat<(select GPRC:$RC, F4RC:$st, F4RC:$sf),
+ (f32 (FCMOVEQS F4RC:$st, F4RC:$sf, (ITOFT GPRC:$RC)))>;
/////////////////////////////////////////////////////////
//Branching
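
Note on the new select patterns: ITOFT/ITOFS are bit-for-bit moves from an
integer register to an FP register, so an i64 select condition can be moved
into the FP register file and resolved with an FP conditional move instead of
a branch. A scalar sketch of what the FCMOVEQT pattern computes (illustration
only; assumes the condition is 0 or 1, as Alpha setcc results are):

  #include <cstdint>
  #include <cstring>

  static double selectViaFcmov(uint64_t Cond, double TrueVal, double FalseVal) {
    double CondFP;
    std::memcpy(&CondFP, &Cond, sizeof CondFP); // ITOFT: raw bit move
    return CondFP == 0.0 ? FalseVal : TrueVal;  // FCMOVEQ keeps the false value
  }
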
diff --git a/lib/Target/Alpha/AlphaJITInfo.cpp b/lib/Target/Alpha/AlphaJITInfo.cpp
index ba7478e..d328135 100644
--- a/lib/Target/Alpha/AlphaJITInfo.cpp
+++ b/lib/Target/Alpha/AlphaJITInfo.cpp
@@ -16,8 +16,9 @@
#include "AlphaRelocations.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Config/alloca.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
using namespace llvm;
@@ -57,12 +58,12 @@ static void EmitBranchToAt(void *At, void *To) {
AtI[0] = BUILD_OR(0, 27, 27);
- DOUT << "Stub targeting " << To << "\n";
+ DEBUG(errs() << "Stub targeting " << To << "\n");
for (int x = 1; x <= 8; ++x) {
AtI[2*x - 1] = BUILD_SLLi(27,27,8);
unsigned d = (Fn >> (64 - 8 * x)) & 0x00FF;
- //DOUT << "outputing " << hex << d << dec << "\n";
+      //DEBUG(errs() << "outputting " << hex << d << dec << "\n");
AtI[2*x] = BUILD_ORi(27, 27, d);
}
AtI[17] = BUILD_JMP(31,27,0); //jump, preserving ra, and setting pv
@@ -71,7 +72,7 @@ static void EmitBranchToAt(void *At, void *To) {
void AlphaJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
//FIXME
- assert(0);
+  llvm_unreachable("replaceMachineCodeForFunction not implemented");
}
static TargetJITInfo::JITCompilerFn JITCompilerFunction;
@@ -86,12 +87,12 @@ extern "C" {
//rewrite the stub to an unconditional branch
if (((unsigned*)CameFromStub)[18] == 0x00FFFFFF) {
- DOUT << "Came from a stub, rewriting\n";
+ DEBUG(errs() << "Came from a stub, rewriting\n");
EmitBranchToAt(CameFromStub, Target);
} else {
- DOUT << "confused, didn't come from stub at " << CameFromStub
- << " old jump vector " << oldpv
- << " new jump vector " << Target << "\n";
+ DEBUG(errs() << "confused, didn't come from stub at " << CameFromStub
+ << " old jump vector " << oldpv
+ << " new jump vector " << Target << "\n");
}
//Change pv to new Target
@@ -184,8 +185,7 @@ extern "C" {
);
#else
void AlphaCompilationCallback() {
- cerr << "Cannot call AlphaCompilationCallback() on a non-Alpha arch!\n";
- abort();
+ llvm_unreachable("Cannot call AlphaCompilationCallback() on a non-Alpha arch!");
}
#endif
}
@@ -199,7 +199,7 @@ void *AlphaJITInfo::emitFunctionStub(const Function* F, void *Fn,
for (int x = 0; x < 19; ++ x)
JCE.emitWordLE(0);
EmitBranchToAt(Addr, Fn);
- DOUT << "Emitting Stub to " << Fn << " at [" << Addr << "]\n";
+ DEBUG(errs() << "Emitting Stub to " << Fn << " at [" << Addr << "]\n");
return JCE.finishGVStub(F);
}
@@ -241,34 +241,34 @@ void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
long idx = 0;
bool doCommon = true;
switch ((Alpha::RelocationType)MR->getRelocationType()) {
- default: assert(0 && "Unknown relocation type!");
+ default: llvm_unreachable("Unknown relocation type!");
case Alpha::reloc_literal:
//This is a LDQl
idx = MR->getGOTIndex();
- DOUT << "Literal relocation to slot " << idx;
+ DEBUG(errs() << "Literal relocation to slot " << idx);
idx = (idx - GOToffset) * 8;
- DOUT << " offset " << idx << "\n";
+ DEBUG(errs() << " offset " << idx << "\n");
break;
case Alpha::reloc_gprellow:
idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
idx = getLower16(idx);
- DOUT << "gprellow relocation offset " << idx << "\n";
- DOUT << " Pointer is " << (void*)MR->getResultPointer()
- << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+ DEBUG(errs() << "gprellow relocation offset " << idx << "\n");
+ DEBUG(errs() << " Pointer is " << (void*)MR->getResultPointer()
+ << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n");
break;
case Alpha::reloc_gprelhigh:
idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
idx = getUpper16(idx);
- DOUT << "gprelhigh relocation offset " << idx << "\n";
- DOUT << " Pointer is " << (void*)MR->getResultPointer()
- << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+ DEBUG(errs() << "gprelhigh relocation offset " << idx << "\n");
+ DEBUG(errs() << " Pointer is " << (void*)MR->getResultPointer()
+ << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n");
break;
case Alpha::reloc_gpdist:
switch (*RelocPos >> 26) {
case 0x09: //LDAH
idx = &GOTBase[GOToffset * 8] - (unsigned char*)RelocPos;
idx = getUpper16(idx);
- DOUT << "LDAH: " << idx << "\n";
+ DEBUG(errs() << "LDAH: " << idx << "\n");
//add the relocation to the map
gpdistmap[std::make_pair(Function, MR->getConstantVal())] = RelocPos;
break;
@@ -278,10 +278,10 @@ void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
idx = &GOTBase[GOToffset * 8] -
(unsigned char*)gpdistmap[std::make_pair(Function, MR->getConstantVal())];
idx = getLower16(idx);
- DOUT << "LDA: " << idx << "\n";
+ DEBUG(errs() << "LDA: " << idx << "\n");
break;
default:
- assert(0 && "Cannot handle gpdist yet");
+ llvm_unreachable("Cannot handle gpdist yet");
}
break;
case Alpha::reloc_bsr: {
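
Note on EmitBranchToAt: the stub materializes the 64-bit target address into
the procedure value register one byte at a time with eight SLL/OR pairs; any
previous register contents are shifted out completely over the eight rounds.
A standalone check of that arithmetic (hypothetical address; illustration
only):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Fn = 0x00000120004DF6A0ULL;  // example target address
    uint64_t Reg = 0xDEADBEEFDEADBEEFULL; // initial contents do not matter
    for (int x = 1; x <= 8; ++x) {
      Reg <<= 8;                            // BUILD_SLLi(27, 27, 8)
      Reg |= (Fn >> (64 - 8 * x)) & 0x00FF; // BUILD_ORi(27, 27, d)
    }
    assert(Reg == Fn);
    return 0;
  }
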
diff --git a/lib/Target/Alpha/AlphaMCAsmInfo.cpp b/lib/Target/Alpha/AlphaMCAsmInfo.cpp
new file mode 100644
index 0000000..b652a53
--- /dev/null
+++ b/lib/Target/Alpha/AlphaMCAsmInfo.cpp
@@ -0,0 +1,22 @@
+//===-- AlphaMCAsmInfo.cpp - Alpha asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AlphaMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaMCAsmInfo.h"
+using namespace llvm;
+
+AlphaMCAsmInfo::AlphaMCAsmInfo(const Target &T, const StringRef &TT) {
+ AlignmentIsInBytes = false;
+ PrivateGlobalPrefix = "$";
+ PICJumpTableDirective = ".gprel32";
+ WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/Target/Alpha/AlphaMCAsmInfo.h b/lib/Target/Alpha/AlphaMCAsmInfo.h
new file mode 100644
index 0000000..c27065d
--- /dev/null
+++ b/lib/Target/Alpha/AlphaMCAsmInfo.h
@@ -0,0 +1,29 @@
+//=====-- AlphaMCAsmInfo.h - Alpha asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AlphaMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHATARGETASMINFO_H
+#define ALPHATARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ struct AlphaMCAsmInfo : public MCAsmInfo {
+ explicit AlphaMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
index 0ff53c7..98e9730 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.cpp
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -28,6 +28,8 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include <cstdlib>
@@ -149,8 +151,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
//variable locals
//<- SP
-void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+unsigned
+AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
@@ -172,16 +176,16 @@ void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Now add the frame object offset to the offset from the virtual frame index.
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
- DOUT << "FI: " << FrameIndex << " Offset: " << Offset << "\n";
+ DEBUG(errs() << "FI: " << FrameIndex << " Offset: " << Offset << "\n");
Offset += MF.getFrameInfo()->getStackSize();
- DOUT << "Corrected Offset " << Offset
- << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n";
+ DEBUG(errs() << "Corrected Offset " << Offset
+ << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n");
if (Offset > IMM_HIGH || Offset < IMM_LOW) {
- DOUT << "Unconditionally using R28 for evil purposes Offset: "
- << Offset << "\n";
+ DEBUG(errs() << "Unconditionally using R28 for evil purposes Offset: "
+ << Offset << "\n");
//so in this case, we need to use a temporary register, and move the
//original inst off the SP/FP
//fix up the old:
@@ -195,6 +199,7 @@ void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else {
MI.getOperand(i).ChangeToImmediate(Offset);
}
+ return 0;
}
@@ -244,8 +249,10 @@ void AlphaRegisterInfo::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
.addImm(getLower16(NumBytes)).addReg(Alpha::R30);
} else {
- cerr << "Too big a stack frame at " << NumBytes << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+      Msg << "Too big a stack frame at " << NumBytes;
+ llvm_report_error(Msg.str());
}
//now if we need to, save the old FP and set the new
@@ -294,14 +301,16 @@ void AlphaRegisterInfo::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
.addImm(getLower16(NumBytes)).addReg(Alpha::R30);
} else {
- cerr << "Too big a stack frame at " << NumBytes << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+      Msg << "Too big a stack frame at " << NumBytes;
+ llvm_report_error(Msg.str());
}
}
}
unsigned AlphaRegisterInfo::getRARegister() const {
- assert(0 && "What is the return address register");
+ llvm_unreachable("What is the return address register");
return 0;
}
@@ -310,17 +319,17 @@ unsigned AlphaRegisterInfo::getFrameRegister(MachineFunction &MF) const {
}
unsigned AlphaRegisterInfo::getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
+ llvm_unreachable("What is the exception register");
return 0;
}
unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
+ llvm_unreachable("What is the exception handler register");
return 0;
}
int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "What is the dwarf register number");
+ llvm_unreachable("What is the dwarf register number");
return -1;
}
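
Note on the out-of-range offset path in eliminateFrameIndex and the
prologue/epilogue code: when an offset does not fit the 16-bit signed
displacement (IMM_LOW..IMM_HIGH), it is split across an LDAH/LDA pair via
getUpper16/getLower16 so that high * 65536 + low reassembles the offset. A
standalone sketch of such a split (my own formulation of the invariant, not
code copied from the patch; arithmetic right shift on negatives assumed, as
on all mainstream compilers):

  #include <cassert>
  #include <cstdint>

  // Split Off so that upper16(Off) * 65536 + lower16(Off) == Off,
  // with lower16(Off) in [-32768, 32767].
  static int64_t upper16(int64_t Off) { return (Off + 0x8000) >> 16; }
  static int64_t lower16(int64_t Off) { return Off - (upper16(Off) << 16); }

  int main() {
    for (int64_t Off : {40000LL, -40000LL, 32768LL, -32769LL, 123456LL}) {
      assert(upper16(Off) * 65536 + lower16(Off) == Off);
      assert(lower16(Off) >= -32768 && lower16(Off) <= 32767);
    }
    return 0;
  }
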
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
index 5012fe8..66f0898 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.h
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -41,8 +41,9 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
//void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
index d5a9365..bda7104 100644
--- a/lib/Target/Alpha/AlphaSubtarget.cpp
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -16,7 +16,7 @@
#include "AlphaGenSubtarget.inc"
using namespace llvm;
-AlphaSubtarget::AlphaSubtarget(const Module &M, const std::string &FS)
+AlphaSubtarget::AlphaSubtarget(const std::string &TT, const std::string &FS)
: HasCT(false) {
std::string CPU = "generic";
diff --git a/lib/Target/Alpha/AlphaSubtarget.h b/lib/Target/Alpha/AlphaSubtarget.h
index 0a944cb..f0eb93c 100644
--- a/lib/Target/Alpha/AlphaSubtarget.h
+++ b/lib/Target/Alpha/AlphaSubtarget.h
@@ -20,7 +20,6 @@
#include <string>
namespace llvm {
-class Module;
class AlphaSubtarget : public TargetSubtarget {
protected:
@@ -31,9 +30,9 @@ protected:
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- AlphaSubtarget(const Module &M, const std::string &FS);
+ AlphaSubtarget(const std::string &TT, const std::string &FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
index 060089c..b8bc13b 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.cpp
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -12,60 +12,26 @@
#include "Alpha.h"
#include "AlphaJITInfo.h"
-#include "AlphaTargetAsmInfo.h"
+#include "AlphaMCAsmInfo.h"
#include "AlphaTargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-// Register the targets
-static RegisterTarget<AlphaTargetMachine> X("alpha", "Alpha [experimental]");
-
-// No assembler printer by default
-AlphaTargetMachine::AsmPrinterCtorFn AlphaTargetMachine::AsmPrinterCtor = 0;
-
-// Force static initialization.
-extern "C" void LLVMInitializeAlphaTarget() { }
-
-const TargetAsmInfo *AlphaTargetMachine::createTargetAsmInfo() const {
- return new AlphaTargetAsmInfo(*this);
-}
-
-unsigned AlphaTargetMachine::getModuleMatchQuality(const Module &M) {
- // We strongly match "alpha*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 5 && TT[0] == 'a' && TT[1] == 'l' && TT[2] == 'p' &&
- TT[3] == 'h' && TT[4] == 'a')
- return 20;
- // If the target triple is something non-alpha, we don't match.
- if (!TT.empty()) return 0;
-
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer64)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
-}
-
-unsigned AlphaTargetMachine::getJITMatchQuality() {
-#ifdef __alpha
- return 10;
-#else
- return 0;
-#endif
+extern "C" void LLVMInitializeAlphaTarget() {
+ // Register the target.
+ RegisterTargetMachine<AlphaTargetMachine> X(TheAlphaTarget);
+ RegisterAsmInfo<AlphaMCAsmInfo> Y(TheAlphaTarget);
}
-AlphaTargetMachine::AlphaTargetMachine(const Module &M, const std::string &FS)
- : DataLayout("e-f128:128:128"),
+AlphaTargetMachine::AlphaTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ DataLayout("e-f128:128:128"),
FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
JITInfo(*this),
- Subtarget(M, FS),
+ Subtarget(TT, FS),
TLInfo(*this) {
setRelocationModel(Reloc::PIC_);
}
@@ -84,51 +50,40 @@ bool AlphaTargetMachine::addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
// Must run branch selection immediately preceding the asm printer
PM.add(createAlphaBranchSelectionPass());
- return false;
-}
-bool AlphaTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
PM.add(createAlphaLLRPPass(*this));
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
return false;
}
bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE) {
+ MachineCodeEmitter &MCE) {
PM.add(createAlphaCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
return false;
}
bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE) {
+ JITCodeEmitter &JCE) {
PM.add(createAlphaJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ return false;
+}
+bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ PM.add(createAlphaObjectCodeEmitterPass(*this, OCE));
return false;
}
bool AlphaTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
- return addCodeEmitter(PM, OptLevel, DumpAsm, MCE);
+ return addCodeEmitter(PM, OptLevel, MCE);
}
bool AlphaTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
- return addCodeEmitter(PM, OptLevel, DumpAsm, JCE);
+ return addCodeEmitter(PM, OptLevel, JCE);
+}
+bool AlphaTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ return addCodeEmitter(PM, OptLevel, OCE);
}
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
index 26684c7..f03e938 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.h
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -34,18 +34,9 @@ class AlphaTargetMachine : public LLVMTargetMachine {
AlphaSubtarget Subtarget;
AlphaTargetLowering TLInfo;
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- TargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- AlphaTargetMachine(const Module &M, const std::string &FS);
+ AlphaTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
@@ -61,31 +52,24 @@ public:
return &JITInfo;
}
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
-
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE);
+ ObjectCodeEmitter &JCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE);
-
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE);
};
} // end namespace llvm
diff --git a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
index 982ef5e..d8e8b79 100644
--- a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
+++ b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
@@ -17,16 +17,20 @@
#include "AlphaInstrInfo.h"
#include "AlphaTargetMachine.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
#include "llvm/Type.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Mangler.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -37,21 +41,22 @@ namespace {
/// Unique incrementer for label values for referencing Global values.
///
- explicit AlphaAsmPrinter(raw_ostream &o, TargetMachine &tm,
- const TargetAsmInfo *T, bool V)
+ explicit AlphaAsmPrinter(formatted_raw_ostream &o, TargetMachine &tm,
+ const MCAsmInfo *T, bool V)
: AsmPrinter(o, tm, T, V) {}
virtual const char *getPassName() const {
return "Alpha Assembly Printer";
}
- bool printInstruction(const MachineInstr *MI);
+ void printInstruction(const MachineInstr *MI);
+ static const char *getRegisterName(unsigned RegNo);
+
void printOp(const MachineOperand &MO, bool IsCallOp = false);
void printOperand(const MachineInstr *MI, int opNum);
- void printBaseOffsetPair (const MachineInstr *MI, int i, bool brackets=true);
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void printBaseOffsetPair(const MachineInstr *MI, int i, bool brackets=true);
+ void PrintGlobalVariable(const GlobalVariable *GVar);
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
+ void EmitStartOfAsmFile(Module &M);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode);
@@ -62,17 +67,6 @@ namespace {
};
} // end of anonymous namespace
-/// createAlphaCodePrinterPass - Returns a pass that prints the Alpha
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description. This should work
-/// regardless of whether the function is in SSA form.
-///
-FunctionPass *llvm::createAlphaCodePrinterPass(raw_ostream &o,
- TargetMachine &tm,
- bool verbose) {
- return new AlphaAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
#include "AlphaGenAsmWriter.inc"
void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
@@ -81,7 +75,7 @@ void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
if (MO.getType() == MachineOperand::MO_Register) {
assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
"Not physreg??");
- O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ O << getRegisterName(MO.getReg());
} else if (MO.isImm()) {
O << MO.getImm();
assert(MO.getImm() < (1 << 30));
@@ -92,24 +86,21 @@ void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) {
- const TargetRegisterInfo &RI = *TM.getRegisterInfo();
-
switch (MO.getType()) {
case MachineOperand::MO_Register:
- O << RI.get(MO.getReg()).AsmName;
+ O << getRegisterName(MO.getReg());
return;
case MachineOperand::MO_Immediate:
- cerr << "printOp() does not handle immediate values\n";
- abort();
+ llvm_unreachable("printOp() does not handle immediate values");
return;
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getIndex();
return;
@@ -117,14 +108,12 @@ void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) {
O << MO.getSymbolName();
return;
- case MachineOperand::MO_GlobalAddress: {
- GlobalValue *GV = MO.getGlobal();
- O << Mang->getValueName(GV);
+ case MachineOperand::MO_GlobalAddress:
+ O << Mang->getMangledName(MO.getGlobal());
return;
- }
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
return;
@@ -151,13 +140,14 @@ bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Print out labels for the function.
const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
EmitAlignment(MF.getAlignment(), F);
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::InternalLinkage: // Symbols default to internal.
case Function::PrivateLinkage:
+ case Function::LinkerPrivateLinkage:
break;
case Function::ExternalLinkage:
O << "\t.globl " << CurrentFnName << "\n";
@@ -166,7 +156,7 @@ bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
case Function::WeakODRLinkage:
case Function::LinkOnceAnyLinkage:
case Function::LinkOnceODRLinkage:
- O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+ O << MAI->getWeakRefDirective() << CurrentFnName << "\n";
break;
}
@@ -180,17 +170,19 @@ bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
I != E; ++I) {
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
// Print the assembly for the instruction.
++EmittedInsts;
- if (!printInstruction(II)) {
- assert(0 && "Unhandled instruction in asm writer!");
- abort();
- }
+ processDebugLoc(II, true);
+ printInstruction(II);
+
+ if (VerboseAsm && !II->getDebugLoc().isUnknown())
+ EmitComments(*II);
+ O << '\n';
+ processDebugLoc(II, false);
}
}
@@ -200,17 +192,15 @@ bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-bool AlphaAsmPrinter::doInitialization(Module &M)
-{
- if(TM.getSubtarget<AlphaSubtarget>().hasCT())
+void AlphaAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (TM.getSubtarget<AlphaSubtarget>().hasCT())
O << "\t.arch ev6\n"; //This might need to be ev67, so leave this test here
else
O << "\t.arch ev6\n";
O << "\t.set noat\n";
- return AsmPrinter::doInitialization(M);
}
-void AlphaAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void AlphaAsmPrinter::PrintGlobalVariable(const GlobalVariable *GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer()) return;  // External globals require no code.
@@ -219,15 +209,14 @@ void AlphaAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (EmitSpecialLLVMGlobal(GVar))
return;
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
unsigned Size = TD->getTypeAllocSize(C->getType());
unsigned Align = TD->getPreferredAlignmentLog(GVar);
// 0: Switch to section
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
// 1: Check visibility
printVisibility(name, GVar->getVisibility());
@@ -239,23 +228,22 @@ void AlphaAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::CommonLinkage:
- O << TAI->getWeakRefDirective() << name << '\n';
+ O << MAI->getWeakRefDirective() << name << '\n';
break;
case GlobalValue::AppendingLinkage:
case GlobalValue::ExternalLinkage:
- O << TAI->getGlobalDirective() << name << "\n";
+ O << MAI->getGlobalDirective() << name << "\n";
break;
case GlobalValue::InternalLinkage:
case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
break;
default:
- assert(0 && "Unknown linkage type!");
- cerr << "Unknown linkage type!\n";
- abort();
+ llvm_unreachable("Unknown linkage type!");
}
// 3: Type, Size, Align
- if (TAI->hasDotTypeDotSizeDirective()) {
+ if (MAI->hasDotTypeDotSizeDirective()) {
O << "\t.type\t" << name << ", @object\n";
O << "\t.size\t" << name << ", " << Size << "\n";
}
@@ -268,14 +256,6 @@ void AlphaAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << '\n';
}
-bool AlphaAsmPrinter::doFinalization(Module &M) {
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
-
- return AsmPrinter::doFinalization(M);
-}
-
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool AlphaAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -298,12 +278,6 @@ bool AlphaAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
// Force static initialization.
-extern "C" void LLVMInitializeAlphaAsmPrinter() { }
-
-namespace {
- static struct Register {
- Register() {
- AlphaTargetMachine::registerAsmPrinter(createAlphaCodePrinterPass);
- }
- } Registrator;
+extern "C" void LLVMInitializeAlphaAsmPrinter() {
+ RegisterAsmPrinter<AlphaAsmPrinter> X(TheAlphaTarget);
}
diff --git a/lib/Target/Alpha/AsmPrinter/Makefile b/lib/Target/Alpha/AsmPrinter/Makefile
index c5b3e94..3c64a3c 100644
--- a/lib/Target/Alpha/AsmPrinter/Makefile
+++ b/lib/Target/Alpha/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/Alpha/Makefile ---------------------------*- Makefile -*-===##
+##===- lib/Target/Alpha/AsmPrinter/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/Alpha/CMakeLists.txt b/lib/Target/Alpha/CMakeLists.txt
index 2a382d5..b4f41ae 100644
--- a/lib/Target/Alpha/CMakeLists.txt
+++ b/lib/Target/Alpha/CMakeLists.txt
@@ -8,6 +8,7 @@ tablegen(AlphaGenInstrInfo.inc -gen-instr-desc)
tablegen(AlphaGenCodeEmitter.inc -gen-emitter)
tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
+tablegen(AlphaGenCallingConv.inc -gen-callingconv)
tablegen(AlphaGenSubtarget.inc -gen-subtarget)
add_llvm_target(AlphaCodeGen
@@ -18,9 +19,9 @@ add_llvm_target(AlphaCodeGen
AlphaISelLowering.cpp
AlphaJITInfo.cpp
AlphaLLRP.cpp
+ AlphaMCAsmInfo.cpp
AlphaRegisterInfo.cpp
AlphaSubtarget.cpp
- AlphaTargetAsmInfo.cpp
AlphaTargetMachine.cpp
)
diff --git a/lib/Target/Alpha/Makefile b/lib/Target/Alpha/Makefile
index d6c82c7..d2d7109 100644
--- a/lib/Target/Alpha/Makefile
+++ b/lib/Target/Alpha/Makefile
@@ -15,8 +15,8 @@ BUILT_SOURCES = AlphaGenRegisterInfo.h.inc AlphaGenRegisterNames.inc \
AlphaGenRegisterInfo.inc AlphaGenInstrNames.inc \
AlphaGenInstrInfo.inc AlphaGenCodeEmitter.inc \
AlphaGenAsmWriter.inc AlphaGenDAGISel.inc \
- AlphaGenSubtarget.inc
+ AlphaGenCallingConv.inc AlphaGenSubtarget.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
new file mode 100644
index 0000000..f7099b9
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- AlphaTargetInfo.cpp - Alpha Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+llvm::Target llvm::TheAlphaTarget;
+
+extern "C" void LLVMInitializeAlphaTargetInfo() {
+ RegisterTarget<Triple::alpha, /*HasJIT=*/true>
+ X(TheAlphaTarget, "alpha", "Alpha [experimental]");
+}
diff --git a/lib/Target/Alpha/TargetInfo/CMakeLists.txt b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..2a7291b
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAlphaInfo
+ AlphaTargetInfo.cpp
+ )
+
+add_dependencies(LLVMAlphaInfo AlphaCodeGenTable_gen)
diff --git a/lib/Target/Alpha/TargetInfo/Makefile b/lib/Target/Alpha/TargetInfo/Makefile
new file mode 100644
index 0000000..de01d7f
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Alpha/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMAlphaInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp b/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp
new file mode 100644
index 0000000..91fd5dd
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp
@@ -0,0 +1,242 @@
+//===-- BlackfinAsmPrinter.cpp - Blackfin LLVM assembly writer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format Blackfin assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Blackfin.h"
+#include "BlackfinInstrInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN BlackfinAsmPrinter : public AsmPrinter {
+ public:
+ BlackfinAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *MAI, bool V)
+ : AsmPrinter(O, TM, MAI, V) {}
+
+ virtual const char *getPassName() const {
+ return "Blackfin Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printMemoryOperand(const MachineInstr *MI, int opNum);
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
+ void emitLinkage(const std::string &n, GlobalValue::LinkageTypes l);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ void PrintGlobalVariable(const GlobalVariable* GVar);
+ };
+} // end of anonymous namespace
+
+#include "BlackfinGenAsmWriter.inc"
+
+extern "C" void LLVMInitializeBlackfinAsmPrinter() {
+ RegisterAsmPrinter<BlackfinAsmPrinter> X(TheBlackfinTarget);
+}
+
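+// emitLinkage - Emit the linkage directives for a symbol: nothing for
+// internal and private symbols, the global directive for external symbols,
+// and both the global and weak-definition directives for weak and linkonce
+// symbols.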
+void BlackfinAsmPrinter::emitLinkage(const std::string &name,
+ GlobalValue::LinkageTypes l) {
+ switch (l) {
+ default: llvm_unreachable("Unknown linkage type!");
+ case GlobalValue::InternalLinkage: // Symbols default to internal.
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
+ break;
+ case GlobalValue::ExternalLinkage:
+ O << MAI->getGlobalDirective() << name << "\n";
+ break;
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ O << MAI->getGlobalDirective() << name << "\n";
+ O << MAI->getWeakDefDirective() << name << "\n";
+ break;
+ }
+}
+
+void BlackfinAsmPrinter::PrintGlobalVariable(const GlobalVariable* GV) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GV->hasInitializer() || EmitSpecialLLVMGlobal(GV))
+ return;
+
+ std::string name = Mang->getMangledName(GV);
+ Constant *C = GV->getInitializer();
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,
+ TM));
+ emitLinkage(name, GV->getLinkage());
+ EmitAlignment(TD->getPreferredAlignmentLog(GV), GV);
+ printVisibility(name, GV->getVisibility());
+
+ O << "\t.type " << name << ", STT_OBJECT\n";
+ O << "\t.size " << name << ',' << TD->getTypeAllocSize(C->getType()) << '\n';
+ O << name << ":\n";
+ EmitGlobalConstant(C);
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool BlackfinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+ EmitConstantPool(MF.getConstantPool());
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ const Function *F = MF.getFunction();
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+ EmitAlignment(2, F);
+ emitLinkage(CurrentFnName, F->getLinkage());
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ", STT_FUNC\n"
+ << CurrentFnName << ":\n";
+
+ if (DW)
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ EmitBasicBlockStart(I);
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ processDebugLoc(II, true);
+
+ printInstruction(II);
+ if (VerboseAsm && !II->getDebugLoc().isUnknown())
+ EmitComments(*II);
+ O << '\n';
+
+ processDebugLoc(II, false);
+ ++EmittedInsts;
+ }
+ }
+
+ O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+ if (DW)
+ DW->EndFunction(&MF);
+
+ return false;
+}
+
+void BlackfinAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand (opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should be already mapped!");
+ O << getRegisterName(MO.getReg());
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ O << Mang->getMangledName(MO.getGlobal());
+ printOffset(MO.getOffset());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << Mang->makeNameProper(MO.getSymbolName());
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ << MO.getIndex();
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ default:
+ llvm_unreachable("<unknown operand type>");
+ break;
+ }
+}
+
+void BlackfinAsmPrinter::printMemoryOperand(const MachineInstr *MI, int opNum) {
+ printOperand(MI, opNum);
+
+ if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
+ return;
+
+ O << " + ";
+ printOperand(MI, opNum+1);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool BlackfinAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'r':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo);
+
+ return false;
+}
+
+bool BlackfinAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printOperand(MI, OpNo);
+ O << ']';
+
+ return false;
+}
diff --git a/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt b/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..795aebf
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMBlackfinAsmPrinter
+ BlackfinAsmPrinter.cpp
+ )
+add_dependencies(LLVMBlackfinAsmPrinter BlackfinCodeGenTable_gen)
diff --git a/lib/Target/Blackfin/AsmPrinter/Makefile b/lib/Target/Blackfin/AsmPrinter/Makefile
new file mode 100644
index 0000000..091d4df
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/Blackfin/AsmPrinter/Makefile -------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBlackfinAsmPrinter
+
+# Hack: we need to include 'main' Blackfin target directory to grab private
+# headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Blackfin/Blackfin.h b/lib/Target/Blackfin/Blackfin.h
new file mode 100644
index 0000000..ec1fa86
--- /dev/null
+++ b/lib/Target/Blackfin/Blackfin.h
@@ -0,0 +1,38 @@
+//=== Blackfin.h - Top-level interface for Blackfin backend -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Blackfin back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_BLACKFIN_H
+#define TARGET_BLACKFIN_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+ class FunctionPass;
+ class BlackfinTargetMachine;
+
+ FunctionPass *createBlackfinISelDag(BlackfinTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ extern Target TheBlackfinTarget;
+
+} // end namespace llvm
+
+// Defines symbolic names for Blackfin registers. This defines a mapping from
+// register name to register number.
+#include "BlackfinGenRegisterNames.inc"
+
+// Defines symbolic names for the Blackfin instructions.
+#include "BlackfinGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Blackfin/Blackfin.td b/lib/Target/Blackfin/Blackfin.td
new file mode 100644
index 0000000..b904638
--- /dev/null
+++ b/lib/Target/Blackfin/Blackfin.td
@@ -0,0 +1,201 @@
+//===- Blackfin.td - Describe the Blackfin Target Machine --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Blackfin Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureSDRAM : SubtargetFeature<"sdram", "sdram", "true",
+ "Build for SDRAM">;
+
+def FeatureICPLB : SubtargetFeature<"icplb", "icplb", "true",
+ "Assume instruction cache lookaside buffers are enabled at runtime">;
+
+//===----------------------------------------------------------------------===//
+// Bugs in the silicon become workarounds in the compiler.
+// See http://www.analog.com/ for the full list of IC anomalies.
+//===----------------------------------------------------------------------===//
+
+def WA_MI_SHIFT : SubtargetFeature<"mi-shift-anomaly","wa_mi_shift", "true",
+ "Work around 05000074 - "
+ "Multi-Issue Instruction with dsp32shiftimm and P-reg Store">;
+
+def WA_CSYNC : SubtargetFeature<"csync-anomaly","wa_csync", "true",
+ "Work around 05000244 - "
+ "If I-Cache Is On, CSYNC/SSYNC/IDLE Around Change of Control">;
+
+def WA_SPECLD : SubtargetFeature<"specld-anomaly","wa_specld", "true",
+ "Work around 05000245 - "
+ "Access in the Shadow of a Conditional Branch">;
+
+def WA_HWLOOP : SubtargetFeature<"hwloop-anomaly","wa_hwloop", "true",
+ "Work around 05000257 - "
+ "Interrupt/Exception During Short Hardware Loop">;
+
+def WA_MMR_STALL : SubtargetFeature<"mmr-stall-anomaly","wa_mmr_stall", "true",
+ "Work around 05000283 - "
+ "System MMR Write Is Stalled Indefinitely when Killed">;
+
+def WA_LCREGS : SubtargetFeature<"lcregs-anomaly","wa_lcregs", "true",
+ "Work around 05000312 - "
+ "SSYNC, CSYNC, or Loads to LT, LB and LC Registers Are Interrupted">;
+
+def WA_KILLED_MMR : SubtargetFeature<"killed-mmr-anomaly",
+ "wa_killed_mmr", "true",
+ "Work around 05000315 - "
+ "Killed System MMR Write Completes Erroneously on Next System MMR Access">;
+
+def WA_RETS : SubtargetFeature<"rets-anomaly", "wa_rets", "true",
+ "Work around 05000371 - "
+ "Possible RETS Register Corruption when Subroutine Is under 5 Cycles">;
+
+def WA_IND_CALL : SubtargetFeature<"ind-call-anomaly", "wa_ind_call", "true",
+ "Work around 05000426 - "
+ "Speculative Fetches of Indirect-Pointer Instructions">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "BlackfinRegisterInfo.td"
+include "BlackfinCallingConv.td"
+include "BlackfinInstrInfo.td"
+
+def BlackfinInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Blackfin processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, string Suffix, list<SubtargetFeature> Features>
+ : Processor<!strconcat(Name, Suffix), NoItineraries, Features>;
+
+def : Proc<"generic", "", []>;
+
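+// Each Core multiclass below defines, per supported silicon revision,
+// "<name><rev>" with that revision's workarounds, plus a bare "<name>"
+// matching the newest listed revision and a "<name>-none" variant with no
+// anomaly workarounds at all.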
+multiclass Core<string Name,string Suffix,
+ list<SubtargetFeature> Features> {
+ def : Proc<Name, Suffix, Features>;
+ def : Proc<Name, "", Features>;
+ def : Proc<Name, "-none", []>;
+}
+
+multiclass CoreEdinburgh<string Name>
+ : Core<Name, "-0.6", [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS]> {
+ def : Proc<Name, "-0.5",
+ [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+ WA_RETS]>;
+ def : Proc<Name, "-0.4",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS]>;
+ def : Proc<Name, "-0.3",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS]>;
+ def : Proc<Name, "-any",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS]>;
+}
+multiclass CoreBraemar<string Name>
+ : Core<Name, "-0.3",
+ [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]> {
+ def : Proc<Name, "-0.2",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-any",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreStirling<string Name>
+ : Core<Name, "-0.5", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+ def : Proc<Name, "-0.4",
+ [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-0.3",
+ [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+ WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-any",
+ [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+ WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreMoab<string Name>
+ : Core<Name, "-0.3", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+ def : Proc<Name, "-0.2", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+ def : Proc<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-0.0",
+ [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-any",
+ [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreTeton<string Name>
+ : Core<Name, "-0.5",
+ [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+ WA_RETS, WA_IND_CALL]> {
+ def : Proc<Name, "-0.3",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-any",
+ [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+ WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreKookaburra<string Name>
+ : Core<Name, "-0.2", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+ def : Proc<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+ def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreMockingbird<string Name>
+ : Core<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+ def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+ def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+}
+multiclass CoreBrodie<string Name>
+ : Core<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+ def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+ def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+}
+
+defm BF512 : CoreBrodie<"bf512">;
+defm BF514 : CoreBrodie<"bf514">;
+defm BF516 : CoreBrodie<"bf516">;
+defm BF518 : CoreBrodie<"bf518">;
+defm BF522 : CoreMockingbird<"bf522">;
+defm BF523 : CoreKookaburra<"bf523">;
+defm BF524 : CoreMockingbird<"bf524">;
+defm BF525 : CoreKookaburra<"bf525">;
+defm BF526 : CoreMockingbird<"bf526">;
+defm BF527 : CoreKookaburra<"bf527">;
+defm BF531 : CoreEdinburgh<"bf531">;
+defm BF532 : CoreEdinburgh<"bf532">;
+defm BF533 : CoreEdinburgh<"bf533">;
+defm BF534 : CoreBraemar<"bf534">;
+defm BF536 : CoreBraemar<"bf536">;
+defm BF537 : CoreBraemar<"bf537">;
+defm BF538 : CoreStirling<"bf538">;
+defm BF539 : CoreStirling<"bf539">;
+defm BF542 : CoreMoab<"bf542">;
+defm BF544 : CoreMoab<"bf544">;
+defm BF548 : CoreMoab<"bf548">;
+defm BF549 : CoreMoab<"bf549">;
+defm BF561 : CoreTeton<"bf561">;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Blackfin : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = BlackfinInstrInfo;
+}
diff --git a/lib/Target/Blackfin/BlackfinCallingConv.td b/lib/Target/Blackfin/BlackfinCallingConv.td
new file mode 100644
index 0000000..0abc84c
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinCallingConv.td
@@ -0,0 +1,30 @@
+//===--- BlackfinCallingConv.td - Calling Conventions ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Blackfin architectures.
+//
+//===----------------------------------------------------------------------===//
+
+// Blackfin C Calling convention.
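+// In the rules below, i16 arguments are promoted to i32, a struct-return
+// pointer is passed in P0, the first three remaining arguments go in R0-R2,
+// and everything else is passed on the stack in 4-byte aligned slots.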
+def CC_Blackfin : CallingConv<[
+ CCIfType<[i16], CCPromoteToType<i32>>,
+ CCIfSRet<CCAssignToReg<[P0]>>,
+ CCAssignToReg<[R0, R1, R2]>,
+ CCAssignToStack<4, 4>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Blackfin C return-value convention.
+def RetCC_Blackfin : CallingConv<[
+ CCIfType<[i16], CCPromoteToType<i32>>,
+ CCAssignToReg<[R0, R1]>
+]>;
diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
new file mode 100644
index 0000000..fc62a18
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
@@ -0,0 +1,191 @@
+//===- BlackfinISelDAGToDAG.cpp - A dag to dag inst selector for Blackfin -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Blackfin target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "BlackfinISelLowering.h"
+#include "BlackfinTargetMachine.h"
+#include "BlackfinRegisterInfo.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// BlackfinDAGToDAGISel - Blackfin-specific code to select Blackfin machine
+/// instructions for SelectionDAG operations.
+namespace {
+ class BlackfinDAGToDAGISel : public SelectionDAGISel {
+ /// Subtarget - Keep a pointer to the Blackfin Subtarget around so that we
+ /// can make the right decision when generating code for different targets.
+ //const BlackfinSubtarget &Subtarget;
+ public:
+ BlackfinDAGToDAGISel(BlackfinTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel) {}
+
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "Blackfin DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Include the pieces autogenerated from the target description.
+#include "BlackfinGenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDValue Op);
+ bool SelectADDRspii(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Offset);
+
+ // Walk the DAG after instruction selection, fixing register class issues.
+ void FixRegisterClasses(SelectionDAG &DAG);
+
+ const BlackfinInstrInfo &getInstrInfo() {
+ return *static_cast<const BlackfinTargetMachine&>(TM).getInstrInfo();
+ }
+ const BlackfinRegisterInfo *getRegisterInfo() {
+ return static_cast<const BlackfinTargetMachine&>(TM).getRegisterInfo();
+ }
+ };
+} // end anonymous namespace
+
+FunctionPass *llvm::createBlackfinISelDag(BlackfinTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new BlackfinDAGToDAGISel(TM, OptLevel);
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void BlackfinDAGToDAGISel::InstructionSelect() {
+ // Select target instructions for the DAG.
+ SelectRoot(*CurDAG);
+ DEBUG(errs() << "Selected selection DAG before regclass fixup:\n");
+ DEBUG(CurDAG->dump());
+ FixRegisterClasses(*CurDAG);
+}
+
+SDNode *BlackfinDAGToDAGISel::Select(SDValue Op) {
+ SDNode *N = Op.getNode();
+ DebugLoc dl = N->getDebugLoc();
+ if (N->isMachineOpcode())
+ return NULL; // Already selected.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::FrameIndex: {
+ // Selects to ADDpp FI, 0 which in turn will become ADDimm7 SP, imm or ADDpp
+ // SP, Px
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
+ return CurDAG->SelectNodeTo(N, BF::ADDpp, MVT::i32, TFI,
+ CurDAG->getTargetConstant(0, MVT::i32));
+ }
+ }
+
+ return SelectCode(Op);
+}
+
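+// SelectADDRspii - Match a stack address of the form "FI" or "FI + imm",
+// where imm is a non-negative multiple of 4, and split it into the base and
+// offset operands expected by the frame-index load/store patterns.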
+bool BlackfinDAGToDAGISel::SelectADDRspii(SDValue Op,
+ SDValue Addr,
+ SDValue &Base,
+ SDValue &Offset) {
+ FrameIndexSDNode *FIN = 0;
+ if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ ConstantSDNode *CN = 0;
+ if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) &&
+ (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) &&
+ (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+ // Constant positive word offset from frame index
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool isCC(const TargetRegisterClass *RC) {
+ return RC == &BF::AnyCCRegClass || BF::AnyCCRegClass.hasSubClass(RC);
+}
+
+static inline bool isDCC(const TargetRegisterClass *RC) {
+ return RC == &BF::DRegClass || BF::DRegClass.hasSubClass(RC) || isCC(RC);
+}
+
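+// Replace operand Num of N with Val. UpdateNodeOperands may CSE N into an
+// existing node, so redirect all uses of N to whatever node comes back.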
+static void UpdateNodeOperand(SelectionDAG &DAG,
+ SDNode *N,
+ unsigned Num,
+ SDValue Val) {
+ SmallVector<SDValue, 8> ops(N->op_begin(), N->op_end());
+ ops[Num] = Val;
+ SDValue New = DAG.UpdateNodeOperands(SDValue(N, 0), ops.data(), ops.size());
+ DAG.ReplaceAllUsesWith(N, New.getNode());
+}
+
+// After instruction selection, insert COPY_TO_REGCLASS nodes to help in
+// choosing the proper register classes.
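+// CC-class values can only be copied to or from D registers, so when a
+// CC-class def feeds a use expecting an incompatible class (or vice versa),
+// reroute the value through a D register.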
+void BlackfinDAGToDAGISel::FixRegisterClasses(SelectionDAG &DAG) {
+ const BlackfinInstrInfo &TII = getInstrInfo();
+ const BlackfinRegisterInfo *TRI = getRegisterInfo();
+ DAG.AssignTopologicalOrder();
+ HandleSDNode Dummy(DAG.getRoot());
+
+ for (SelectionDAG::allnodes_iterator NI = DAG.allnodes_begin();
+ NI != DAG.allnodes_end(); ++NI) {
+ if (NI->use_empty() || !NI->isMachineOpcode())
+ continue;
+ const TargetInstrDesc &DefTID = TII.get(NI->getMachineOpcode());
+ for (SDNode::use_iterator UI = NI->use_begin(); !UI.atEnd(); ++UI) {
+ if (!UI->isMachineOpcode())
+ continue;
+
+ if (UI.getUse().getResNo() >= DefTID.getNumDefs())
+ continue;
+ const TargetRegisterClass *DefRC =
+ DefTID.OpInfo[UI.getUse().getResNo()].getRegClass(TRI);
+
+ const TargetInstrDesc &UseTID = TII.get(UI->getMachineOpcode());
+ if (UseTID.getNumDefs()+UI.getOperandNo() >= UseTID.getNumOperands())
+ continue;
+ const TargetRegisterClass *UseRC =
+ UseTID.OpInfo[UseTID.getNumDefs()+UI.getOperandNo()].getRegClass(TRI);
+ if (!DefRC || !UseRC)
+ continue;
+ // We cannot copy CC <-> !(CC/D)
+ if ((isCC(DefRC) && !isDCC(UseRC)) || (isCC(UseRC) && !isDCC(DefRC))) {
+ SDNode *Copy =
+ DAG.getMachineNode(TargetInstrInfo::COPY_TO_REGCLASS,
+ NI->getDebugLoc(),
+ MVT::i32,
+ UI.getUse().get(),
+ DAG.getTargetConstant(BF::DRegClassID, MVT::i32));
+ UpdateNodeOperand(DAG, *UI, UI.getOperandNo(), SDValue(Copy, 0));
+ }
+ }
+ }
+ DAG.setRoot(Dummy.getValue());
+}
+
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
new file mode 100644
index 0000000..4b321ec
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -0,0 +1,614 @@
+//===- BlackfinISelLowering.cpp - Blackfin DAG Lowering Implementation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Blackfin uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinISelLowering.h"
+#include "BlackfinTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
+ : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+ setShiftAmountType(MVT::i16);
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setStackPointerRegisterToSaveRestore(BF::SP);
+ setIntDivIsCheap(false);
+
+ // Set up the legal register classes.
+ addRegisterClass(MVT::i32, BF::DRegisterClass);
+ addRegisterClass(MVT::i16, BF::D16RegisterClass);
+
+ computeRegisterProperties();
+
+ // Blackfin doesn't have i1 loads or stores
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+ // i16 registers don't do much
+ setOperationAction(ISD::AND, MVT::i16, Promote);
+ setOperationAction(ISD::OR, MVT::i16, Promote);
+ setOperationAction(ISD::XOR, MVT::i16, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+ // The expansion of CTLZ/CTTZ uses AND/OR, so we might as well promote
+ // immediately.
+ setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+ setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+ setOperationAction(ISD::SETCC, MVT::i16, Promote);
+
+ // Blackfin has no division
+ setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+ // No carry-in operations.
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+
+ // Blackfin has no intrinsics for these particular operations.
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // i32 has native CTPOP, but not CTLZ/CTTZ
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+
+ // READCYCLECOUNTER needs special type legalization.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+}
+
+const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return 0;
+ case BFISD::CALL: return "BFISD::CALL";
+ case BFISD::RET_FLAG: return "BFISD::RET_FLAG";
+ case BFISD::Wrapper: return "BFISD::Wrapper";
+ }
+}
+
+MVT::SimpleValueType BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
+ // SETCC always sets the CC register. Technically that is an i1 register, but
+ // that type is not legal, so we treat it as an i32 register.
+ return MVT::i32;
+}
+
+SDValue BlackfinTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc DL = Op.getDebugLoc();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+ Op = DAG.getTargetGlobalAddress(GV, MVT::i32);
+ return DAG.getNode(BFISD::Wrapper, DL, MVT::i32, Op);
+}
+
+SDValue BlackfinTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc DL = Op.getDebugLoc();
+ int JTI = cast<JumpTableSDNode>(Op)->getIndex();
+
+ Op = DAG.getTargetJumpTable(JTI, MVT::i32);
+ return DAG.getNode(BFISD::Wrapper, DL, MVT::i32, Op);
+}
+
+SDValue
+BlackfinTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AllocateStack(12, 4); // ABI requires 12 bytes stack space
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Blackfin);
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC = VA.getLocReg() == BF::P0 ?
+ BF::PRegisterClass : BF::DRegisterClass;
+ assert(RC->contains(VA.getLocReg()) && "Unexpected regclass in CCState");
+ assert(RC->hasType(RegVT) && "Unexpected regclass in CCState");
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(RC);
+ MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ InVals.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc() && "CCValAssign must be RegLoc or MemLoc");
+ unsigned ObjSize = VA.getLocVT().getStoreSize();
+ int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset());
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
+ }
+ }
+
+ return Chain;
+}
+
+SDValue
+BlackfinTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represents the assignment of the return value to locations.
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
+ RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Blackfin);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue Opi = Outs[i].Val;
+
+ // Expand to i32 if necessary
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Opi = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Opi);
+ break;
+ case CCValAssign::ZExt:
+ Opi = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Opi);
+ break;
+ case CCValAssign::AExt:
+ Opi = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Opi);
+ break;
+ }
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Opi, SDValue());
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode()) {
+ return DAG.getNode(BFISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+ } else {
+ return DAG.getNode(BFISD::RET_FLAG, dl, MVT::Other, Chain);
+ }
+}
+
+SDValue
+BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
+ *DAG.getContext());
+ CCInfo.AllocateStack(12, 4); // ABI requires 12 bytes stack space
+ CCInfo.AnalyzeCallOperands(Outs, CC_Blackfin);
+
+ // Get the size of the outgoing arguments stack space requirement.
+ unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = Outs[i].Val;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+    // Arguments that are passed in registers must be collected in the
+    // RegsToPass vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc() && "CCValAssign must be RegLoc or MemLoc");
+ int Offset = VA.getLocMemOffset();
+ assert(Offset%4 == 0 && "Unaligned LocMemOffset");
+ assert(VA.getLocVT()==MVT::i32 && "Illegal CCValAssign type");
+ SDValue SPN = DAG.getCopyFromReg(Chain, dl, BF::SP, MVT::i32);
+ SDValue OffsetN = DAG.getIntPtrConstant(Offset);
+ OffsetN = DAG.getNode(ISD::ADD, dl, MVT::i32, SPN, OffsetN);
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, OffsetN,
+ PseudoSourceValue::getStack(),
+ Offset));
+ }
+ }
+
+ // Transform all store nodes into one single node because
+ // all store nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token
+ // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+ // stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+ std::vector<EVT> NodeTys;
+ NodeTys.push_back(MVT::Other); // Returns a chain
+ NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
+ SDValue Ops[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(BFISD::CALL, dl, NodeTys, Ops,
+ InFlag.getNode() ? 3 : 2);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RVInfo(CallConv, isVarArg, DAG.getTarget(), RVLocs,
+ *DAG.getContext());
+
+ RVInfo.AnalyzeCallResult(Ins, RetCC_Blackfin);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &RV = RVLocs[i];
+ unsigned Reg = RV.getLocReg();
+
+ Chain = DAG.getCopyFromReg(Chain, dl, Reg,
+ RVLocs[i].getLocVT(), InFlag);
+ SDValue Val = Chain.getValue(0);
+ InFlag = Chain.getValue(2);
+ Chain = Chain.getValue(1);
+
+ // Callee is responsible for extending any i16 return values.
+ switch (RV.getLocInfo()) {
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, dl, RV.getLocVT(), Val,
+ DAG.getValueType(RV.getValVT()));
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, dl, RV.getLocVT(), Val,
+ DAG.getValueType(RV.getValVT()));
+ break;
+ default:
+ break;
+ }
+
+ // Truncate to valtype
+ if (RV.getLocInfo() != CCValAssign::Full)
+ Val = DAG.getNode(ISD::TRUNCATE, dl, RV.getValVT(), Val);
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// Expansion of ADDE / SUBE. This is a bit involved since Blackfin doesn't have
+// add-with-carry instructions.
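+// Because the incoming carry is either 0 or 1, at most one of the two
+// partial additions can overflow, so OR-ing the two partial carry flags
+// yields the correct outgoing carry (and similarly for SUBE borrows).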
+SDValue BlackfinTargetLowering::LowerADDE(SDValue Op, SelectionDAG &DAG) {
+ // Operands: lhs, rhs, carry-in (AC0 flag)
+ // Results: sum, carry-out (AC0 flag)
+ DebugLoc dl = Op.getDebugLoc();
+
+ unsigned Opcode = Op.getOpcode()==ISD::ADDE ? BF::ADD : BF::SUB;
+
+ // zext incoming carry flag in AC0 to 32 bits
+ SDNode* CarryIn = DAG.getMachineNode(BF::MOVE_cc_ac0, dl, MVT::i32,
+ /* flag= */ Op.getOperand(2));
+ CarryIn = DAG.getMachineNode(BF::MOVECC_zext, dl, MVT::i32,
+ SDValue(CarryIn, 0));
+
+ // Add operands, produce sum and carry flag
+ SDNode *Sum = DAG.getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+ Op.getOperand(0), Op.getOperand(1));
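+// Memory operands are printed as "base" or "base + offset"; a zero
+// immediate offset is omitted.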
+
+ // Store intermediate carry from Sum
+ SDNode* Carry1 = DAG.getMachineNode(BF::MOVE_cc_ac0, dl, MVT::i32,
+ /* flag= */ SDValue(Sum, 1));
+
+ // Add incoming carry, again producing an output flag
+ Sum = DAG.getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+ SDValue(Sum, 0), SDValue(CarryIn, 0));
+
+ // Update AC0 with the intermediate carry, producing a flag.
+ SDNode *CarryOut = DAG.getMachineNode(BF::OR_ac0_cc, dl, MVT::Flag,
+ SDValue(Carry1, 0));
+
+ // Compose (i32, flag) pair
+ SDValue ops[2] = { SDValue(Sum, 0), SDValue(CarryOut, 0) };
+ return DAG.getMergeValues(ops, 2, dl);
+}
+
+SDValue BlackfinTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default:
+ Op.getNode()->dump();
+ llvm_unreachable("Should not custom lower this!");
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ llvm_unreachable("TLS not implemented for Blackfin.");
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ // Frame & Return address. Currently unimplemented
+ case ISD::FRAMEADDR: return SDValue();
+ case ISD::RETURNADDR: return SDValue();
+ case ISD::ADDE:
+ case ISD::SUBE: return LowerADDE(Op, DAG);
+ }
+}
+
+void
+BlackfinTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ return;
+ case ISD::READCYCLECOUNTER: {
+ // The low part of the cycle counter is in CYCLES, the high part in
+ // CYCLES2. Reading CYCLES will latch the value of CYCLES2, so we must read
+ // CYCLES2 last.
+ SDValue TheChain = N->getOperand(0);
+ SDValue lo = DAG.getCopyFromReg(TheChain, dl, BF::CYCLES, MVT::i32);
+ SDValue hi = DAG.getCopyFromReg(lo.getValue(1), dl, BF::CYCLES2, MVT::i32);
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, lo, hi));
+ // Outgoing chain. If we were to use the chain from lo instead, it would be
+ // possible to entirely eliminate the CYCLES2 read in (i32 (trunc
+ // readcyclecounter)). Unfortunately this could possibly delay the CYCLES2
+ // read beyond the next CYCLES read, leading to invalid results.
+ Results.push_back(hi.getValue(1));
+ return;
+ }
+ }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned BlackfinTargetLowering::getFunctionAlignment(const Function *F) const {
+ return 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Blackfin Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+BlackfinTargetLowering::ConstraintType
+BlackfinTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() != 1)
+ return TargetLowering::getConstraintType(Constraint);
+
+ switch (Constraint[0]) {
+ // Standard constraints
+ case 'r':
+ return C_RegisterClass;
+
+ // Blackfin-specific constraints
+ case 'a':
+ case 'd':
+ case 'z':
+ case 'D':
+ case 'W':
+ case 'e':
+ case 'b':
+ case 'v':
+ case 'f':
+ case 'c':
+ case 't':
+ case 'u':
+ case 'k':
+ case 'x':
+ case 'y':
+ case 'w':
+ return C_RegisterClass;
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'Z':
+ case 'Y':
+ return C_Register;
+ }
+
+  // Not implemented: q0-q7, qA. Use {R2} etc. instead.
+
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Return register no and class for a C_Register
+/// constraint.
+std::pair<unsigned, const TargetRegisterClass*> BlackfinTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+ typedef std::pair<unsigned, const TargetRegisterClass*> Pair;
+ using namespace BF;
+
+ if (Constraint.size() != 1)
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ switch (Constraint[0]) {
+ // Standard constraints
+ case 'r':
+ return Pair(0U, VT == MVT::i16 ? D16RegisterClass : DPRegisterClass);
+
+ // Blackfin-specific constraints
+ case 'a': return Pair(0U, PRegisterClass);
+ case 'd': return Pair(0U, DRegisterClass);
+ case 'e': return Pair(0U, AccuRegisterClass);
+ case 'A': return Pair(A0, AccuRegisterClass);
+ case 'B': return Pair(A1, AccuRegisterClass);
+ case 'b': return Pair(0U, IRegisterClass);
+ case 'v': return Pair(0U, BRegisterClass);
+ case 'f': return Pair(0U, MRegisterClass);
+ case 'C': return Pair(CC, JustCCRegisterClass);
+ case 'x': return Pair(0U, GRRegisterClass);
+ case 'w': return Pair(0U, ALLRegisterClass);
+ case 'Z': return Pair(P3, PRegisterClass);
+ case 'Y': return Pair(P1, PRegisterClass);
+ }
+
+  // Not implemented: q0-q7, qA. Use {R2} etc. instead.
+  // Constraints z, D, W, c, t, u, k, and y use non-existent register classes;
+  // defer to getRegClassForInlineAsmConstraint().
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+std::vector<unsigned> BlackfinTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+ using namespace BF;
+
+ if (Constraint.size() != 1)
+ return std::vector<unsigned>();
+
+ switch (Constraint[0]) {
+ case 'z': return make_vector<unsigned>(P0, P1, P2, 0);
+ case 'D': return make_vector<unsigned>(R0, R2, R4, R6, 0);
+ case 'W': return make_vector<unsigned>(R1, R3, R5, R7, 0);
+ case 'c': return make_vector<unsigned>(I0, I1, I2, I3,
+ B0, B1, B2, B3,
+ L0, L1, L2, L3, 0);
+ case 't': return make_vector<unsigned>(LT0, LT1, 0);
+ case 'u': return make_vector<unsigned>(LB0, LB1, 0);
+ case 'k': return make_vector<unsigned>(LC0, LC1, 0);
+ case 'y': return make_vector<unsigned>(RETS, RETN, RETI, RETX, RETE,
+ ASTAT, SEQSTAT, USP, 0);
+ }
+
+ return std::vector<unsigned>();
+}
+
+bool BlackfinTargetLowering::
+isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The Blackfin target isn't yet aware of offsets.
+ return false;
+}
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
new file mode 100644
index 0000000..cdbc7d2
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -0,0 +1,81 @@
+//===- BlackfinISelLowering.h - Blackfin DAG Lowering Interface -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Blackfin uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFIN_ISELLOWERING_H
+#define BLACKFIN_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "Blackfin.h"
+
+namespace llvm {
+
+ namespace BFISD {
+ enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ CALL, // A call instruction.
+ RET_FLAG, // Return with a flag operand.
+ Wrapper // Address wrapper
+ };
+ }
+
+ class BlackfinTargetLowering : public TargetLowering {
+ int VarArgsFrameOffset; // Frame offset to start of varargs area.
+ public:
+ BlackfinTargetLowering(TargetMachine &TM);
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+ virtual void ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG);
+
+ int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const;
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ const char *getTargetNodeName(unsigned Opcode) const;
+ unsigned getFunctionAlignment(const Function *F) const;
+
+ private:
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerADDE(SDValue Op, SelectionDAG &DAG);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+ };
+} // end namespace llvm
+
+#endif // BLACKFIN_ISELLOWERING_H
diff --git a/lib/Target/Blackfin/BlackfinInstrFormats.td b/lib/Target/Blackfin/BlackfinInstrFormats.td
new file mode 100644
index 0000000..d8e6e25
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrFormats.td
@@ -0,0 +1,34 @@
+//===--- BlackfinInstrFormats.td ---------------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+class InstBfin<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<32> Inst;
+
+ let Namespace = "BF";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+}
+
+// Single-word (16-bit) instructions
+class F1<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstBfin<outs, ins, asmstr, pattern> {
+}
+
+// Double-word (32-bit) instructions
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstBfin<outs, ins, asmstr, pattern> {
+}
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
new file mode 100644
index 0000000..3fd5d4d
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -0,0 +1,280 @@
+//===- BlackfinInstrInfo.cpp - Blackfin Instruction Information -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinInstrInfo.h"
+#include "BlackfinSubtarget.h"
+#include "Blackfin.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "BlackfinGenInstrInfo.inc"
+
+using namespace llvm;
+
+BlackfinInstrInfo::BlackfinInstrInfo(BlackfinSubtarget &ST)
+ : TargetInstrInfoImpl(BlackfinInsts, array_lengthof(BlackfinInsts)),
+ RI(ST, *this),
+ Subtarget(ST) {}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+bool BlackfinInstrInfo::isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg,
+ unsigned &DstReg,
+ unsigned &SrcSR,
+ unsigned &DstSR) const {
+ SrcSR = DstSR = 0; // No sub-registers.
+ switch (MI.getOpcode()) {
+ case BF::MOVE:
+ case BF::MOVE_ncccc:
+ case BF::MOVE_ccncc:
+ case BF::MOVECC_zext:
+ case BF::MOVECC_nz:
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ case BF::SLL16i:
+    if (MI.getOperand(2).getImm() != 0)
+ return false;
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
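+/// For example, "%reg = LOAD32fi <fi#1>, 0" (illustrative syntax) yields
+/// FrameIndex = 1 and returns %reg.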
+unsigned BlackfinInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case BF::LOAD32fi:
+ case BF::LOAD16fi:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stored stack slot.  If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned BlackfinInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case BF::STORE32fi:
+ case BF::STORE16fi:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned BlackfinInstrInfo::
+InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc operand
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Branch conditions can have at most one component!");
+
+ if (Cond.empty()) {
+    // Unconditional branch.
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(BF::JUMPa)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ llvm_unreachable("Implement conditional branches!");
+}
+
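+// inClass - Return true if Reg, of register class RC, is a member of the
+// class Test: a physical Reg is checked with Test.contains(Reg), while a
+// virtual register is judged by its register class RC instead.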
+static bool inClass(const TargetRegisterClass &Test,
+ unsigned Reg,
+ const TargetRegisterClass *RC) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Test.contains(Reg);
+ else
+    return &Test == RC || Test.hasSubClass(RC);
+}
+
+bool BlackfinInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ if (inClass(BF::ALLRegClass, DestReg, DestRC) &&
+ inClass(BF::ALLRegClass, SrcReg, SrcRC)) {
+ BuildMI(MBB, I, dl, get(BF::MOVE), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ if (inClass(BF::D16RegClass, DestReg, DestRC) &&
+ inClass(BF::D16RegClass, SrcReg, SrcRC)) {
+ BuildMI(MBB, I, dl, get(BF::SLL16i), DestReg).addReg(SrcReg).addImm(0);
+ return true;
+ }
+
+ if (inClass(BF::AnyCCRegClass, SrcReg, SrcRC) &&
+ inClass(BF::DRegClass, DestReg, DestRC)) {
+ if (inClass(BF::NotCCRegClass, SrcReg, SrcRC)) {
+ BuildMI(MBB, I, dl, get(BF::MOVENCC_z), DestReg).addReg(SrcReg);
+ BuildMI(MBB, I, dl, get(BF::BITTGL), DestReg).addReg(DestReg).addImm(0);
+ } else {
+ BuildMI(MBB, I, dl, get(BF::MOVECC_zext), DestReg).addReg(SrcReg);
+ }
+ return true;
+ }
+
+ if (inClass(BF::AnyCCRegClass, DestReg, DestRC) &&
+ inClass(BF::DRegClass, SrcReg, SrcRC)) {
+ if (inClass(BF::NotCCRegClass, DestReg, DestRC))
+ BuildMI(MBB, I, dl, get(BF::SETEQri_not), DestReg).addReg(SrcReg);
+ else
+ BuildMI(MBB, I, dl, get(BF::MOVECC_nz), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ if (inClass(BF::NotCCRegClass, DestReg, DestRC) &&
+ inClass(BF::JustCCRegClass, SrcReg, SrcRC)) {
+ BuildMI(MBB, I, dl, get(BF::MOVE_ncccc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ if (inClass(BF::JustCCRegClass, DestReg, DestRC) &&
+ inClass(BF::NotCCRegClass, SrcReg, SrcRC)) {
+ BuildMI(MBB, I, dl, get(BF::MOVE_ccncc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ llvm_unreachable((std::string("Bad regclasses for reg-to-reg copy: ")+
+ SrcRC->getName() + " -> " + DestRC->getName()).c_str());
+ return false;
+}
+
+void
+BlackfinInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg,
+ bool isKill,
+ int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = I != MBB.end() ?
+ I->getDebugLoc() : DebugLoc::getUnknownLoc();
+
+ if (inClass(BF::DPRegClass, SrcReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::STORE32fi))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ if (inClass(BF::D16RegClass, SrcReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::STORE16fi))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ if (inClass(BF::AnyCCRegClass, SrcReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::STORE8fi))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ llvm_unreachable((std::string("Cannot store regclass to stack slot: ")+
+ RC->getName()).c_str());
+}
+
+void BlackfinInstrInfo::
+storeRegToAddr(MachineFunction &MF,
+ unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ llvm_unreachable("storeRegToAddr not implemented");
+}
+
+void
+BlackfinInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ int FI,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = I != MBB.end() ?
+ I->getDebugLoc() : DebugLoc::getUnknownLoc();
+ if (inClass(BF::DPRegClass, DestReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::LOAD32fi), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ if (inClass(BF::D16RegClass, DestReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::LOAD16fi), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ if (inClass(BF::AnyCCRegClass, DestReg, RC)) {
+ BuildMI(MBB, I, DL, get(BF::LOAD8fi), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0);
+ return;
+ }
+
+ llvm_unreachable("Cannot load regclass from stack slot");
+}
+
+void BlackfinInstrInfo::
+loadRegFromAddr(MachineFunction &MF,
+ unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ llvm_unreachable("loadRegFromAddr not implemented");
+}
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.h b/lib/Target/Blackfin/BlackfinInstrInfo.h
new file mode 100644
index 0000000..ea3429c
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.h
@@ -0,0 +1,80 @@
+//===- BlackfinInstrInfo.h - Blackfin Instruction Information ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFININSTRUCTIONINFO_H
+#define BLACKFININSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "BlackfinRegisterInfo.h"
+
+namespace llvm {
+
+ class BlackfinInstrInfo : public TargetInstrInfoImpl {
+ const BlackfinRegisterInfo RI;
+ const BlackfinSubtarget& Subtarget;
+ public:
+ explicit BlackfinInstrInfo(BlackfinSubtarget &ST);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ virtual const BlackfinRegisterInfo &getRegisterInfo() const { return RI; }
+
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ virtual unsigned
+ InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF,
+ unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td
new file mode 100644
index 0000000..934b188
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.td
@@ -0,0 +1,886 @@
+//===- BlackfinInstrInfo.td - Target Description for Blackfin Target ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Blackfin instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "BlackfinInstrFormats.td"
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_BfinCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_BfinCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def BfinCallseqStart : SDNode<"ISD::CALLSEQ_START", SDT_BfinCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def BfinCallseqEnd : SDNode<"ISD::CALLSEQ_END", SDT_BfinCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_BfinCall : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def BfinCall : SDNode<"BFISD::CALL", SDT_BfinCall,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def BfinRet: SDNode<"BFISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def BfinWrapper: SDNode<"BFISD::Wrapper", SDTIntUnaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Transformations
+//===----------------------------------------------------------------------===//
+
+def trailingZeros_xform : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(),
+ MVT::i32);
+}]>;
+
+def trailingOnes_xform : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(),
+ MVT::i32);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((unsigned short)N->getZExtValue(), MVT::i16);
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 16, MVT::i16);
+}]>;
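+
+// For example, an immediate of 0x12345678 splits as LO16 = 0x5678 and
+// HI16 = 0x1234.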
+
+//===----------------------------------------------------------------------===//
+// Immediates
+//===----------------------------------------------------------------------===//
+
+def imm3 : PatLeaf<(imm), [{return isInt<3>(N->getSExtValue());}]>;
+def uimm3 : PatLeaf<(imm), [{return isUint<3>(N->getZExtValue());}]>;
+def uimm4 : PatLeaf<(imm), [{return isUint<4>(N->getZExtValue());}]>;
+def uimm5 : PatLeaf<(imm), [{return isUint<5>(N->getZExtValue());}]>;
+
+def uimm5m2 : PatLeaf<(imm), [{
+ uint64_t value = N->getZExtValue();
+ return value % 2 == 0 && isUint<5>(value);
+}]>;
+
+def uimm6m4 : PatLeaf<(imm), [{
+ uint64_t value = N->getZExtValue();
+ return value % 4 == 0 && isUint<6>(value);
+}]>;
+
+def imm7 : PatLeaf<(imm), [{return isInt<7>(N->getSExtValue());}]>;
+def imm16 : PatLeaf<(imm), [{return isInt<16>(N->getSExtValue());}]>;
+def uimm16 : PatLeaf<(imm), [{return isUint<16>(N->getZExtValue());}]>;
+
+def ximm16 : PatLeaf<(imm), [{
+ int64_t value = N->getSExtValue();
+ return value < (1<<16) && value >= -(1<<15);
+}]>;
+
+def imm17m2 : PatLeaf<(imm), [{
+ int64_t value = N->getSExtValue();
+ return value % 2 == 0 && isInt<17>(value);
+}]>;
+
+def imm18m4 : PatLeaf<(imm), [{
+ int64_t value = N->getSExtValue();
+ return value % 4 == 0 && isInt<18>(value);
+}]>;
+
+// 32-bit bitmask transformed to a bit number
+def uimm5mask : Operand<i32>, PatLeaf<(imm), [{
+ return isPowerOf2_32(N->getZExtValue());
+}], trailingZeros_xform>;
+
+// 32-bit inverse bitmask transformed to a bit number
+def uimm5imask : Operand<i32>, PatLeaf<(imm), [{
+ return isPowerOf2_32(~N->getZExtValue());
+}], trailingOnes_xform>;
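+
+// For example, the mask 0x00000004 matches uimm5mask and becomes bit number 2;
+// the inverse mask 0xfffffffb matches uimm5imask and also becomes 2.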
+
+//===----------------------------------------------------------------------===//
+// Operands
+//===----------------------------------------------------------------------===//
+
+def calltarget : Operand<iPTR>;
+
+def brtarget : Operand<OtherVT>;
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+ let PrintMethod = "printMemoryOperand";
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstBfin<outs, ins, asmstr, pattern>;
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ "${:comment}ADJCALLSTACKDOWN $amt",
+ [(BfinCallseqStart timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "${:comment}ADJCALLSTACKUP $amt1 $amt2",
+ [(BfinCallseqEnd timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Table C-9. Program Flow Control Instructions
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, isTerminator = 1 in {
+
+let isIndirectBranch = 1 in
+def JUMPp : F1<(outs), (ins P:$target),
+ "JUMP ($target);",
+ [(brind P:$target)]>;
+
+// TODO JUMP (PC-P)
+
+// NOTE: assembler chooses between JUMP.S and JUMP.L
+def JUMPa : F1<(outs), (ins brtarget:$target),
+ "jump $target;",
+ [(br bb:$target)]>;
+
+def JUMPcc : F1<(outs), (ins AnyCC:$cc, brtarget:$target),
+ "if $cc jump $target;",
+ [(brcond AnyCC:$cc, bb:$target)]>;
+}
+
+let isCall = 1,
+ Defs = [R0, R1, R2, R3, P0, P1, P2, LB0, LB1, LC0, LC1, RETS, ASTAT] in {
+def CALLa: F1<(outs), (ins calltarget:$func, variable_ops),
+ "call $func;", []>;
+def CALLp: F1<(outs), (ins P:$func, variable_ops),
+ "call ($func);", [(BfinCall P:$func)]>;
+}
+
+let isReturn = 1,
+ isTerminator = 1,
+ Uses = [RETS] in
+def RTS: F1<(outs), (ins), "rts;", [(BfinRet)]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-10. Load / Store Instructions
+//===----------------------------------------------------------------------===//
+
+// Immediate constant loads
+
+// sext immediate, i32 D/P regs
+def LOADimm7: F1<(outs DP:$dst), (ins i32imm:$src),
+ "$dst = $src (x);",
+ [(set DP:$dst, imm7:$src)]>;
+
+// zext immediate, i32 reg groups 0-3
+def LOADuimm16: F2<(outs GR:$dst), (ins i32imm:$src),
+ "$dst = $src (z);",
+ [(set GR:$dst, uimm16:$src)]>;
+
+// sext immediate, i32 reg groups 0-3
+def LOADimm16: F2<(outs GR:$dst), (ins i32imm:$src),
+ "$dst = $src (x);",
+ [(set GR:$dst, imm16:$src)]>;
+
+// Pseudo-instruction for loading a general 32-bit constant.
+def LOAD32imm: Pseudo<(outs GR:$dst), (ins i32imm:$src),
+ "$dst.h = ($src >> 16); $dst.l = ($src & 0xffff);",
+ [(set GR:$dst, imm:$src)]>;
+
+def LOAD32sym: Pseudo<(outs GR:$dst), (ins i32imm:$src),
+ "$dst.h = $src; $dst.l = $src;", []>;
+
+
+// 16-bit immediate, i16 reg groups 0-3
+def LOAD16i: F2<(outs GR16:$dst), (ins i16imm:$src),
+ "$dst = $src;", []>;
+
+def : Pat<(BfinWrapper (i32 tglobaladdr:$addr)),
+ (LOAD32sym tglobaladdr:$addr)>;
+
+def : Pat<(BfinWrapper (i32 tjumptable:$addr)),
+ (LOAD32sym tjumptable:$addr)>;
+
+// We cannot copy from GR16 to D16, and codegen wants to insert copies if we
+// emit GR16 instructions. As a hack, we use this fake instruction instead.
+def LOAD16i_d16: F2<(outs D16:$dst), (ins i16imm:$src),
+ "$dst = $src;",
+ [(set D16:$dst, ximm16:$src)]>;
+
+// Memory loads with patterns
+
+def LOAD32p: F1<(outs DP:$dst), (ins P:$ptr),
+ "$dst = [$ptr];",
+ [(set DP:$dst, (load P:$ptr))]>;
+
+// Pseudo-instruction for loading a stack slot
+def LOAD32fi: Pseudo<(outs DP:$dst), (ins MEMii:$mem),
+ "${:comment}FI $dst = [$mem];",
+ [(set DP:$dst, (load ADDRspii:$mem))]>;
+
+// Note: Expands to multiple insns
+def LOAD16fi: Pseudo<(outs D16:$dst), (ins MEMii:$mem),
+ "${:comment}FI $dst = [$mem];",
+ [(set D16:$dst, (load ADDRspii:$mem))]>;
+
+// Pseudo-instruction for loading a stack slot, used for AnyCC regs.
+// Replaced with Load D + CC=D
+def LOAD8fi: Pseudo<(outs AnyCC:$dst), (ins MEMii:$mem),
+ "${:comment}FI $dst = B[$mem];",
+ [(set AnyCC:$dst, (load ADDRspii:$mem))]>;
+
+def LOAD32p_uimm6m4: F1<(outs DP:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = [$ptr + $off];",
+ [(set DP:$dst, (load (add P:$ptr, uimm6m4:$off)))]>;
+
+def LOAD32p_imm18m4: F2<(outs DP:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = [$ptr + $off];",
+ [(set DP:$dst, (load (add P:$ptr, imm18m4:$off)))]>;
+
+def LOAD32p_16z: F1<(outs D:$dst), (ins P:$ptr),
+ "$dst = W[$ptr] (z);",
+ [(set D:$dst, (zextloadi16 P:$ptr))]>;
+
+def : Pat<(i32 (extloadi16 P:$ptr)),(LOAD32p_16z P:$ptr)>;
+
+def LOAD32p_uimm5m2_16z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = w[$ptr + $off] (z);",
+ [(set D:$dst, (zextloadi16 (add P:$ptr,
+ uimm5m2:$off)))]>;
+
+def : Pat<(i32 (extloadi16 (add P:$ptr, uimm5m2:$off))),
+ (LOAD32p_uimm5m2_16z P:$ptr, imm:$off)>;
+
+def LOAD32p_imm17m2_16z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = w[$ptr + $off] (z);",
+ [(set D:$dst,
+ (zextloadi16 (add P:$ptr, imm17m2:$off)))]>;
+
+def : Pat<(i32 (extloadi16 (add P:$ptr, imm17m2:$off))),
+ (LOAD32p_imm17m2_16z P:$ptr, imm:$off)>;
+
+def LOAD32p_16s: F1<(outs D:$dst), (ins P:$ptr),
+ "$dst = w[$ptr] (x);",
+ [(set D:$dst, (sextloadi16 P:$ptr))]>;
+
+def LOAD32p_uimm5m2_16s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = w[$ptr + $off] (x);",
+ [(set D:$dst,
+ (sextloadi16 (add P:$ptr, uimm5m2:$off)))]>;
+
+def LOAD32p_imm17m2_16s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = w[$ptr + $off] (x);",
+ [(set D:$dst,
+ (sextloadi16 (add P:$ptr, imm17m2:$off)))]>;
+
+def LOAD16pi: F1<(outs D16:$dst), (ins PI:$ptr),
+ "$dst = w[$ptr];",
+ [(set D16:$dst, (load PI:$ptr))]>;
+
+def LOAD32p_8z: F1<(outs D:$dst), (ins P:$ptr),
+ "$dst = B[$ptr] (z);",
+ [(set D:$dst, (zextloadi8 P:$ptr))]>;
+
+def : Pat<(i32 (extloadi8 P:$ptr)), (LOAD32p_8z P:$ptr)>;
+def : Pat<(i16 (extloadi8 P:$ptr)),
+ (EXTRACT_SUBREG (LOAD32p_8z P:$ptr), bfin_subreg_lo16)>;
+def : Pat<(i16 (zextloadi8 P:$ptr)),
+ (EXTRACT_SUBREG (LOAD32p_8z P:$ptr), bfin_subreg_lo16)>;
+
+def LOAD32p_imm16_8z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = b[$ptr + $off] (z);",
+ [(set D:$dst, (zextloadi8 (add P:$ptr, imm16:$off)))]>;
+
+def : Pat<(i32 (extloadi8 (add P:$ptr, imm16:$off))),
+ (LOAD32p_imm16_8z P:$ptr, imm:$off)>;
+def : Pat<(i16 (extloadi8 (add P:$ptr, imm16:$off))),
+ (EXTRACT_SUBREG (LOAD32p_imm16_8z P:$ptr, imm:$off),
+ bfin_subreg_lo16)>;
+def : Pat<(i16 (zextloadi8 (add P:$ptr, imm16:$off))),
+ (EXTRACT_SUBREG (LOAD32p_imm16_8z P:$ptr, imm:$off),
+ bfin_subreg_lo16)>;
+
+def LOAD32p_8s: F1<(outs D:$dst), (ins P:$ptr),
+ "$dst = b[$ptr] (x);",
+ [(set D:$dst, (sextloadi8 P:$ptr))]>;
+
+def : Pat<(i16 (sextloadi8 P:$ptr)),
+ (EXTRACT_SUBREG (LOAD32p_8s P:$ptr), bfin_subreg_lo16)>;
+
+def LOAD32p_imm16_8s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+ "$dst = b[$ptr + $off] (x);",
+ [(set D:$dst, (sextloadi8 (add P:$ptr, imm16:$off)))]>;
+
+def : Pat<(i16 (sextloadi8 (add P:$ptr, imm16:$off))),
+ (EXTRACT_SUBREG (LOAD32p_imm16_8s P:$ptr, imm:$off),
+ bfin_subreg_lo16)>;
+
+// Memory loads without patterns
+
+let mayLoad = 1 in {
+
+multiclass LOAD_incdec<RegisterClass drc, RegisterClass prc,
+ string mem="", string suf=";"> {
+ def _inc : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr),
+ !strconcat(!subst("M", mem, "$dst = M[$ptr++]"), suf), []>;
+ def _dec : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr),
+ !strconcat(!subst("M", mem, "$dst = M[$ptr--]"), suf), []>;
+}
+multiclass LOAD_incdecpost<RegisterClass drc, RegisterClass prc,
+ string mem="", string suf=";">
+ : LOAD_incdec<drc, prc, mem, suf> {
+ def _post : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr, prc:$off),
+ !strconcat(!subst("M", mem, "$dst = M[$ptr++$off]"), suf), []>;
+}
+
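+// For example, "defm LOAD32p: LOAD_incdec<DP, P>;" below creates LOAD32p_inc
+// ("$dst = [$ptr++];") and LOAD32p_dec ("$dst = [$ptr--];").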
+defm LOAD32p: LOAD_incdec<DP, P>;
+defm LOAD32i: LOAD_incdec<D, I>;
+defm LOAD8z32p: LOAD_incdec<D, P, "b", " (z);">;
+defm LOAD8s32p: LOAD_incdec<D, P, "b", " (x);">;
+defm LOADhi: LOAD_incdec<D16, I, "w">;
+defm LOAD16z32p: LOAD_incdecpost<D, P, "w", " (z);">;
+defm LOAD16s32p: LOAD_incdecpost<D, P, "w", " (x);">;
+
+def LOAD32p_post: F1<(outs D:$dst, P:$ptr_wb), (ins P:$ptr, P:$off),
+ "$dst = [$ptr ++ $off];", []>;
+
+// Note: $fp MUST be FP
+def LOAD32fp_nimm7m4: F1<(outs DP:$dst), (ins P:$fp, i32imm:$off),
+ "$dst = [$fp - $off];", []>;
+
+def LOAD32i: F1<(outs D:$dst), (ins I:$ptr),
+ "$dst = [$ptr];", []>;
+def LOAD32i_post: F1<(outs D:$dst, I:$ptr_wb), (ins I:$ptr, M:$off),
+ "$dst = [$ptr ++ $off];", []>;
+
+def LOADhp_post: F1<(outs D16:$dst, P:$ptr_wb), (ins P:$ptr, P:$off),
+ "$dst = w[$ptr ++ $off];", []>;
+
+}
+
+// Memory stores with patterns
+def STORE32p: F1<(outs), (ins DP:$val, P:$ptr),
+ "[$ptr] = $val;",
+ [(store DP:$val, P:$ptr)]>;
+
+// Pseudo-instructions for storing to a stack slot
+def STORE32fi: Pseudo<(outs), (ins DP:$val, MEMii:$mem),
+ "${:comment}FI [$mem] = $val;",
+ [(store DP:$val, ADDRspii:$mem)]>;
+
+// Note: This stack-storing pseudo-instruction is expanded to multiple insns
+def STORE16fi: Pseudo<(outs), (ins D16:$val, MEMii:$mem),
+ "${:comment}FI [$mem] = $val;",
+ [(store D16:$val, ADDRspii:$mem)]>;
+
+// Pseudo-instructions for storing AnyCC register to a stack slot.
+// Replaced with D=CC + STORE byte
+def STORE8fi: Pseudo<(outs), (ins AnyCC:$val, MEMii:$mem),
+ "${:comment}FI b[$mem] = $val;",
+ [(store AnyCC:$val, ADDRspii:$mem)]>;
+
+def STORE32p_uimm6m4: F1<(outs), (ins DP:$val, P:$ptr, i32imm:$off),
+ "[$ptr + $off] = $val;",
+ [(store DP:$val, (add P:$ptr, uimm6m4:$off))]>;
+
+def STORE32p_imm18m4: F1<(outs), (ins DP:$val, P:$ptr, i32imm:$off),
+ "[$ptr + $off] = $val;",
+ [(store DP:$val, (add P:$ptr, imm18m4:$off))]>;
+
+def STORE16pi: F1<(outs), (ins D16:$val, PI:$ptr),
+ "w[$ptr] = $val;",
+ [(store D16:$val, PI:$ptr)]>;
+
+def STORE8p: F1<(outs), (ins D:$val, P:$ptr),
+ "b[$ptr] = $val;",
+ [(truncstorei8 D:$val, P:$ptr)]>;
+
+def STORE8p_imm16: F1<(outs), (ins D:$val, P:$ptr, i32imm:$off),
+ "b[$ptr + $off] = $val;",
+ [(truncstorei8 D:$val, (add P:$ptr, imm16:$off))]>;
+
+let Constraints = "$ptr = $ptr_wb" in {
+
+multiclass STORE_incdec<RegisterClass drc, RegisterClass prc,
+ int off=4, string pre=""> {
+ def _inc : F1<(outs prc:$ptr_wb), (ins drc:$val, prc:$ptr),
+ !strconcat(pre, "[$ptr++] = $val;"),
+ [(set prc:$ptr_wb, (post_store drc:$val, prc:$ptr, off))]>;
+ def _dec : F1<(outs prc:$ptr_wb), (ins drc:$val, prc:$ptr),
+ !strconcat(pre, "[$ptr--] = $val;"),
+ [(set prc:$ptr_wb, (post_store drc:$val, prc:$ptr,
+ (ineg off)))]>;
+}
+
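+// For example, "defm STORE8p: STORE_incdec<D, P, 1, "b">;" below creates
+// STORE8p_inc ("b[$ptr++] = $val;", a post_store with offset 1) and
+// STORE8p_dec (offset -1).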
+defm STORE32p: STORE_incdec<DP, P>;
+defm STORE16i: STORE_incdec<D16, I, 2, "w">;
+defm STORE8p: STORE_incdec<D, P, 1, "b">;
+
+def STORE32p_post: F1<(outs P:$ptr_wb), (ins D:$val, P:$ptr, P:$off),
+ "[$ptr ++ $off] = $val;",
+ [(set P:$ptr_wb, (post_store D:$val, P:$ptr, P:$off))]>;
+
+def STORE16p_post: F1<(outs P:$ptr_wb), (ins D16:$val, P:$ptr, P:$off),
+ "w[$ptr ++ $off] = $val;",
+ [(set P:$ptr_wb, (post_store D16:$val, P:$ptr, P:$off))]>;
+}
+
+// Memory stores without patterns
+
+let mayStore = 1 in {
+
+// Note: only works for $fp == FP
+def STORE32fp_nimm7m4: F1<(outs), (ins DP:$val, P:$fp, i32imm:$off),
+ "[$fp - $off] = $val;", []>;
+
+def STORE32i: F1<(outs), (ins D:$val, I:$ptr),
+ "[$ptr] = $val;", []>;
+
+def STORE32i_inc: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr),
+ "[$ptr++] = $val;", []>;
+
+def STORE32i_dec: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr),
+ "[$ptr--] = $val;", []>;
+
+def STORE32i_post: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr, M:$off),
+ "[$ptr ++ $off] = $val;", []>;
+}
+
+def : Pat<(truncstorei16 D:$val, PI:$ptr),
+ (STORE16pi (EXTRACT_SUBREG (COPY_TO_REGCLASS D:$val, D),
+ bfin_subreg_lo16), PI:$ptr)>;
+
+def : Pat<(truncstorei16 (srl D:$val, (i16 16)), PI:$ptr),
+ (STORE16pi (EXTRACT_SUBREG (COPY_TO_REGCLASS D:$val, D),
+ bfin_subreg_hi16), PI:$ptr)>;
+
+def : Pat<(truncstorei8 D16L:$val, P:$ptr),
+ (STORE8p (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS D16L:$val, D16L),
+ bfin_subreg_lo16),
+ P:$ptr)>;
+
+//===----------------------------------------------------------------------===//
+// Table C-11. Move Instructions.
+//===----------------------------------------------------------------------===//
+
+def MOVE: F1<(outs ALL:$dst), (ins ALL:$src),
+ "$dst = $src;",
+ []>;
+
+let isTwoAddress = 1 in
+def MOVEcc: F1<(outs DP:$dst), (ins DP:$src1, DP:$src2, AnyCC:$cc),
+ "if $cc $dst = $src2;",
+ [(set DP:$dst, (select AnyCC:$cc, DP:$src2, DP:$src1))]>;
+
+let Defs = [AZ, AN, AC0, V] in {
+def MOVEzext: F1<(outs D:$dst), (ins D16L:$src),
+ "$dst = $src (z);",
+ [(set D:$dst, (zext D16L:$src))]>;
+
+def MOVEsext: F1<(outs D:$dst), (ins D16L:$src),
+ "$dst = $src (x);",
+ [(set D:$dst, (sext D16L:$src))]>;
+
+def MOVEzext8: F1<(outs D:$dst), (ins D:$src),
+ "$dst = $src.b (z);",
+ [(set D:$dst, (and D:$src, 0xff))]>;
+
+def MOVEsext8: F1<(outs D:$dst), (ins D:$src),
+ "$dst = $src.b (x);",
+ [(set D:$dst, (sext_inreg D:$src, i8))]>;
+
+}
+
+def : Pat<(sext_inreg D16L:$src, i8),
+ (EXTRACT_SUBREG (MOVEsext8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ D16L:$src,
+ bfin_subreg_lo16)),
+ bfin_subreg_lo16)>;
+
+def : Pat<(sext_inreg D:$src, i16),
+ (MOVEsext (EXTRACT_SUBREG D:$src, bfin_subreg_lo16))>;
+
+def : Pat<(and D:$src, 0xffff),
+ (MOVEzext (EXTRACT_SUBREG D:$src, bfin_subreg_lo16))>;
+
+def : Pat<(i32 (anyext D16L:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS D16L:$src, D16L),
+ bfin_subreg_lo16)>;
+
+// TODO Dreg = Dreg_byte (X/Z)
+
+// TODO Accumulator moves
+
+//===----------------------------------------------------------------------===//
+// Table C-12. Stack Control Instructions
+//===----------------------------------------------------------------------===//
+
+let Uses = [SP], Defs = [SP] in {
+def PUSH: F1<(outs), (ins ALL:$src),
+ "[--sp] = $src;", []> { let mayStore = 1; }
+
+// NOTE: POP does not work for DP regs, use LOAD instead
+def POP: F1<(outs ALL:$dst), (ins),
+ "$dst = [sp++];", []> { let mayLoad = 1; }
+}
+
+// TODO: push/pop multiple
+
+def LINK: F2<(outs), (ins i32imm:$amount),
+ "link $amount;", []>;
+
+def UNLINK: F2<(outs), (ins),
+ "unlink;", []>;
+
+//===----------------------------------------------------------------------===//
+// Table C-13. Control Code Bit Management Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SETCC<PatFrag opnode, PatFrag invnode, string cond, string suf=";"> {
+ def dd : F1<(outs JustCC:$cc), (ins D:$a, D:$b),
+ !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+ [(set JustCC:$cc, (opnode D:$a, D:$b))]>;
+
+ def ri : F1<(outs JustCC:$cc), (ins DP:$a, i32imm:$b),
+ !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+ [(set JustCC:$cc, (opnode DP:$a, imm3:$b))]>;
+
+ def pp : F1<(outs JustCC:$cc), (ins P:$a, P:$b),
+ !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+ []>;
+
+ def ri_not : F1<(outs NotCC:$cc), (ins DP:$a, i32imm:$b),
+ !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+ [(set NotCC:$cc, (invnode DP:$a, imm3:$b))]>;
+}
+
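+// For example, "defm SETEQ : SETCC<seteq, setne, "==">;" creates SETEQdd,
+// SETEQri, SETEQpp, and SETEQri_not, all printed as "cc = $a == $b;".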
+defm SETEQ : SETCC<seteq, setne, "==">;
+defm SETLT : SETCC<setlt, setge, "<">;
+defm SETLE : SETCC<setle, setgt, "<=">;
+defm SETULT : SETCC<setult, setuge, "<", " (iu);">;
+defm SETULE : SETCC<setule, setugt, "<=", " (iu);">;
+
+def SETNEdd : F1<(outs NotCC:$cc), (ins D:$a, D:$b),
+ "cc = $a == $b;",
+ [(set NotCC:$cc, (setne D:$a, D:$b))]>;
+
+def : Pat<(setgt D:$a, D:$b), (SETLTdd D:$b, D:$a)>;
+def : Pat<(setge D:$a, D:$b), (SETLEdd D:$b, D:$a)>;
+def : Pat<(setugt D:$a, D:$b), (SETULTdd D:$b, D:$a)>;
+def : Pat<(setuge D:$a, D:$b), (SETULEdd D:$b, D:$a)>;
+
+// TODO: compare pointer for P-P comparisons
+// TODO: compare accumulator
+
+let Defs = [AC0] in
+def OR_ac0_cc : F1<(outs), (ins JustCC:$cc),
+ "ac0 \\|= cc;", []>;
+
+let Uses = [AC0] in
+def MOVE_cc_ac0 : F1<(outs JustCC:$cc), (ins),
+ "cc = ac0;", []>;
+
+def MOVE_ccncc : F1<(outs JustCC:$cc), (ins NotCC:$sb),
+ "cc = !cc;", []>;
+
+def MOVE_ncccc : F1<(outs NotCC:$cc), (ins JustCC:$sb),
+ "cc = !cc;", []>;
+
+def MOVECC_zext : F1<(outs D:$dst), (ins JustCC:$cc),
+ "$dst = $cc;",
+ [(set D:$dst, (zext JustCC:$cc))]>;
+
+def MOVENCC_z : F1<(outs D:$dst), (ins NotCC:$cc),
+ "$dst = cc;", []>;
+
+def MOVECC_nz : F1<(outs AnyCC:$cc), (ins D:$src),
+ "cc = $src;",
+ [(set AnyCC:$cc, (setne D:$src, 0))]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-14. Logical Operations Instructions
+//===----------------------------------------------------------------------===//
+
+def AND: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 & $src2;",
+ [(set D:$dst, (and D:$src1, D:$src2))]>;
+
+def NOT: F1<(outs D:$dst), (ins D:$src),
+ "$dst = ~$src;",
+ [(set D:$dst, (not D:$src))]>;
+
+def OR: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 \\| $src2;",
+ [(set D:$dst, (or D:$src1, D:$src2))]>;
+
+def XOR: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 ^ $src2;",
+ [(set D:$dst, (xor D:$src1, D:$src2))]>;
+
+// missing: BXOR, BXORSHIFT
+
+//===----------------------------------------------------------------------===//
+// Table C-15. Bit Operations Instructions
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+def BITCLR: F1<(outs D:$dst), (ins D:$src1, uimm5imask:$src2),
+ "bitclr($dst, $src2);",
+ [(set D:$dst, (and D:$src1, uimm5imask:$src2))]>;
+
+def BITSET: F1<(outs D:$dst), (ins D:$src1, uimm5mask:$src2),
+ "bitset($dst, $src2);",
+ [(set D:$dst, (or D:$src1, uimm5mask:$src2))]>;
+
+def BITTGL: F1<(outs D:$dst), (ins D:$src1, uimm5mask:$src2),
+ "bittgl($dst, $src2);",
+ [(set D:$dst, (xor D:$src1, uimm5mask:$src2))]>;
+}
+
+def BITTST: F1<(outs JustCC:$cc), (ins D:$src1, uimm5mask:$src2),
+ "cc = bittst($src1, $src2);",
+ [(set JustCC:$cc, (setne (and D:$src1, uimm5mask:$src2),
+ (i32 0)))]>;
+
+def NBITTST: F1<(outs JustCC:$cc), (ins D:$src1, uimm5mask:$src2),
+ "cc = !bittst($src1, $src2);",
+ [(set JustCC:$cc, (seteq (and D:$src1, uimm5mask:$src2),
+ (i32 0)))]>;
+
+// TODO: DEPOSIT, EXTRACT, BITMUX
+
+def ONES: F2<(outs D16L:$dst), (ins D:$src),
+ "$dst = ones $src;",
+ [(set D16L:$dst, (trunc (ctpop D:$src)))]>;
+
+def : Pat<(ctpop D:$src), (MOVEzext (ONES D:$src))>;
+
+//===----------------------------------------------------------------------===//
+// Table C-16. Shift / Rotate Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHIFT32<SDNode opnode, string ops> {
+ def i : F1<(outs D:$dst), (ins D:$src, i16imm:$amount),
+ !subst("XX", ops, "$dst XX= $amount;"),
+ [(set D:$dst, (opnode D:$src, (i16 uimm5:$amount)))]>;
+ def r : F1<(outs D:$dst), (ins D:$src, D:$amount),
+ !subst("XX", ops, "$dst XX= $amount;"),
+ [(set D:$dst, (opnode D:$src, D:$amount))]>;
+}
+
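+// For example, "defm SRA : SHIFT32<sra, ">>>">;" creates SRAi and SRAr, both
+// printed as "$dst >>>= $amount;".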
+let Defs = [AZ, AN, V, VS],
+ isTwoAddress = 1 in {
+defm SRA : SHIFT32<sra, ">>>">;
+defm SRL : SHIFT32<srl, ">>">;
+defm SLL : SHIFT32<shl, "<<">;
+}
+
+// TODO: automatic switching between 2-addr and 3-addr (?)
+
+let Defs = [AZ, AN, V, VS] in {
+def SLLr16: F2<(outs D:$dst), (ins D:$src, D16L:$amount),
+ "$dst = lshift $src by $amount;",
+ [(set D:$dst, (shl D:$src, D16L:$amount))]>;
+
+// Arithmetic left-shift = saturating overflow.
+def SLAr16: F2<(outs D:$dst), (ins D:$src, D16L:$amount),
+ "$dst = ashift $src by $amount;",
+ [(set D:$dst, (sra D:$src, (ineg D16L:$amount)))]>;
+
+def SRA16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+ "$dst = $src >>> $amount;",
+ [(set D16:$dst, (sra D16:$src, (i16 uimm4:$amount)))]>;
+
+def SRL16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+ "$dst = $src >> $amount;",
+ [(set D16:$dst, (srl D16:$src, (i16 uimm4:$amount)))]>;
+
+// Arithmetic left-shift = saturating overflow.
+def SLA16r: F1<(outs D16:$dst), (ins D16:$src, D16L:$amount),
+                "$dst = ashift $src by $amount;",
+ [(set D16:$dst, (srl D16:$src, (ineg D16L:$amount)))]>;
+
+def SLL16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+ "$dst = $src << $amount;",
+ [(set D16:$dst, (shl D16:$src, (i16 uimm4:$amount)))]>;
+
+def SLL16r: F1<(outs D16:$dst), (ins D16:$src, D16L:$amount),
+ "$dst = lshift $src by $amount;",
+ [(set D16:$dst, (shl D16:$src, D16L:$amount))]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Table C-17. Arithmetic Operations Instructions
+//===----------------------------------------------------------------------===//
+
+// TODO: ABS
+
+let Defs = [AZ, AN, AC0, V, VS] in {
+
+def ADD: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 + $src2;",
+ [(set D:$dst, (add D:$src1, D:$src2))]>;
+
+def ADD16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 + $src2;",
+ [(set D16:$dst, (add D16:$src1, D16:$src2))]>;
+
+let isTwoAddress = 1 in
+def ADDimm7: F1<(outs D:$dst), (ins D:$src1, i32imm:$src2),
+ "$dst += $src2;",
+ [(set D:$dst, (add D:$src1, imm7:$src2))]>;
+
+def SUB: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 - $src2;",
+ [(set D:$dst, (sub D:$src1, D:$src2))]>;
+
+def SUB16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 - $src2;",
+ [(set D16:$dst, (sub D16:$src1, D16:$src2))]>;
+
+}
+
+def : Pat<(addc D:$src1, D:$src2), (ADD D:$src1, D:$src2)>;
+def : Pat<(subc D:$src1, D:$src2), (SUB D:$src1, D:$src2)>;
+
+let Defs = [AZ, AN, V, VS] in
+def NEG: F1<(outs D:$dst), (ins D:$src),
+ "$dst = -$src;",
+ [(set D:$dst, (ineg D:$src))]>;
+
+// No pattern, it would confuse isel to have two i32 = i32+i32 patterns
+def ADDpp: F1<(outs P:$dst), (ins P:$src1, P:$src2),
+ "$dst = $src1 + $src2;", []>;
+
+let isTwoAddress = 1 in
+def ADDpp_imm7: F1<(outs P:$dst), (ins P:$src1, i32imm:$src2),
+ "$dst += $src2;", []>;
+
+let Defs = [AZ, AN, V] in
+def ADD_RND20: F2<(outs D16:$dst), (ins D:$src1, D:$src2),
+ "$dst = $src1 + $src2 (rnd20);", []>;
+
+let Defs = [V, VS] in {
+def MUL16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 * $src2 (is);",
+ [(set D16:$dst, (mul D16:$src1, D16:$src2))]>;
+
+def MULHS16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 * $src2 (ih);",
+ [(set D16:$dst, (mulhs D16:$src1, D16:$src2))]>;
+
+def MULhh32s: F2<(outs D:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 * $src2 (is);",
+ [(set D:$dst, (mul (sext D16:$src1), (sext D16:$src2)))]>;
+
+def MULhh32u: F2<(outs D:$dst), (ins D16:$src1, D16:$src2),
+ "$dst = $src1 * $src2 (is);",
+ [(set D:$dst, (mul (zext D16:$src1), (zext D16:$src2)))]>;
+}
+
+
+let isTwoAddress = 1 in
+def MUL32: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst *= $src2;",
+ [(set D:$dst, (mul D:$src1, D:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-18. External Event Management Instructions
+//===----------------------------------------------------------------------===//
+
+def IDLE : F1<(outs), (ins), "idle;", [(int_bfin_idle)]>;
+def CSYNC : F1<(outs), (ins), "csync;", [(int_bfin_csync)]>;
+def SSYNC : F1<(outs), (ins), "ssync;", [(int_bfin_ssync)]>;
+def EMUEXCPT : F1<(outs), (ins), "emuexcpt;", []>;
+def CLI : F1<(outs D:$mask), (ins), "cli $mask;", []>;
+def STI : F1<(outs), (ins D:$mask), "sti $mask;", []>;
+def RAISE : F1<(outs), (ins i32imm:$itr), "raise $itr;", []>;
+def EXCPT : F1<(outs), (ins i32imm:$exc), "excpt $exc;", []>;
+def NOP : F1<(outs), (ins), "nop;", []>;
+def MNOP : F2<(outs), (ins), "mnop;", []>;
+def ABORT : F1<(outs), (ins), "abort;", []>;
+
+//===----------------------------------------------------------------------===//
+// Table C-19. Cache Control Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Table C-20. Video Pixel Operations Instructions
+//===----------------------------------------------------------------------===//
+
+def ALIGN8 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = align8($src1, $src2);",
+ [(set D:$dst, (or (shl D:$src1, (i32 24)),
+ (srl D:$src2, (i32 8))))]>;
+
+def ALIGN16 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+ "$dst = align16($src1, $src2);",
+ [(set D:$dst, (or (shl D:$src1, (i32 16)),
+ (srl D:$src2, (i32 16))))]>;
+
+def ALIGN24 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+                "$dst = align24($src1, $src2);",
+ [(set D:$dst, (or (shl D:$src1, (i32 8)),
+ (srl D:$src2, (i32 24))))]>;
+
+def DISALGNEXCPT : F2<(outs), (ins), "disalignexcpt;", []>;
+
+// TODO: BYTEOP3P, BYTEOP16P, BYTEOP1P, BYTEOP2P, BYTEOP16M, SAA,
+// BYTEPACK, BYTEUNPACK
+
+// Table C-21. Vector Operations Instructions
+
+// Patterns
+def : Pat<(BfinCall (i32 tglobaladdr:$dst)),
+ (CALLa tglobaladdr:$dst)>;
+def : Pat<(BfinCall (i32 texternalsym:$dst)),
+ (CALLa texternalsym:$dst)>;
+
+def : Pat<(sext JustCC:$cc),
+ (NEG (MOVECC_zext JustCC:$cc))>;
+def : Pat<(anyext JustCC:$cc),
+ (MOVECC_zext JustCC:$cc)>;
+def : Pat<(i16 (zext JustCC:$cc)),
+ (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>;
+def : Pat<(i16 (sext JustCC:$cc)),
+ (EXTRACT_SUBREG (NEG (MOVECC_zext JustCC:$cc)), bfin_subreg_lo16)>;
+def : Pat<(i16 (anyext JustCC:$cc)),
+ (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>;
+
+def : Pat<(i16 (trunc D:$src)),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS D:$src, D), bfin_subreg_lo16)>;
diff --git a/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp b/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp
new file mode 100644
index 0000000..6d0f66c
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp
@@ -0,0 +1,21 @@
+//===-- BlackfinMCAsmInfo.cpp - Blackfin asm properties -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the BlackfinMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinMCAsmInfo.h"
+
+using namespace llvm;
+
+BlackfinMCAsmInfo::BlackfinMCAsmInfo(const Target &T, const StringRef &TT) {
+ GlobalPrefix = "_";
+ CommentString = "//";
+}
diff --git a/lib/Target/Blackfin/BlackfinMCAsmInfo.h b/lib/Target/Blackfin/BlackfinMCAsmInfo.h
new file mode 100644
index 0000000..0efc295
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- BlackfinMCAsmInfo.h - Blackfin asm properties ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the BlackfinMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINTARGETASMINFO_H
+#define BLACKFINTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ struct BlackfinMCAsmInfo : public MCAsmInfo {
+ explicit BlackfinMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
new file mode 100644
index 0000000..8c0a58a
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -0,0 +1,479 @@
+//===- BlackfinRegisterInfo.cpp - Blackfin Register Information -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "BlackfinRegisterInfo.h"
+#include "BlackfinSubtarget.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+BlackfinRegisterInfo::BlackfinRegisterInfo(BlackfinSubtarget &st,
+ const TargetInstrInfo &tii)
+ : BlackfinGenRegisterInfo(BF::ADJCALLSTACKDOWN, BF::ADJCALLSTACKUP),
+ Subtarget(st),
+ TII(tii) {}
+
+const unsigned*
+BlackfinRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ using namespace BF;
+ static const unsigned CalleeSavedRegs[] = {
+ FP,
+ R4, R5, R6, R7,
+ P3, P4, P5,
+ 0 };
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const *BlackfinRegisterInfo::
+getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ using namespace BF;
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &PRegClass,
+ &DRegClass, &DRegClass, &DRegClass, &DRegClass,
+ &PRegClass, &PRegClass, &PRegClass,
+ 0 };
+ return CalleeSavedRegClasses;
+}
+
+BitVector
+BlackfinRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ using namespace BF;
+ BitVector Reserved(getNumRegs());
+ Reserved.set(AZ);
+ Reserved.set(AN);
+ Reserved.set(AQ);
+ Reserved.set(AC0);
+ Reserved.set(AC1);
+ Reserved.set(AV0);
+ Reserved.set(AV0S);
+ Reserved.set(AV1);
+ Reserved.set(AV1S);
+ Reserved.set(V);
+ Reserved.set(VS);
+ Reserved.set(CYCLES).set(CYCLES2);
+ Reserved.set(L0);
+ Reserved.set(L1);
+ Reserved.set(L2);
+ Reserved.set(L3);
+ Reserved.set(SP);
+ Reserved.set(RETS);
+ if (hasFP(MF))
+ Reserved.set(FP);
+ return Reserved;
+}
+
+const TargetRegisterClass*
+BlackfinRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const {
+ assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+ // Pick the smallest register class of the right type that contains
+ // this physreg.
+ const TargetRegisterClass* BestRC = 0;
+ for (regclass_iterator I = regclass_begin(), E = regclass_end();
+ I != E; ++I) {
+ const TargetRegisterClass* RC = *I;
+ if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+ (!BestRC || RC->getNumRegs() < BestRC->getNumRegs()))
+ BestRC = RC;
+ }
+
+ assert(BestRC && "Couldn't find the register class");
+ return BestRC;
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function makes calls, has variable
+// sized allocas, or if frame pointer elimination is disabled.
+bool BlackfinRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasCalls() || MFI->hasVarSizedObjects();
+}
+
+bool BlackfinRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+// Emit instructions to add delta to a D or P register. ScratchReg must be in
+// the same register class as Reg (a P register in all current uses).
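+// For example, adjustRegister(MBB, I, DL, BF::SP, BF::P1, -64) emits a single
+// "SP += -64" because the delta fits in 7 signed bits; larger deltas are
+// first materialized into the scratch register and then added.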
+void BlackfinRegisterInfo::adjustRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL,
+ unsigned Reg,
+ unsigned ScratchReg,
+ int delta) const {
+ if (!delta)
+ return;
+ if (isInt<7>(delta)) {
+ BuildMI(MBB, I, DL, TII.get(BF::ADDpp_imm7), Reg)
+ .addReg(Reg) // No kill on two-addr operand
+ .addImm(delta);
+ return;
+ }
+
+ // We must load delta into ScratchReg and add that.
+ loadConstant(MBB, I, DL, ScratchReg, delta);
+ if (BF::PRegClass.contains(Reg)) {
+ assert(BF::PRegClass.contains(ScratchReg) &&
+ "ScratchReg must be a P register");
+ BuildMI(MBB, I, DL, TII.get(BF::ADDpp), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addReg(ScratchReg, RegState::Kill);
+ } else {
+ assert(BF::DRegClass.contains(Reg) && "Reg must be a D or P register");
+ assert(BF::DRegClass.contains(ScratchReg) &&
+ "ScratchReg must be a D register");
+ BuildMI(MBB, I, DL, TII.get(BF::ADD), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addReg(ScratchReg, RegState::Kill);
+ }
+}
+
+// Emit instructions to load a constant into a D or P register.
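+// For example, the value 0x12345678 fits none of the single-instruction
+// forms, so 0x1234 is loaded into the hi16 subregister and 0x5678 into lo16.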
+void BlackfinRegisterInfo::loadConstant(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL,
+ unsigned Reg,
+ int value) const {
+ if (isInt<7>(value)) {
+ BuildMI(MBB, I, DL, TII.get(BF::LOADimm7), Reg).addImm(value);
+ return;
+ }
+
+ if (isUint<16>(value)) {
+ BuildMI(MBB, I, DL, TII.get(BF::LOADuimm16), Reg).addImm(value);
+ return;
+ }
+
+ if (isInt<16>(value)) {
+ BuildMI(MBB, I, DL, TII.get(BF::LOADimm16), Reg).addImm(value);
+ return;
+ }
+
+ // We must split into halves
+ BuildMI(MBB, I, DL,
+ TII.get(BF::LOAD16i), getSubReg(Reg, bfin_subreg_hi16))
+ .addImm((value >> 16) & 0xffff)
+ .addReg(Reg, RegState::ImplicitDefine);
+ BuildMI(MBB, I, DL,
+ TII.get(BF::LOAD16i), getSubReg(Reg, bfin_subreg_lo16))
+ .addImm(value & 0xffff)
+ .addReg(Reg, RegState::ImplicitKill)
+ .addReg(Reg, RegState::ImplicitDefine);
+}
+
+void BlackfinRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ int64_t Amount = I->getOperand(0).getImm();
+ if (Amount != 0) {
+ assert(Amount%4 == 0 && "Unaligned call frame size");
+ if (I->getOpcode() == BF::ADJCALLSTACKDOWN) {
+ adjustRegister(MBB, I, I->getDebugLoc(), BF::SP, BF::P1, -Amount);
+ } else {
+ assert(I->getOpcode() == BF::ADJCALLSTACKUP &&
+ "Unknown call frame pseudo instruction");
+ adjustRegister(MBB, I, I->getDebugLoc(), BF::SP, BF::P1, Amount);
+ }
+ }
+ }
+ MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' register. Try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
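+/// For example, eliminateFrameIndex below uses this to obtain a P register
+/// for address formation and a D register when spilling or restoring CC.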
+static unsigned findScratchRegister(MachineBasicBlock::iterator II,
+ RegScavenger *RS,
+ const TargetRegisterClass *RC,
+ int SPAdj) {
+ assert(RS && "Register scavenging must be on");
+ unsigned Reg = RS->FindUnusedReg(RC);
+ if (Reg == 0)
+ Reg = RS->scavengeRegister(RC, II, SPAdj);
+ return Reg;
+}
+
+unsigned
+BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned FIPos;
+ for (FIPos=0; !MI.getOperand(FIPos).isFI(); ++FIPos) {
+ assert(FIPos < MI.getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+ }
+ int FrameIndex = MI.getOperand(FIPos).getIndex();
+ assert(FIPos+1 < MI.getNumOperands() && MI.getOperand(FIPos+1).isImm());
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex)
+ + MI.getOperand(FIPos+1).getImm();
+ unsigned BaseReg = BF::FP;
+ if (hasFP(MF)) {
+ assert(SPAdj==0 && "Unexpected SP adjust in function with frame pointer");
+ } else {
+ BaseReg = BF::SP;
+ Offset += MF.getFrameInfo()->getStackSize() + SPAdj;
+ }
+
+ bool isStore = false;
+
+ switch (MI.getOpcode()) {
+  case BF::STORE32fi:
+    isStore = true;
+    // Fall through.
+ case BF::LOAD32fi: {
+ assert(Offset%4 == 0 && "Unaligned i32 stack access");
+ assert(FIPos==1 && "Bad frame index operand");
+ MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+ MI.getOperand(FIPos+1).setImm(Offset);
+ if (isUint<6>(Offset)) {
+ MI.setDesc(TII.get(isStore
+ ? BF::STORE32p_uimm6m4
+ : BF::LOAD32p_uimm6m4));
+ return 0;
+ }
+ if (BaseReg == BF::FP && isUint<7>(-Offset)) {
+ MI.setDesc(TII.get(isStore
+ ? BF::STORE32fp_nimm7m4
+ : BF::LOAD32fp_nimm7m4));
+ MI.getOperand(FIPos+1).setImm(-Offset);
+ return 0;
+ }
+ if (isInt<18>(Offset)) {
+ MI.setDesc(TII.get(isStore
+ ? BF::STORE32p_imm18m4
+ : BF::LOAD32p_imm18m4));
+ return 0;
+ }
+ // Use RegScavenger to calculate proper offset...
+ MI.dump();
+ llvm_unreachable("Stack frame offset too big");
+ break;
+ }
+ case BF::ADDpp: {
+ assert(MI.getOperand(0).isReg() && "ADD instruction needs a register");
+ unsigned DestReg = MI.getOperand(0).getReg();
+ // We need to produce a stack offset in a P register. We emit:
+ // P0 = offset;
+ // P0 = BR + P0;
+ assert(FIPos==1 && "Bad frame index operand");
+ loadConstant(MBB, II, DL, DestReg, Offset);
+ MI.getOperand(1).ChangeToRegister(DestReg, false, false, true);
+ MI.getOperand(2).ChangeToRegister(BaseReg, false);
+ break;
+ }
+  case BF::STORE16fi:
+    isStore = true;
+    // Fall through.
+ case BF::LOAD16fi: {
+ assert(Offset%2 == 0 && "Unaligned i16 stack access");
+ assert(FIPos==1 && "Bad frame index operand");
+ // We need a P register to use as an address
+ unsigned ScratchReg = findScratchRegister(II, RS, &BF::PRegClass, SPAdj);
+ assert(ScratchReg && "Could not scavenge register");
+ loadConstant(MBB, II, DL, ScratchReg, Offset);
+ BuildMI(MBB, II, DL, TII.get(BF::ADDpp), ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(BaseReg);
+ MI.setDesc(TII.get(isStore ? BF::STORE16pi : BF::LOAD16pi));
+ MI.getOperand(1).ChangeToRegister(ScratchReg, false, false, true);
+ MI.RemoveOperand(2);
+ break;
+ }
+ case BF::STORE8fi: {
+    // This is an AnyCC spill; we need a scratch register.
+ assert(FIPos==1 && "Bad frame index operand");
+ MachineOperand SpillReg = MI.getOperand(0);
+ unsigned ScratchReg = findScratchRegister(II, RS, &BF::DRegClass, SPAdj);
+ assert(ScratchReg && "Could not scavenge register");
+ if (SpillReg.getReg()==BF::NCC) {
+ BuildMI(MBB, II, DL, TII.get(BF::MOVENCC_z), ScratchReg)
+ .addOperand(SpillReg);
+ BuildMI(MBB, II, DL, TII.get(BF::BITTGL), ScratchReg)
+ .addReg(ScratchReg).addImm(0);
+ } else {
+ BuildMI(MBB, II, DL, TII.get(BF::MOVECC_zext), ScratchReg)
+ .addOperand(SpillReg);
+ }
+ // STORE D
+ MI.setDesc(TII.get(BF::STORE8p_imm16));
+ MI.getOperand(0).ChangeToRegister(ScratchReg, false, false, true);
+ MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+ MI.getOperand(FIPos+1).setImm(Offset);
+ break;
+ }
+ case BF::LOAD8fi: {
+    // This is a restore; we need a scratch register.
+ assert(FIPos==1 && "Bad frame index operand");
+ MachineOperand SpillReg = MI.getOperand(0);
+ unsigned ScratchReg = findScratchRegister(II, RS, &BF::DRegClass, SPAdj);
+ assert(ScratchReg && "Could not scavenge register");
+ MI.setDesc(TII.get(BF::LOAD32p_imm16_8z));
+ MI.getOperand(0).ChangeToRegister(ScratchReg, true);
+ MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+ MI.getOperand(FIPos+1).setImm(Offset);
+ ++II;
+ if (SpillReg.getReg()==BF::CC) {
+ // CC = D
+ BuildMI(MBB, II, DL, TII.get(BF::MOVECC_nz), BF::CC)
+ .addReg(ScratchReg, RegState::Kill);
+ } else {
+ // Restore NCC (CC = D==0)
+ BuildMI(MBB, II, DL, TII.get(BF::SETEQri_not), BF::NCC)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(0);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("Cannot eliminate frame index");
+ break;
+ }
+ return 0;
+}
+
+void BlackfinRegisterInfo::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterClass *RC = BF::DPRegisterClass;
+ if (requiresRegisterScavenging(MF)) {
+ // Reserve a slot close to SP or frame pointer.
+ RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment()));
+ }
+}
+
+void BlackfinRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+}
+
+// Emit a prologue that sets up a stack frame.
+// On function entry, R0-R2 and P0 may hold arguments.
+// R3, P1, and P2 may be used as scratch registers
+void BlackfinRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ DebugLoc dl = (MBBI != MBB.end()
+ ? MBBI->getDebugLoc()
+ : DebugLoc::getUnknownLoc());
+
+ int FrameSize = MFI->getStackSize();
+ if (FrameSize%4) {
+ FrameSize = (FrameSize+3) & ~3;
+ MFI->setStackSize(FrameSize);
+ }
+
+ if (!hasFP(MF)) {
+ assert(!MFI->hasCalls() &&
+ "FP elimination on a non-leaf function is not supported");
+ adjustRegister(MBB, MBBI, dl, BF::SP, BF::P1, -FrameSize);
+ return;
+ }
+
+ // emit a LINK instruction
+ if (FrameSize <= 0x3ffff) {
+ BuildMI(MBB, MBBI, dl, TII.get(BF::LINK)).addImm(FrameSize);
+ return;
+ }
+
+ // Frame is too big, do a manual LINK:
+ // [--SP] = RETS;
+ // [--SP] = FP;
+ // FP = SP;
+ // P1 = -FrameSize;
+ // SP = SP + P1;
+ BuildMI(MBB, MBBI, dl, TII.get(BF::PUSH))
+ .addReg(BF::RETS, RegState::Kill);
+ BuildMI(MBB, MBBI, dl, TII.get(BF::PUSH))
+ .addReg(BF::FP, RegState::Kill);
+ BuildMI(MBB, MBBI, dl, TII.get(BF::MOVE), BF::FP)
+ .addReg(BF::SP);
+ loadConstant(MBB, MBBI, dl, BF::P1, -FrameSize);
+ BuildMI(MBB, MBBI, dl, TII.get(BF::ADDpp), BF::SP)
+ .addReg(BF::SP, RegState::Kill)
+ .addReg(BF::P1, RegState::Kill);
+}
+
+void BlackfinRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ DebugLoc dl = MBBI->getDebugLoc();
+
+ int FrameSize = MFI->getStackSize();
+ assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+ if (!hasFP(MF)) {
+ assert(!MFI->hasCalls() &&
+ "FP elimination on a non-leaf function is not supported");
+ adjustRegister(MBB, MBBI, dl, BF::SP, BF::P1, FrameSize);
+ return;
+ }
+
+ // emit an UNLINK instruction
+ BuildMI(MBB, MBBI, dl, TII.get(BF::UNLINK));
+}
+
+unsigned BlackfinRegisterInfo::getRARegister() const {
+ return BF::RETS;
+}
+
+unsigned BlackfinRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? BF::FP : BF::SP;
+}
+
+unsigned BlackfinRegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("What is the exception register");
+ return 0;
+}
+
+unsigned BlackfinRegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("What is the exception handler register");
+ return 0;
+}
+
+int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ llvm_unreachable("What is the dwarf register number");
+ return -1;
+}
+
+#include "BlackfinGenRegisterInfo.inc"
+
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
new file mode 100644
index 0000000..501f504
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -0,0 +1,104 @@
+//===- BlackfinRegisterInfo.h - Blackfin Register Information --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINREGISTERINFO_H
+#define BLACKFINREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "BlackfinGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+ class BlackfinSubtarget;
+ class TargetInstrInfo;
+ class Type;
+
+ // Subregister indices, keep in sync with BlackfinRegisterInfo.td
+ enum BfinSubregIdx {
+ bfin_subreg_lo16 = 1,
+ bfin_subreg_hi16 = 2,
+ bfin_subreg_lo32 = 3
+ };
+
+ struct BlackfinRegisterInfo : public BlackfinGenRegisterInfo {
+ BlackfinSubtarget &Subtarget;
+ const TargetInstrInfo &TII;
+
+ BlackfinRegisterInfo(BlackfinSubtarget &st, const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ // getSubReg implemented by tablegen
+
+ const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const {
+ return &BF::PRegClass;
+ }
+
+ const TargetRegisterClass *getPhysicalRegisterRegClass(unsigned reg,
+ EVT VT) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ // bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ unsigned getRARegister() const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ // Utility functions
+ void adjustRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL,
+ unsigned Reg,
+ unsigned ScratchReg,
+ int delta) const;
+ void loadConstant(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL,
+ unsigned Reg,
+ int value) const;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.td b/lib/Target/Blackfin/BlackfinRegisterInfo.td
new file mode 100644
index 0000000..642d10f
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.td
@@ -0,0 +1,385 @@
+//===- BlackfinRegisterInfo.td - Blackfin Register defs ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the Blackfin register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 3-bit group and 3-bit ID numbers.
+
+class BlackfinReg<string n> : Register<n> {
+ field bits<3> Group;
+ field bits<3> Num;
+ let Namespace = "BF";
+}
+
+// Rc - 1-bit registers
+class Rc<bits<5> bitno, string n> : BlackfinReg<n> {
+ field bits<5> BitNum = bitno;
+}
+
+// Rs - 16-bit integer registers
+class Rs<bits<3> group, bits<3> num, bits<1> hi, string n> : BlackfinReg<n> {
+ let Group = group;
+ let Num = num;
+ field bits<1> High = hi;
+}
+
+// Ri - 32-bit integer registers
+class Ri<bits<3> group, bits<3> num, string n> : BlackfinReg<n> {
+ let Group = group;
+ let Num = num;
+}
+
+// Ra - 40-bit accumulator registers
+class Ra<bits<3> num, string n, list<Register> subs> : BlackfinReg<n> {
+ let SubRegs = subs;
+ let Group = 4;
+ let Num = num;
+}
+
+// Two halves of a 32-bit register
+multiclass Rss<bits<3> group, bits<3> num, string n> {
+ def H : Rs<group, num, 1, !strconcat(n, ".h")>;
+ def L : Rs<group, num, 0, !strconcat(n, ".l")>;
+}
+
+// Rii - 32-bit integer registers with subregs
+class Rii<bits<3> group, bits<3> num, string n, list<Register> subs>
+ : BlackfinReg<n> {
+ let SubRegs = subs;
+ let Group = group;
+ let Num = num;
+}
+
+// Status bits are all part of ASTAT
+def AZ : Rc<0, "az">;
+def AN : Rc<1, "an">;
+def CC : Rc<5, "cc">, DwarfRegNum<[34]>;
+def NCC : Rc<5, "!cc"> { let Aliases = [CC]; }
+def AQ : Rc<6, "aq">;
+def AC0 : Rc<12, "ac0">;
+def AC1 : Rc<13, "ac1">;
+def AV0 : Rc<16, "av0">;
+def AV0S : Rc<17, "av0s">;
+def AV1 : Rc<18, "av1">;
+def AV1S : Rc<19, "av1s">;
+def V : Rc<24, "v">;
+def VS : Rc<25, "vs">;
+// Skipped non-status bits: AC0_COPY, V_COPY, RND_MOD
+
+// Group 0: Integer registers
+defm R0 : Rss<0, 0, "r0">;
+def R0 : Rii<0, 0, "r0", [R0H, R0L]>, DwarfRegNum<[0]>;
+defm R1 : Rss<0, 1, "r1">;
+def R1 : Rii<0, 1, "r1", [R1H, R1L]>, DwarfRegNum<[1]>;
+defm R2 : Rss<0, 2, "r2">;
+def R2 : Rii<0, 2, "r2", [R2H, R2L]>, DwarfRegNum<[2]>;
+defm R3 : Rss<0, 3, "r3">;
+def R3 : Rii<0, 3, "r3", [R3H, R3L]>, DwarfRegNum<[3]>;
+defm R4 : Rss<0, 4, "r4">;
+def R4 : Rii<0, 4, "r4", [R4H, R4L]>, DwarfRegNum<[4]>;
+defm R5 : Rss<0, 5, "r5">;
+def R5 : Rii<0, 5, "r5", [R5H, R5L]>, DwarfRegNum<[5]>;
+defm R6 : Rss<0, 6, "r6">;
+def R6 : Rii<0, 6, "r6", [R6H, R6L]>, DwarfRegNum<[6]>;
+defm R7 : Rss<0, 7, "r7">;
+def R7 : Rii<0, 7, "r7", [R7H, R7L]>, DwarfRegNum<[7]>;
+
+// Group 1: Pointer registers
+defm P0 : Rss<1, 0, "p0">;
+def P0 : Rii<1, 0, "p0", [P0H, P0L]>, DwarfRegNum<[8]>;
+defm P1 : Rss<1, 1, "p1">;
+def P1 : Rii<1, 1, "p1", [P1H, P1L]>, DwarfRegNum<[9]>;
+defm P2 : Rss<1, 2, "p2">;
+def P2 : Rii<1, 2, "p2", [P2H, P2L]>, DwarfRegNum<[10]>;
+defm P3 : Rss<1, 3, "p3">;
+def P3 : Rii<1, 3, "p3", [P3H, P3L]>, DwarfRegNum<[11]>;
+defm P4 : Rss<1, 4, "p4">;
+def P4 : Rii<1, 4, "p4", [P4H, P4L]>, DwarfRegNum<[12]>;
+defm P5 : Rss<1, 5, "p5">;
+def P5 : Rii<1, 5, "p5", [P5H, P5L]>, DwarfRegNum<[13]>;
+defm SP : Rss<1, 6, "sp">;
+def SP : Rii<1, 6, "sp", [SPH, SPL]>, DwarfRegNum<[14]>;
+defm FP : Rss<1, 7, "fp">;
+def FP : Rii<1, 7, "fp", [FPH, FPL]>, DwarfRegNum<[15]>;
+
+// Group 2: Index registers
+defm I0 : Rss<2, 0, "i0">;
+def I0 : Rii<2, 0, "i0", [I0H, I0L]>, DwarfRegNum<[16]>;
+defm I1 : Rss<2, 1, "i1">;
+def I1 : Rii<2, 1, "i1", [I1H, I1L]>, DwarfRegNum<[17]>;
+defm I2 : Rss<2, 2, "i2">;
+def I2 : Rii<2, 2, "i2", [I2H, I2L]>, DwarfRegNum<[18]>;
+defm I3 : Rss<2, 3, "i3">;
+def I3 : Rii<2, 3, "i3", [I3H, I3L]>, DwarfRegNum<[19]>;
+defm M0 : Rss<2, 4, "m0">;
+def M0 : Rii<2, 4, "m0", [M0H, M0L]>, DwarfRegNum<[20]>;
+defm M1 : Rss<2, 5, "m1">;
+def M1 : Rii<2, 5, "m1", [M1H, M1L]>, DwarfRegNum<[21]>;
+defm M2 : Rss<2, 6, "m2">;
+def M2 : Rii<2, 6, "m2", [M2H, M2L]>, DwarfRegNum<[22]>;
+defm M3 : Rss<2, 7, "m3">;
+def M3 : Rii<2, 7, "m3", [M3H, M3L]>, DwarfRegNum<[23]>;
+
+// Group 3: Cyclic indexing registers
+defm B0 : Rss<3, 0, "b0">;
+def B0 : Rii<3, 0, "b0", [B0H, B0L]>, DwarfRegNum<[24]>;
+defm B1 : Rss<3, 1, "b1">;
+def B1 : Rii<3, 1, "b1", [B1H, B1L]>, DwarfRegNum<[25]>;
+defm B2 : Rss<3, 2, "b2">;
+def B2 : Rii<3, 2, "b2", [B2H, B2L]>, DwarfRegNum<[26]>;
+defm B3 : Rss<3, 3, "b3">;
+def B3 : Rii<3, 3, "b3", [B3H, B3L]>, DwarfRegNum<[27]>;
+defm L0 : Rss<3, 4, "l0">;
+def L0 : Rii<3, 4, "l0", [L0H, L0L]>, DwarfRegNum<[28]>;
+defm L1 : Rss<3, 5, "l1">;
+def L1 : Rii<3, 5, "l1", [L1H, L1L]>, DwarfRegNum<[29]>;
+defm L2 : Rss<3, 6, "l2">;
+def L2 : Rii<3, 6, "l2", [L2H, L2L]>, DwarfRegNum<[30]>;
+defm L3 : Rss<3, 7, "l3">;
+def L3 : Rii<3, 7, "l3", [L3H, L3L]>, DwarfRegNum<[31]>;
+
+// Accumulators
+def A0X : Ri <4, 0, "a0.x">;
+defm A0 : Rss<4, 1, "a0">;
+def A0W : Rii<4, 1, "a0.w", [A0H, A0L]>, DwarfRegNum<[32]>;
+def A0 : Ra <0, "a0", [A0X, A0W]>;
+
+def A1X : Ri <4, 2, "a1.x">;
+defm A1 : Rss<4, 3, "a1">;
+def A1W : Rii<4, 3, "a1.w", [A1H, A1L]>, DwarfRegNum<[33]>;
+def A1 : Ra <2, "a1", [A1X, A1W]>;
+
+def RETS : Ri<4, 7, "rets">, DwarfRegNum<[35]>;
+def RETI : Ri<7, 3, "reti">, DwarfRegNum<[36]>;
+def RETX : Ri<7, 4, "retx">, DwarfRegNum<[37]>;
+def RETN : Ri<7, 5, "retn">, DwarfRegNum<[38]>;
+def RETE : Ri<7, 6, "rete">, DwarfRegNum<[39]>;
+
+def ASTAT : Ri<4, 6, "astat">, DwarfRegNum<[40]> {
+ let SubRegs = [AZ, AN, CC, NCC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS];
+}
+
+def SEQSTAT : Ri<7, 1, "seqstat">, DwarfRegNum<[41]>;
+def USP : Ri<7, 0, "usp">, DwarfRegNum<[42]>;
+def EMUDAT : Ri<7, 7, "emudat">, DwarfRegNum<[43]>;
+def SYSCFG : Ri<7, 2, "syscfg">;
+def CYCLES : Ri<6, 6, "cycles">;
+def CYCLES2 : Ri<6, 7, "cycles2">;
+
+// Hardware loops
+def LT0 : Ri<6, 1, "lt0">, DwarfRegNum<[44]>;
+def LT1 : Ri<6, 4, "lt1">, DwarfRegNum<[45]>;
+def LC0 : Ri<6, 0, "lc0">, DwarfRegNum<[46]>;
+def LC1 : Ri<6, 3, "lc1">, DwarfRegNum<[47]>;
+def LB0 : Ri<6, 2, "lb0">, DwarfRegNum<[48]>;
+def LB1 : Ri<6, 5, "lb1">, DwarfRegNum<[49]>;
+
+// Subregs are:
+// 1: .L
+// 2: .H
+// 3: .W (low 32 bits of the 40-bit accumulator)
+// Keep in sync with enum in BlackfinRegisterInfo.h
+def bfin_subreg_lo16 : PatLeaf<(i32 1)>;
+def bfin_subreg_hi16 : PatLeaf<(i32 2)>;
+def bfin_subreg_32bit : PatLeaf<(i32 3)>;
+
+def : SubRegSet<1,
+ [R0, R1, R2, R3, R4, R5, R6, R7,
+ P0, P1, P2, P3, P4, P5, SP, FP,
+ I0, I1, I2, I3, M0, M1, M2, M3,
+ B0, B1, B2, B3, L0, L1, L2, L3],
+ [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L,
+ P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL,
+ I0L, I1L, I2L, I3L, M0L, M1L, M2L, M3L,
+ B0L, B1L, B2L, B3L, L0L, L1L, L2L, L3L]>;
+
+def : SubRegSet<2,
+ [R0, R1, R2, R3, R4, R5, R6, R7,
+ P0, P1, P2, P3, P4, P5, SP, FP,
+ I0, I1, I2, I3, M0, M1, M2, M3,
+ B0, B1, B2, B3, L0, L1, L2, L3],
+ [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H,
+ P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH,
+ I0H, I1H, I2H, I3H, M0H, M1H, M2H, M3H,
+ B0H, B1H, B2H, B3H, L0H, L1H, L2H, L3H]>;
+
+def : SubRegSet<1, [A0, A0W, A1, A1W], [A0L, A0L, A1L, A1L]>;
+def : SubRegSet<2, [A0, A0W, A1, A1W], [A0H, A0H, A1H, A1H]>;
+
+// Register classes.
+def D16 : RegisterClass<"BF", [i16], 16,
+ [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+ R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L]>;
+
+def D16L : RegisterClass<"BF", [i16], 16,
+ [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L]>;
+
+def D16H : RegisterClass<"BF", [i16], 16,
+ [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H]>;
+
+def P16 : RegisterClass<"BF", [i16], 16,
+ [P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+ P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+
+def P16L : RegisterClass<"BF", [i16], 16,
+ [P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+
+def P16H : RegisterClass<"BF", [i16], 16,
+ [P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+
+def DP16 : RegisterClass<"BF", [i16], 16,
+ [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+ R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
+ P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+ P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+
+def DP16L : RegisterClass<"BF", [i16], 16,
+ [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L,
+ P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+
+def DP16H : RegisterClass<"BF", [i16], 16,
+ [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H,
+ P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+
+def GR16 : RegisterClass<"BF", [i16], 16,
+ [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+ R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
+ P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+ P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL,
+ I0H, I0L, I1H, I1L, I2H, I2L, I3H, I3L,
+ M0H, M0L, M1H, M1L, M2H, M2L, M3H, M3L,
+ B0H, B0L, B1H, B1L, B2H, B2L, B3H, B3L,
+ L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L]>;
+
+def D : RegisterClass<"BF", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
+ let SubRegClassList = [D16L, D16H];
+}
+
+def P : RegisterClass<"BF", [i32], 32, [P0, P1, P2, P3, P4, P5, FP, SP]> {
+ let SubRegClassList = [P16L, P16H];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ PClass::iterator
+ PClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ return allocation_order_begin(MF)
+ + (RI->hasFP(MF) ? 7 : 6);
+ }
+ }];
+}
+
+def I : RegisterClass<"BF", [i32], 32, [I0, I1, I2, I3]>;
+def M : RegisterClass<"BF", [i32], 32, [M0, M1, M2, M3]>;
+def B : RegisterClass<"BF", [i32], 32, [B0, B1, B2, B3]>;
+def L : RegisterClass<"BF", [i32], 32, [L0, L1, L2, L3]>;
+
+def DP : RegisterClass<"BF", [i32], 32,
+ [R0, R1, R2, R3, R4, R5, R6, R7,
+ P0, P1, P2, P3, P4, P5, FP, SP]> {
+ let SubRegClassList = [DP16L, DP16H];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ DPClass::iterator
+ DPClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ return allocation_order_begin(MF)
+ + (RI->hasFP(MF) ? 15 : 14);
+ }
+ }];
+}
+
+def GR : RegisterClass<"BF", [i32], 32,
+ [R0, R1, R2, R3, R4, R5, R6, R7,
+ P0, P1, P2, P3, P4, P5,
+ I0, I1, I2, I3, M0, M1, M2, M3,
+ B0, B1, B2, B3, L0, L1, L2, L3,
+ FP, SP]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GRClass::iterator
+ GRClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ return allocation_order_begin(MF)
+ + (RI->hasFP(MF) ? 31 : 30);
+ }
+ }];
+}
+
+def ALL : RegisterClass<"BF", [i32], 32,
+ [R0, R1, R2, R3, R4, R5, R6, R7,
+ P0, P1, P2, P3, P4, P5,
+ I0, I1, I2, I3, M0, M1, M2, M3,
+ B0, B1, B2, B3, L0, L1, L2, L3,
+ FP, SP,
+ A0X, A0W, A1X, A1W, ASTAT, RETS,
+ LC0, LT0, LB0, LC1, LT1, LB1, CYCLES, CYCLES2,
+ USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ ALLClass::iterator
+ ALLClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ return allocation_order_begin(MF)
+ + (RI->hasFP(MF) ? 31 : 30);
+ }
+ }];
+}
+
+def PI : RegisterClass<"BF", [i32], 32,
+ [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ PIClass::iterator
+ PIClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ return allocation_order_begin(MF)
+ + (RI->hasFP(MF) ? 11 : 10);
+ }
+ }];
+}
+
+// We are going to pretend that CC and !CC are 32-bit registers, even though
+// they can only hold 1 bit.
+let CopyCost = -1, Size = 8 in {
+def JustCC : RegisterClass<"BF", [i32], 8, [CC]>;
+def NotCC : RegisterClass<"BF", [i32], 8, [NCC]>;
+def AnyCC : RegisterClass<"BF", [i32], 8, [CC, NCC]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ AnyCCClass::iterator
+ AnyCCClass::allocation_order_end(const MachineFunction &MF) const {
+ return allocation_order_begin(MF)+1;
+ }
+ }];
+}
+def StatBit : RegisterClass<"BF", [i1], 8,
+ [AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS]>;
+}
+
+// Should be i40, but that isn't defined. It is not a legal type yet anyway.
+def Accu : RegisterClass<"BF", [i64], 64, [A0, A1]>;
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.cpp b/lib/Target/Blackfin/BlackfinSubtarget.cpp
new file mode 100644
index 0000000..e104c52
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinSubtarget.cpp
@@ -0,0 +1,36 @@
+//===- BlackfinSubtarget.cpp - Blackfin Subtarget Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Blackfin specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinSubtarget.h"
+#include "BlackfinGenSubtarget.inc"
+
+using namespace llvm;
+
+BlackfinSubtarget::BlackfinSubtarget(const std::string &TT,
+ const std::string &FS)
+ : sdram(false),
+ icplb(false),
+ wa_mi_shift(false),
+ wa_csync(false),
+ wa_specld(false),
+ wa_mmr_stall(false),
+ wa_lcregs(false),
+ wa_hwloop(false),
+ wa_ind_call(false),
+ wa_killed_mmr(false),
+ wa_rets(false)
+{
+ std::string CPU = "generic";
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.h b/lib/Target/Blackfin/BlackfinSubtarget.h
new file mode 100644
index 0000000..d667fe2
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinSubtarget.h
@@ -0,0 +1,45 @@
+//===- BlackfinSubtarget.h - Define Subtarget for the Blackfin --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Blackfin specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFIN_SUBTARGET_H
+#define BLACKFIN_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+
+ class BlackfinSubtarget : public TargetSubtarget {
+ bool sdram;
+ bool icplb;
+ bool wa_mi_shift;
+ bool wa_csync;
+ bool wa_specld;
+ bool wa_mmr_stall;
+ bool wa_lcregs;
+ bool wa_hwloop;
+ bool wa_ind_call;
+ bool wa_killed_mmr;
+ bool wa_rets;
+ public:
+ BlackfinSubtarget(const std::string &TT, const std::string &FS);
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.cpp b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
new file mode 100644
index 0000000..47ba2fe
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
@@ -0,0 +1,42 @@
+//===-- BlackfinTargetMachine.cpp - Define TargetMachine for Blackfin -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Blackfin specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinTargetMachine.h"
+#include "Blackfin.h"
+#include "BlackfinMCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeBlackfinTarget() {
+ RegisterTargetMachine<BlackfinTargetMachine> X(TheBlackfinTarget);
+ RegisterAsmInfo<BlackfinMCAsmInfo> Y(TheBlackfinTarget);
+}
+
+BlackfinTargetMachine::BlackfinTargetMachine(const Target &T,
+ const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ DataLayout("e-p:32:32-i64:32-f64:32"),
+ Subtarget(TT, FS),
+ TLInfo(*this),
+ InstrInfo(Subtarget),
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 4, 0) {
+}
+
+bool BlackfinTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createBlackfinISelDag(*this, OptLevel));
+ return false;
+}
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.h b/lib/Target/Blackfin/BlackfinTargetMachine.h
new file mode 100644
index 0000000..73ed314
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.h
@@ -0,0 +1,54 @@
+//===-- BlackfinTargetMachine.h - TargetMachine for Blackfin ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Blackfin specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINTARGETMACHINE_H
+#define BLACKFINTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "BlackfinInstrInfo.h"
+#include "BlackfinSubtarget.h"
+#include "BlackfinISelLowering.h"
+
+namespace llvm {
+
+ class BlackfinTargetMachine : public LLVMTargetMachine {
+ const TargetData DataLayout;
+ BlackfinSubtarget Subtarget;
+ BlackfinTargetLowering TLInfo;
+ BlackfinInstrInfo InstrInfo;
+ TargetFrameInfo FrameInfo;
+ public:
+ BlackfinTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
+
+ virtual const BlackfinInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const BlackfinSubtarget *getSubtargetImpl() const {
+ return &Subtarget;
+ }
+ virtual const BlackfinRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual BlackfinTargetLowering* getTargetLowering() const {
+ return const_cast<BlackfinTargetLowering*>(&TLInfo);
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual bool addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel);
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/CMakeLists.txt b/lib/Target/Blackfin/CMakeLists.txt
new file mode 100644
index 0000000..6c3b244
--- /dev/null
+++ b/lib/Target/Blackfin/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_TARGET_DEFINITIONS Blackfin.td)
+
+tablegen(BlackfinGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(BlackfinGenRegisterNames.inc -gen-register-enums)
+tablegen(BlackfinGenRegisterInfo.inc -gen-register-desc)
+tablegen(BlackfinGenInstrNames.inc -gen-instr-enums)
+tablegen(BlackfinGenInstrInfo.inc -gen-instr-desc)
+tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
+tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
+tablegen(BlackfinGenSubtarget.inc -gen-subtarget)
+tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
+
+add_llvm_target(BlackfinCodeGen
+ BlackfinInstrInfo.cpp
+ BlackfinISelDAGToDAG.cpp
+ BlackfinISelLowering.cpp
+ BlackfinMCAsmInfo.cpp
+ BlackfinRegisterInfo.cpp
+ BlackfinSubtarget.cpp
+ BlackfinTargetMachine.cpp
+ )
diff --git a/lib/Target/Blackfin/Makefile b/lib/Target/Blackfin/Makefile
new file mode 100644
index 0000000..c0c1bce
--- /dev/null
+++ b/lib/Target/Blackfin/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/Blackfin/Makefile ------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMBlackfinCodeGen
+TARGET = Blackfin
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = BlackfinGenRegisterInfo.h.inc BlackfinGenRegisterNames.inc \
+ BlackfinGenRegisterInfo.inc BlackfinGenInstrNames.inc \
+ BlackfinGenInstrInfo.inc BlackfinGenAsmWriter.inc \
+ BlackfinGenDAGISel.inc BlackfinGenSubtarget.inc \
+ BlackfinGenCallingConv.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Blackfin/README.txt b/lib/Target/Blackfin/README.txt
new file mode 100644
index 0000000..b4c8227
--- /dev/null
+++ b/lib/Target/Blackfin/README.txt
@@ -0,0 +1,244 @@
+//===-- README.txt - Notes for Blackfin Target ------------------*- org -*-===//
+
+* Condition codes
+** DONE Problem with asymmetric SETCC operations
+The instruction
+
+ CC = R0 < 2
+
+is not symmetric - there is no R0 > 2 instruction. On the other hand, IF CC
+JUMP can take both CC and !CC as a condition. We cannot pattern-match (brcond
+(not cc), target); the DAG optimizer removes that kind of thing.
+
+This is handled by creating a pseudo-register NCC that aliases CC. Register
+classes JustCC and NotCC are used to control the inversion of CC.
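+
+For example, a conditional branch on R0 > 2 can be selected with the inverted
+condition. A sketch of the intended code, assuming a signed comparison (not
+verified compiler output):
+
+  CC = R0 < 3;
+  IF !CC JUMP target;    /* taken exactly when R0 > 2 */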
+
+** DONE CC as an i32 register
+The AnyCC register class pretends to hold i32 values. It can only represent the
+values 0 and 1, but we can copy to and from the D class. This hack makes it
+possible to represent the setcc instruction without having i1 as a legal type.
+
+In most cases, the CC register is set by a "CC = .." or BITTST instruction, and
+then used in a conditional branch or move. The code generator thinks it is
+moving 32 bits, but the value stays in CC. In other cases, the result of a
+comparison is actually used as an i32 number, and CC will be copied to a D
+register.
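+
+The copy itself is a single move. A sketch, with a hypothetical choice of R0:
+
+  R0 = CC;               /* materialize CC as 0 or 1 in a D register */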
+
+* Stack frames
+** TODO Use Push/Pop instructions
+We should use the push/pop instructions when saving callee-saved
+registers. They are smaller, and we may even be able to use push-multiple
+instructions.
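+
+For example, saving R4-R7 and P3-P5 could collapse into a single push-multiple
+(hypothetical output):
+
+  [--SP] = (R7:4, P5:3);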
+
+** TODO requiresRegisterScavenging
+We need more intelligence in determining when the scavenger is needed. We
+should keep track of:
+- Spilling D16 registers
+- Spilling AnyCC registers
+
+* Assembler
+** TODO Implement PrintGlobalVariable
+** TODO Remove LOAD32sym
+It's a hack combining two instructions by concatenation.
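+
+The two concatenated halves are roughly the usual 16-bit symbol loads (a
+sketch, not the exact asm string):
+
+  P0.H = sym;
+  P0.L = sym;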
+
+* Inline Assembly
+
+These are the GCC constraints from bfin/constraints.md:
+
+| Code | Register class | LLVM |
+|-------+-------------------------------------------+------|
+| a | P | C |
+| d | D | C |
+| z | Call clobbered P (P0, P1, P2) | X |
+| D | EvenD | X |
+| W | OddD | X |
+| e | Accu | C |
+| A | A0 | S |
+| B | A1 | S |
+| b | I | C |
+| v | B | C |
+| f | M | C |
+| c | Circular I, B, L | X |
+| C | JustCC | S |
+| t | LoopTop | X |
+| u | LoopBottom | X |
+| k | LoopCount | X |
+| x | GR | C |
+| y | RET*, ASTAT, SEQSTAT, USP | X |
+| w | ALL | C |
+| Z | The FD-PIC GOT pointer (P3) | S |
+| Y | The FD-PIC function pointer register (P1) | S |
+| q0-q7 | R0-R7 individually | |
+| qA | P0 | |
+|-------+-------------------------------------------+------|
+| Code | Constant | |
+|-------+-------------------------------------------+------|
+| J | 1<<N, N<32 | |
+| Ks3 | imm3 | |
+| Ku3 | uimm3 | |
+| Ks4 | imm4 | |
+| Ku4 | uimm4 | |
+| Ks5 | imm5 | |
+| Ku5 | uimm5 | |
+| Ks7 | imm7 | |
+| KN7 | -imm7 | |
+| Ksh | imm16 | |
+| Kuh | uimm16 | |
+| L | ~(1<<N) | |
+| M1 | 0xff | |
+| M2 | 0xffff | |
+| P0-P4 | 0-4 | |
+| PA | Macflag, not M | |
+| PB | Macflag, only M | |
+| Q | Symbol | |
+
+** TODO Support all register classes
+* DAG combiner
+** Create test case for each Illegal SETCC case
+The DAG combiner may sometimes produce illegal i16 SETCC instructions.
+
+*** TODO SETCC (ctlz x) == const
+*** TODO SETCC (and load, const) == const
+*** DONE SETCC (zext x) == const
+*** TODO SETCC (sext x) == const
+
+* Instruction selection
+** TODO Better immediate constants
+Like ARM, build constants as small imm + shift.
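+
+For example, 0x1400 could be built in two short instructions (a sketch):
+
+  R0 = 5;
+  R0 <<= 10;             /* 5 << 10 == 0x1400 */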
+
+** TODO Implement cycle counter
+We have CYCLES and CYCLES2 registers, but the readcyclecounter intrinsic wants
+to return i64, and the code generator doesn't know how to legalize that.
+
+** TODO Instruction alternatives
+Some instructions come in different variants, for example:
+
+ D = D + D
+ P = P + P
+
+Cross combinations are not allowed:
+
+ P = D + D (bad)
+
+Similarly for the subreg pseudo-instructions:
+
+ D16L = EXTRACT_SUBREG D16, bfin_subreg_lo16
+ P16L = EXTRACT_SUBREG P16, bfin_subreg_lo16
+
+We want to take advantage of the alternative instructions. This could be done by
+changing the DAG after instruction selection.
+
+** Multipatterns for load/store
+We should try to identify multipatterns for load and store instructions. The
+available instruction matrix is a bit irregular.
+
+Loads:
+
+| Addr | D | P | D 16z | D 16s | D16 | D 8z | D 8s |
+|------------+---+---+-------+-------+-----+------+------|
+| P | * | * | * | * | * | * | * |
+| P++ | * | * | * | * | | * | * |
+| P-- | * | * | * | * | | * | * |
+| P+uimm5m2 | | | * | * | | | |
+| P+uimm6m4 | * | * | | | | | |
+| P+imm16 | | | | | | * | * |
+| P+imm17m2 | | | * | * | | | |
+| P+imm18m4 | * | * | | | | | |
+| P++P | * | | * | * | * | | |
+| FP-uimm7m4 | * | * | | | | | |
+| I | * | | | | * | | |
+| I++ | * | | | | * | | |
+| I-- | * | | | | * | | |
+| I++M | * | | | | | | |
+
+Stores:
+
+| Addr | D | P | D16H | D16L | D 8 |
+|------------+---+---+------+------+-----|
+| P | * | * | * | * | * |
+| P++ | * | * | | * | * |
+| P-- | * | * | | * | * |
+| P+uimm5m2 | | | | * | |
+| P+uimm6m4 | * | * | | | |
+| P+imm16 | | | | | * |
+| P+imm17m2 | | | | * | |
+| P+imm18m4 | * | * | | | |
+| P++P | * | | * | * | |
+| FP-uimm7m4 | * | * | | | |
+| I | * | | * | * | |
+| I++ | * | | * | * | |
+| I-- | * | | * | * | |
+| I++M | * | | | | |
+
+* Workarounds and features
+Blackfin CPUs have bugs. Each model comes in a number of silicon revisions with
+different bugs. We learn about the CPU model from the -mcpu switch.
+
+** Interpretation of -mcpu value
+- -mcpu=bf527 refers to the latest known BF527 revision
+- -mcpu=bf527-0.2 refers to silicon rev. 0.2
+- -mcpu=bf527-any refers to all known revisions
+- -mcpu=bf527-none disables all workarounds
+
+The -mcpu setting affects the __SILICON_REVISION__ macro and enabled workarounds:
+
+| -mcpu | __SILICON_REVISION__ | Workarounds |
+|------------+----------------------+--------------------|
+| bf527 | Def Latest | Specific to latest |
+| bf527-1.3 | Def 0x0103 | Specific to 1.3 |
+| bf527-any | Def 0xffff | All bf527-x.y |
+| bf527-none | Undefined | None |
+
+These are the known cores and revisions:
+
+| Core | Silicon | Processors |
+|-------------+--------------------+-------------------------|
+| Edinburgh | 0.3, 0.4, 0.5, 0.6 | BF531 BF532 BF533 |
+| Braemar | 0.2, 0.3 | BF534 BF536 BF537 |
+| Stirling | 0.3, 0.4, 0.5 | BF538 BF539 |
+| Moab | 0.0, 0.1, 0.2 | BF542 BF544 BF548 BF549 |
+| Teton | 0.3, 0.5 | BF561 |
+| Kookaburra | 0.0, 0.1, 0.2 | BF523 BF525 BF527 |
+| Mockingbird | 0.0, 0.1 | BF522 BF524 BF526 |
+| Brodie | 0.0, 0.1 | BF512 BF514 BF516 BF518 |
+
+** Compiler implemented workarounds
+Most workarounds are implemented in header files and source code using the
+__ADSPBF527__ macros. A few workarounds require compiler support.
+
+| Anomaly | Macro | GCC Switch |
+|----------+--------------------------------+------------------|
+| Any | __WORKAROUNDS_ENABLED | |
+| 05000074 | WA_05000074 | |
+| 05000244 | __WORKAROUND_SPECULATIVE_SYNCS | -mcsync-anomaly |
+| 05000245 | __WORKAROUND_SPECULATIVE_LOADS | -mspecld-anomaly |
+| 05000257 | WA_05000257 | |
+| 05000283 | WA_05000283 | |
+| 05000312 | WA_LOAD_LCREGS | |
+| 05000315 | WA_05000315 | |
+| 05000371 | __WORKAROUND_RETS | |
+| 05000426 | __WORKAROUND_INDIRECT_CALLS | Not -micplb |
+
+** GCC feature switches
+| Switch | Description |
+|---------------------------+----------------------------------------|
+| -msim | Use simulator runtime |
+| -momit-leaf-frame-pointer | Omit frame pointer for leaf functions |
+| -mlow64k | |
+| -mcsync-anomaly | |
+| -mspecld-anomaly | |
+| -mid-shared-library | |
+| -mleaf-id-shared-library | |
+| -mshared-library-id= | |
+| -msep-data | Enable separate data segment |
+| -mlong-calls | Use indirect calls |
+| -mfast-fp | |
+| -mfdpic | |
+| -minline-plt | |
+| -mstack-check-l1 | Do stack checking in L1 scratch memory |
+| -mmulticore | Enable multicore support |
+| -mcorea | Build for Core A |
+| -mcoreb | Build for Core B |
+| -msdram | Build for SDRAM |
+| -micplb | Assume ICPLBs are enabled at runtime. |
diff --git a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
new file mode 100644
index 0000000..402e0af
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- BlackfinTargetInfo.cpp - Blackfin Target Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::TheBlackfinTarget;
+
+extern "C" void LLVMInitializeBlackfinTargetInfo() {
+ RegisterTarget<Triple::bfin> X(TheBlackfinTarget, "bfin",
+ "Analog Devices Blackfin [experimental]");
+}
diff --git a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..5ca8060
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMBlackfinInfo
+ BlackfinTargetInfo.cpp
+ )
+
+add_dependencies(LLVMBlackfinInfo BlackfinCodeGenTable_gen)
diff --git a/lib/Target/Blackfin/TargetInfo/Makefile b/lib/Target/Blackfin/TargetInfo/Makefile
new file mode 100644
index 0000000..c49cfbe
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Blackfin/TargetInfo/Makefile -------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBlackfinInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 294c6d3..fe63edf 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -24,43 +24,36 @@
#include "llvm/Intrinsics.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/InlineAsm.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ConstantsScanner.h"
#include "llvm/Analysis/FindUsedTypes.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Target/TargetMachineRegistry.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/Mangler.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/System/Host.h"
#include "llvm/Config/config.h"
#include <algorithm>
#include <sstream>
using namespace llvm;
-/// CBackendTargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int CBackendTargetMachineModule;
-int CBackendTargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<CTargetMachine> X("c", "C backend");
-
-// Force static initialization.
-extern "C" void LLVMInitializeCBackendTarget() { }
+extern "C" void LLVMInitializeCBackendTarget() {
+ // Register the target.
+ RegisterTargetMachine<CTargetMachine> X(TheCBackendTarget);
+}
namespace {
/// CBackendNameAllUsedStructsAndMergeFunctions - This pass inserts names for
@@ -88,12 +81,12 @@ namespace {
/// CWriter - This class is the main chunk of code that converts an LLVM
/// module to a C translation unit.
class CWriter : public FunctionPass, public InstVisitor<CWriter> {
- raw_ostream &Out;
+ formatted_raw_ostream &Out;
IntrinsicLowering *IL;
Mangler *Mang;
LoopInfo *LI;
const Module *TheModule;
- const TargetAsmInfo* TAsm;
+ const MCAsmInfo* TAsm;
const TargetData* TD;
std::map<const Type *, std::string> TypeNames;
std::map<const ConstantFP *, unsigned> FPConstantMap;
@@ -101,12 +94,14 @@ namespace {
std::set<const Argument*> ByValParams;
unsigned FPCounter;
unsigned OpaqueCounter;
+ DenseMap<const Value*, unsigned> AnonValueNumbers;
+ unsigned NextAnonValueNumber;
public:
static char ID;
- explicit CWriter(raw_ostream &o)
+ explicit CWriter(formatted_raw_ostream &o)
: FunctionPass(&ID), Out(o), IL(0), Mang(0), LI(0),
- TheModule(0), TAsm(0), TD(0), OpaqueCounter(0) {
+ TheModule(0), TAsm(0), TD(0), OpaqueCounter(0), NextAnonValueNumber(0) {
FPCounter = 0;
}
@@ -149,24 +144,26 @@ namespace {
return false;
}
- raw_ostream &printType(raw_ostream &Out, const Type *Ty,
- bool isSigned = false,
- const std::string &VariableName = "",
- bool IgnoreName = false,
- const AttrListPtr &PAL = AttrListPtr());
+ raw_ostream &printType(formatted_raw_ostream &Out,
+ const Type *Ty,
+ bool isSigned = false,
+ const std::string &VariableName = "",
+ bool IgnoreName = false,
+ const AttrListPtr &PAL = AttrListPtr());
std::ostream &printType(std::ostream &Out, const Type *Ty,
bool isSigned = false,
const std::string &VariableName = "",
bool IgnoreName = false,
const AttrListPtr &PAL = AttrListPtr());
- raw_ostream &printSimpleType(raw_ostream &Out, const Type *Ty,
- bool isSigned,
- const std::string &NameSoFar = "");
+ raw_ostream &printSimpleType(formatted_raw_ostream &Out,
+ const Type *Ty,
+ bool isSigned,
+ const std::string &NameSoFar = "");
std::ostream &printSimpleType(std::ostream &Out, const Type *Ty,
bool isSigned,
const std::string &NameSoFar = "");
- void printStructReturnPointerFunctionType(raw_ostream &Out,
+ void printStructReturnPointerFunctionType(formatted_raw_ostream &Out,
const AttrListPtr &PAL,
const PointerType *Ty);
@@ -239,7 +236,7 @@ namespace {
// Must be an expression, must be used exactly once. If it is dead, we
// emit it inline where it would go.
- if (I.getType() == Type::VoidTy || !I.hasOneUse() ||
+ if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
isa<InsertValueInst>(I))
@@ -286,11 +283,11 @@ namespace {
void visitBranchInst(BranchInst &I);
void visitSwitchInst(SwitchInst &I);
void visitInvokeInst(InvokeInst &I) {
- assert(0 && "Lowerinvoke pass didn't work!");
+ llvm_unreachable("Lowerinvoke pass didn't work!");
}
void visitUnwindInst(UnwindInst &I) {
- assert(0 && "Lowerinvoke pass didn't work!");
+ llvm_unreachable("Lowerinvoke pass didn't work!");
}
void visitUnreachableInst(UnreachableInst &I);
@@ -321,8 +318,10 @@ namespace {
void visitExtractValueInst(ExtractValueInst &I);
void visitInstruction(Instruction &I) {
- cerr << "C Writer does not know about " << I;
- abort();
+#ifndef NDEBUG
+ errs() << "C Writer does not know about " << I;
+#endif
+ llvm_unreachable(0);
}
void outputLValue(Instruction *I) {
@@ -430,7 +429,7 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
/// printStructReturnPointerFunctionType - This is like printType for a struct
/// return type, except, instead of printing the type as void (*)(Struct*, ...)
/// print it as "Struct (*)(...)", for struct return functions.
-void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
+void CWriter::printStructReturnPointerFunctionType(formatted_raw_ostream &Out,
const AttrListPtr &PAL,
const PointerType *TheTy) {
const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
@@ -466,7 +465,8 @@ void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
}
raw_ostream &
-CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
+CWriter::printSimpleType(formatted_raw_ostream &Out, const Type *Ty,
+ bool isSigned,
const std::string &NameSoFar) {
assert((Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) &&
"Invalid type for printSimpleType");
@@ -505,8 +505,10 @@ CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
}
default:
- cerr << "Unknown primitive type: " << *Ty << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "Unknown primitive type: " << *Ty << "\n";
+#endif
+ llvm_unreachable(0);
}
}
@@ -550,17 +552,20 @@ CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned,
}
default:
- cerr << "Unknown primitive type: " << *Ty << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "Unknown primitive type: " << *Ty << "\n";
+#endif
+ llvm_unreachable(0);
}
}
// Pass the Type* and the variable name and this prints out the variable
// declaration.
//
-raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
- bool isSigned, const std::string &NameSoFar,
- bool IgnoreName, const AttrListPtr &PAL) {
+raw_ostream &CWriter::printType(formatted_raw_ostream &Out,
+ const Type *Ty,
+ bool isSigned, const std::string &NameSoFar,
+ bool IgnoreName, const AttrListPtr &PAL) {
if (Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) {
printSimpleType(Out, Ty, isSigned, NameSoFar);
return Out;
@@ -652,8 +657,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
return Out << TyName << ' ' << NameSoFar;
}
default:
- assert(0 && "Unhandled case in getTypeProps!");
- abort();
+ llvm_unreachable("Unhandled case in getTypeProps!");
}
return Out;
@@ -756,8 +760,7 @@ std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty,
return Out << TyName << ' ' << NameSoFar;
}
default:
- assert(0 && "Unhandled case in getTypeProps!");
- abort();
+ llvm_unreachable("Unhandled case in getTypeProps!");
}
return Out;
@@ -769,7 +772,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
// ubytes or an array of sbytes with positive values.
//
const Type *ETy = CPA->getType()->getElementType();
- bool isString = (ETy == Type::Int8Ty || ETy == Type::Int8Ty);
+ bool isString = (ETy == Type::getInt8Ty(CPA->getContext()) ||
+ ETy == Type::getInt8Ty(CPA->getContext()));
// Make sure the last character is a null char, as automatically added by C
if (isString && (CPA->getNumOperands() == 0 ||
@@ -855,10 +859,11 @@ void CWriter::printConstantVector(ConstantVector *CP, bool Static) {
static bool isFPCSafeToPrint(const ConstantFP *CFP) {
bool ignored;
// Do long doubles in hex for now.
- if (CFP->getType() != Type::FloatTy && CFP->getType() != Type::DoubleTy)
+ if (CFP->getType() != Type::getFloatTy(CFP->getContext()) &&
+ CFP->getType() != Type::getDoubleTy(CFP->getContext()))
return false;
APFloat APF = APFloat(CFP->getValueAPF()); // copy
- if (CFP->getType() == Type::FloatTy)
+ if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
char Buffer[100];
@@ -916,7 +921,7 @@ void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
Out << ')';
break;
default:
- assert(0 && "Invalid cast opcode");
+ llvm_unreachable("Invalid cast opcode");
}
// Print the source type cast
@@ -946,7 +951,7 @@ void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
case Instruction::FPToUI:
break; // These don't need a source cast.
default:
- assert(0 && "Invalid cast opcode");
+ llvm_unreachable("Invalid cast opcode");
break;
}
}
@@ -970,12 +975,12 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
Out << "(";
printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
if (CE->getOpcode() == Instruction::SExt &&
- CE->getOperand(0)->getType() == Type::Int1Ty) {
+ CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
// Make sure we really sext from bool here by subtracting from 0
Out << "0-";
}
printConstant(CE->getOperand(0), Static);
- if (CE->getType() == Type::Int1Ty &&
+ if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
(CE->getOpcode() == Instruction::Trunc ||
CE->getOpcode() == Instruction::FPToUI ||
CE->getOpcode() == Instruction::FPToSI ||
@@ -1055,10 +1060,10 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
case ICmpInst::ICMP_UGT: Out << " > "; break;
case ICmpInst::ICMP_SGE:
case ICmpInst::ICMP_UGE: Out << " >= "; break;
- default: assert(0 && "Illegal ICmp predicate");
+ default: llvm_unreachable("Illegal ICmp predicate");
}
break;
- default: assert(0 && "Illegal opcode here!");
+ default: llvm_unreachable("Illegal opcode here!");
}
printConstantWithCast(CE->getOperand(1), CE->getOpcode());
if (NeedsClosingParens)
@@ -1076,7 +1081,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
else {
const char* op = 0;
switch (CE->getPredicate()) {
- default: assert(0 && "Illegal FCmp predicate");
+ default: llvm_unreachable("Illegal FCmp predicate");
case FCmpInst::FCMP_ORD: op = "ord"; break;
case FCmpInst::FCMP_UNO: op = "uno"; break;
case FCmpInst::FCMP_UEQ: op = "ueq"; break;
@@ -1104,9 +1109,11 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
return;
}
default:
- cerr << "CWriter Error: Unhandled constant expression: "
+#ifndef NDEBUG
+ errs() << "CWriter Error: Unhandled constant expression: "
<< *CE << "\n";
- abort();
+#endif
+ llvm_unreachable(0);
}
} else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
Out << "((";
@@ -1122,9 +1129,9 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
const Type* Ty = CI->getType();
- if (Ty == Type::Int1Ty)
+ if (Ty == Type::getInt1Ty(CPV->getContext()))
Out << (CI->getZExtValue() ? '1' : '0');
- else if (Ty == Type::Int32Ty)
+ else if (Ty == Type::getInt32Ty(CPV->getContext()))
Out << CI->getZExtValue() << 'u';
else if (Ty->getPrimitiveSizeInBits() > 32)
Out << CI->getZExtValue() << "ull";
@@ -1151,15 +1158,17 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
if (I != FPConstantMap.end()) {
// Because of FP precision problems we must load from a stack allocated
// value that holds the value in hex.
- Out << "(*(" << (FPC->getType() == Type::FloatTy ? "float" :
- FPC->getType() == Type::DoubleTy ? "double" :
+ Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ?
+ "float" :
+ FPC->getType() == Type::getDoubleTy(CPV->getContext()) ?
+ "double" :
"long double")
<< "*)&FPConstant" << I->second << ')';
} else {
double V;
- if (FPC->getType() == Type::FloatTy)
+ if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
V = FPC->getValueAPF().convertToFloat();
- else if (FPC->getType() == Type::DoubleTy)
+ else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
V = FPC->getValueAPF().convertToDouble();
else {
// Long double. Convert the number to double, discarding precision.
@@ -1189,7 +1198,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
std::string Num(&Buffer[0], &Buffer[6]);
unsigned long Val = strtoul(Num.c_str(), 0, 16);
- if (FPC->getType() == Type::FloatTy)
+ if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
<< Buffer << "\") /*nan*/ ";
else
@@ -1198,7 +1207,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
} else if (IsInf(V)) {
// The value is Inf
if (V < 0) Out << '-';
- Out << "LLVM_INF" << (FPC->getType() == Type::FloatTy ? "F" : "")
+ Out << "LLVM_INF" <<
+ (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "")
<< " /*inf*/ ";
} else {
std::string Num;
@@ -1312,8 +1322,10 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
}
// FALL THROUGH
default:
- cerr << "Unknown constant type: " << *CPV << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "Unknown constant type: " << *CPV << "\n";
+#endif
+ llvm_unreachable(0);
}
}
@@ -1359,7 +1371,7 @@ bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
}
if (NeedsExplicitCast) {
Out << "((";
- if (Ty->isInteger() && Ty != Type::Int1Ty)
+ if (Ty->isInteger() && Ty != Type::getInt1Ty(Ty->getContext()))
printSimpleType(Out, Ty, TypeIsSigned);
else
printType(Out, Ty); // not integer, sign doesn't matter
@@ -1419,33 +1431,36 @@ void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
}
std::string CWriter::GetValueName(const Value *Operand) {
- std::string Name;
-
- if (!isa<GlobalValue>(Operand) && Operand->getName() != "") {
- std::string VarName;
-
- Name = Operand->getName();
- VarName.reserve(Name.capacity());
-
- for (std::string::iterator I = Name.begin(), E = Name.end();
- I != E; ++I) {
- char ch = *I;
+ // Mangle globals with the standard mangler interface for LLC compatibility.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(Operand))
+ return Mang->getMangledName(GV);
+
+ std::string Name = Operand->getName();
+
+ if (Name.empty()) { // Assign unique names to local temporaries.
+ unsigned &No = AnonValueNumbers[Operand];
+ if (No == 0)
+ No = ++NextAnonValueNumber;
+ Name = "tmp__" + utostr(No);
+ }
+
+ std::string VarName;
+ VarName.reserve(Name.capacity());
- if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
- (ch >= '0' && ch <= '9') || ch == '_')) {
- char buffer[5];
- sprintf(buffer, "_%x_", ch);
- VarName += buffer;
- } else
- VarName += ch;
- }
+ for (std::string::iterator I = Name.begin(), E = Name.end();
+ I != E; ++I) {
+ char ch = *I;
- Name = "llvm_cbe_" + VarName;
- } else {
- Name = Mang->getValueName(Operand);
+ if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+ (ch >= '0' && ch <= '9') || ch == '_')) {
+ char buffer[5];
+ sprintf(buffer, "_%x_", ch);
+ VarName += buffer;
+ } else
+ VarName += ch;
}
- return Name;
+ return "llvm_cbe_" + VarName;
}
/// writeInstComputationInline - Emit the computation for the specified
@@ -1454,19 +1469,22 @@ void CWriter::writeInstComputationInline(Instruction &I) {
// We can't currently support integer types other than 1, 8, 16, 32, 64.
// Validate this.
const Type *Ty = I.getType();
- if (Ty->isInteger() && (Ty!=Type::Int1Ty && Ty!=Type::Int8Ty &&
- Ty!=Type::Int16Ty && Ty!=Type::Int32Ty && Ty!=Type::Int64Ty)) {
- cerr << "The C backend does not currently support integer "
- << "types of widths other than 1, 8, 16, 32, 64.\n";
- cerr << "This is being tracked as PR 4158.\n";
- abort();
+ if (Ty->isInteger() && (Ty!=Type::getInt1Ty(I.getContext()) &&
+ Ty!=Type::getInt8Ty(I.getContext()) &&
+ Ty!=Type::getInt16Ty(I.getContext()) &&
+ Ty!=Type::getInt32Ty(I.getContext()) &&
+ Ty!=Type::getInt64Ty(I.getContext()))) {
+ llvm_report_error("The C backend does not currently support integer "
+ "types of widths other than 1, 8, 16, 32, 64.\n"
+ "This is being tracked as PR 4158.");
}
// If this is a non-trivial bool computation, make sure to truncate down to
// a 1 bit value. This is important because we want "add i1 x, y" to return
// "0" when x and y are true, not "2" for example.
bool NeedBoolTrunc = false;
- if (I.getType() == Type::Int1Ty && !isa<ICmpInst>(I) && !isa<FCmpInst>(I))
+ if (I.getType() == Type::getInt1Ty(I.getContext()) &&
+ !isa<ICmpInst>(I) && !isa<FCmpInst>(I))
NeedBoolTrunc = true;
if (NeedBoolTrunc)
@@ -1615,7 +1633,7 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
// If the operand was a pointer, convert to a large integer type.
const Type* OpTy = Operand->getType();
if (isa<PointerType>(OpTy))
- OpTy = TD->getIntPtrType();
+ OpTy = TD->getIntPtrType(Operand->getContext());
Out << "((";
printSimpleType(Out, OpTy, castIsSigned);
@@ -1627,13 +1645,13 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
// generateCompilerSpecificCode - This is where we add conditional compilation
// directives to cater to specific compilers as need be.
//
-static void generateCompilerSpecificCode(raw_ostream& Out,
+static void generateCompilerSpecificCode(formatted_raw_ostream& Out,
const TargetData *TD) {
// Alloca is hard to get, and we don't want to include stdlib.h here.
Out << "/* get a declaration for alloca */\n"
<< "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
<< "#define alloca(x) __builtin_alloca((x))\n"
- << "#define _alloca(x) __builtin_alloca((x))\n"
+ << "#define _alloca(x) __builtin_alloca((x))\n"
<< "#elif defined(__APPLE__)\n"
<< "extern void *__builtin_alloca(unsigned long);\n"
<< "#define alloca(x) __builtin_alloca(x)\n"
@@ -1646,7 +1664,7 @@ static void generateCompilerSpecificCode(raw_ostream& Out,
<< "extern void *__builtin_alloca(unsigned int);\n"
<< "#endif\n"
<< "#define alloca(x) __builtin_alloca(x)\n"
- << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)\n"
+ << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
<< "#define alloca(x) __builtin_alloca(x)\n"
<< "#elif defined(_MSC_VER)\n"
<< "#define inline _inline\n"
@@ -1803,8 +1821,34 @@ static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) {
return NotSpecial;
}
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(const char *Str, unsigned Length,
+ raw_ostream &Out) {
+ for (unsigned i = 0; i != Length; ++i) {
+ unsigned char C = Str[i];
+ if (isprint(C) && C != '\\' && C != '"')
+ Out << C;
+ else if (C == '\\')
+ Out << "\\\\";
+ else if (C == '\"')
+ Out << "\\\"";
+ else if (C == '\t')
+ Out << "\\t";
+ else
+ Out << "\\x" << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+ }
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(const std::string &Str, raw_ostream &Out) {
+ PrintEscapedString(Str.c_str(), Str.size(), Out);
+}
bool CWriter::doInitialization(Module &M) {
+ FunctionPass::doInitialization(M);
+
// Initialize
TheModule = &M;
@@ -1855,6 +1899,29 @@ bool CWriter::doInitialization(Module &M) {
// First output all the declarations for the program, because C requires
// Functions & globals to be declared before they are used.
//
+ if (!M.getModuleInlineAsm().empty()) {
+ Out << "/* Module asm statements */\n"
+ << "asm(";
+
+    // Split the string into lines, to make it easier to read the output file.
+ std::string Asm = M.getModuleInlineAsm();
+ size_t CurPos = 0;
+ size_t NewLine = Asm.find_first_of('\n', CurPos);
+ while (NewLine != std::string::npos) {
+ // We found a newline, print the portion of the asm string from the
+ // last newline up to this newline.
+ Out << "\"";
+ PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
+ Out);
+ Out << "\\n\"\n";
+ CurPos = NewLine+1;
+ NewLine = Asm.find_first_of('\n', CurPos);
+ }
+ Out << "\"";
+ PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
+ Out << "\");\n"
+ << "/* End Module asm statements */\n";
+ }
// Loop over the symbol table, emitting all named constants...
printModuleTypes(M.getTypeSymbolTable());
@@ -1910,7 +1977,7 @@ bool CWriter::doInitialization(Module &M) {
Out << " __HIDDEN__";
if (I->hasName() && I->getName()[0] == 1)
- Out << " LLVM_ASM(\"" << I->getName().c_str()+1 << "\")";
+ Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
Out << ";\n";
}
@@ -2085,20 +2152,20 @@ void CWriter::printFloatingPointConstants(const Constant *C) {
FPConstantMap[FPC] = FPCounter; // Number the FP constants
- if (FPC->getType() == Type::DoubleTy) {
+ if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) {
double Val = FPC->getValueAPF().convertToDouble();
uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
Out << "static const ConstantDoubleTy FPConstant" << FPCounter++
<< " = 0x" << utohexstr(i)
<< "ULL; /* " << Val << " */\n";
- } else if (FPC->getType() == Type::FloatTy) {
+ } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) {
float Val = FPC->getValueAPF().convertToFloat();
uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().
getZExtValue();
Out << "static const ConstantFloatTy FPConstant" << FPCounter++
<< " = 0x" << utohexstr(i)
<< "U; /* " << Val << " */\n";
- } else if (FPC->getType() == Type::X86_FP80Ty) {
+ } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) {
// api needed to prevent premature destruction
APInt api = FPC->getValueAPF().bitcastToAPInt();
const uint64_t *p = api.getRawData();
@@ -2106,7 +2173,8 @@ void CWriter::printFloatingPointConstants(const Constant *C) {
<< " = { 0x" << utohexstr(p[0])
<< "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}"
<< "}; /* Long double constant */\n";
- } else if (FPC->getType() == Type::PPC_FP128Ty) {
+ } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) ||
+ FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
APInt api = FPC->getValueAPF().bitcastToAPInt();
const uint64_t *p = api.getRawData();
Out << "static const ConstantFP128Ty FPConstant" << FPCounter++
@@ -2115,7 +2183,7 @@ void CWriter::printFloatingPointConstants(const Constant *C) {
<< "}; /* Long double constant */\n";
} else {
- assert(0 && "Unknown float type!");
+ llvm_unreachable("Unknown float type!");
}
}
@@ -2215,6 +2283,8 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
case CallingConv::X86_FastCall:
Out << "__attribute__((fastcall)) ";
break;
+ default:
+ break;
}
// Loop over the arguments, printing them...
@@ -2351,7 +2421,8 @@ void CWriter::printFunction(Function &F) {
printType(Out, AI->getAllocatedType(), false, GetValueName(AI));
Out << "; /* Address-exposed local */\n";
PrintedVar = true;
- } else if (I->getType() != Type::VoidTy && !isInlinableInst(*I)) {
+ } else if (I->getType() != Type::getVoidTy(F.getContext()) &&
+ !isInlinableInst(*I)) {
Out << " ";
printType(Out, I->getType(), false, GetValueName(&*I));
Out << ";\n";
@@ -2428,7 +2499,8 @@ void CWriter::printBasicBlock(BasicBlock *BB) {
for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
++II) {
if (!isInlinableInst(*II) && !isDirectAlloca(II)) {
- if (II->getType() != Type::VoidTy && !isInlineAsm(*II))
+ if (II->getType() != Type::getVoidTy(BB->getContext()) &&
+ !isInlineAsm(*II))
outputLValue(II);
else
Out << " ";
@@ -2603,8 +2675,9 @@ void CWriter::visitBinaryOperator(Instruction &I) {
// We must cast the results of binary operations which might be promoted.
bool needsCast = false;
- if ((I.getType() == Type::Int8Ty) || (I.getType() == Type::Int16Ty)
- || (I.getType() == Type::FloatTy)) {
+  if ((I.getType() == Type::getInt8Ty(I.getContext())) ||
+      (I.getType() == Type::getInt16Ty(I.getContext())) ||
+      (I.getType() == Type::getFloatTy(I.getContext()))) {
needsCast = true;
Out << "((";
printType(Out, I.getType(), false);
@@ -2623,9 +2696,9 @@ void CWriter::visitBinaryOperator(Instruction &I) {
Out << ")";
} else if (I.getOpcode() == Instruction::FRem) {
// Output a call to fmod/fmodf instead of emitting a%b
- if (I.getType() == Type::FloatTy)
+ if (I.getType() == Type::getFloatTy(I.getContext()))
Out << "fmodf(";
- else if (I.getType() == Type::DoubleTy)
+ else if (I.getType() == Type::getDoubleTy(I.getContext()))
Out << "fmod(";
else // all 3 flavors of long double
Out << "fmodl(";
@@ -2663,7 +2736,11 @@ void CWriter::visitBinaryOperator(Instruction &I) {
case Instruction::Shl : Out << " << "; break;
case Instruction::LShr:
case Instruction::AShr: Out << " >> "; break;
- default: cerr << "Invalid operator type!" << I; abort();
+ default:
+#ifndef NDEBUG
+ errs() << "Invalid operator type!" << I;
+#endif
+ llvm_unreachable(0);
}
writeOperandWithCast(I.getOperand(1), I.getOpcode());
@@ -2700,7 +2777,11 @@ void CWriter::visitICmpInst(ICmpInst &I) {
case ICmpInst::ICMP_SLT: Out << " < "; break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT: Out << " > "; break;
- default: cerr << "Invalid icmp predicate!" << I; abort();
+ default:
+#ifndef NDEBUG
+ errs() << "Invalid icmp predicate!" << I;
+#endif
+ llvm_unreachable(0);
}
writeOperandWithCast(I.getOperand(1), I);
@@ -2724,7 +2805,7 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
const char* op = 0;
switch (I.getPredicate()) {
- default: assert(0 && "Illegal FCmp predicate");
+ default: llvm_unreachable("Illegal FCmp predicate");
case FCmpInst::FCMP_ORD: op = "ord"; break;
case FCmpInst::FCMP_UNO: op = "uno"; break;
case FCmpInst::FCMP_UEQ: op = "ueq"; break;
@@ -2752,7 +2833,7 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
static const char * getFloatBitCastField(const Type *Ty) {
switch (Ty->getTypeID()) {
- default: assert(0 && "Invalid Type");
+ default: llvm_unreachable("Invalid Type");
case Type::FloatTyID: return "Float";
case Type::DoubleTyID: return "Double";
case Type::IntegerTyID: {
@@ -2784,12 +2865,13 @@ void CWriter::visitCastInst(CastInst &I) {
printCast(I.getOpcode(), SrcTy, DstTy);
// Make a sext from i1 work by subtracting the i1 from 0 (an int).
- if (SrcTy == Type::Int1Ty && I.getOpcode() == Instruction::SExt)
+ if (SrcTy == Type::getInt1Ty(I.getContext()) &&
+ I.getOpcode() == Instruction::SExt)
Out << "0-";
writeOperand(I.getOperand(0));
- if (DstTy == Type::Int1Ty &&
+ if (DstTy == Type::getInt1Ty(I.getContext()) &&
(I.getOpcode() == Instruction::Trunc ||
I.getOpcode() == Instruction::FPToUI ||
I.getOpcode() == Instruction::FPToSI ||
@@ -3020,10 +3102,12 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
Out << ", ";
// Output the last argument to the enclosing function.
if (I.getParent()->getParent()->arg_empty()) {
- cerr << "The C backend does not currently support zero "
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "The C backend does not currently support zero "
<< "argument varargs functions, such as '"
- << I.getParent()->getParent()->getName() << "'!\n";
- abort();
+ << I.getParent()->getParent()->getName() << "'!";
+ llvm_report_error(Msg.str());
}
writeOperand(--I.getParent()->getParent()->arg_end());
Out << ')';
@@ -3092,16 +3176,15 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
case Intrinsic::dbg_stoppoint: {
// If we use writeOperand directly we get a "u" suffix which is rejected
// by gcc.
- std::stringstream SPIStr;
DbgStopPointInst &SPI = cast<DbgStopPointInst>(I);
- SPI.getDirectory()->print(SPIStr);
+ std::string dir;
+ GetConstantStringInfo(SPI.getDirectory(), dir);
+ std::string file;
+ GetConstantStringInfo(SPI.getFileName(), file);
Out << "\n#line "
<< SPI.getLine()
- << " \"";
- Out << SPIStr.str();
- SPIStr.clear();
- SPI.getFileName()->print(SPIStr);
- Out << SPIStr.str() << "\"\n";
+ << " \""
+ << dir << '/' << file << "\"\n";
return true;
}
case Intrinsic::x86_sse_cmp_ss:
@@ -3113,7 +3196,7 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
Out << ')';
// Multiple GCC builtins multiplex onto this intrinsic.
switch (cast<ConstantInt>(I.getOperand(3))->getZExtValue()) {
- default: assert(0 && "Invalid llvm.x86.sse.cmp!");
+ default: llvm_unreachable("Invalid llvm.x86.sse.cmp!");
case 0: Out << "__builtin_ia32_cmpeq"; break;
case 1: Out << "__builtin_ia32_cmplt"; break;
case 2: Out << "__builtin_ia32_cmple"; break;
@@ -3159,27 +3242,25 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
const char *const *table = 0;
- //Grab the translation table from TargetAsmInfo if it exists
+ // Grab the translation table from MCAsmInfo if it exists.
if (!TAsm) {
+ std::string Triple = TheModule->getTargetTriple();
+ if (Triple.empty())
+ Triple = llvm::sys::getHostTriple();
+
std::string E;
- const TargetMachineRegistry::entry* Match =
- TargetMachineRegistry::getClosestStaticTargetForModule(*TheModule, E);
- if (Match) {
- //Per platform Target Machines don't exist, so create it
- // this must be done only once
- const TargetMachine* TM = Match->CtorFn(*TheModule, "");
- TAsm = TM->getTargetAsmInfo();
- }
+ if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
+ TAsm = Match->createAsmInfo(Triple);
}
if (TAsm)
table = TAsm->getAsmCBE();
- //Search the translation table if it exists
+ // Search the translation table if it exists.
for (int i = 0; table && table[i]; i += 2)
if (c.Codes[0] == table[i])
return table[i+1];
- //default is identity
+ // Default is identity.
return c.Codes[0];
}
@@ -3215,7 +3296,7 @@ void CWriter::visitInlineAsm(CallInst &CI) {
std::vector<InlineAsm::ConstraintInfo> Constraints = as->ParseConstraints();
std::vector<std::pair<Value*, int> > ResultVals;
- if (CI.getType() == Type::VoidTy)
+ if (CI.getType() == Type::getVoidTy(CI.getContext()))
;
else if (const StructType *ST = dyn_cast<StructType>(CI.getType())) {
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
@@ -3325,7 +3406,7 @@ void CWriter::visitInlineAsm(CallInst &CI) {
}
void CWriter::visitMallocInst(MallocInst &I) {
- assert(0 && "lowerallocations pass didn't work!");
+ llvm_unreachable("lowerallocations pass didn't work!");
}
void CWriter::visitAllocaInst(AllocaInst &I) {
@@ -3342,7 +3423,7 @@ void CWriter::visitAllocaInst(AllocaInst &I) {
}
void CWriter::visitFreeInst(FreeInst &I) {
- assert(0 && "lowerallocations pass didn't work!");
+ llvm_unreachable("lowerallocations pass didn't work!");
}
void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
@@ -3603,7 +3684,7 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
//===----------------------------------------------------------------------===//
bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
- raw_ostream &o,
+ formatted_raw_ostream &o,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel) {
if (FileType != TargetMachine::AssemblyFile) return true;
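For readers tracking the registry migration in the InterpretASMConstraint hunk above, a minimal sketch of the new lookup path, not part of the patch; the helper name lookupAsmInfoForModule is hypothetical:

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Module.h"
#include "llvm/System/Host.h"
#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Mirrors the fallback in InterpretASMConstraint: prefer the module's triple,
// else the host triple, then ask the registry for the matching MCAsmInfo.
static const MCAsmInfo *lookupAsmInfoForModule(const Module &M) {
  std::string Triple = M.getTargetTriple();
  if (Triple.empty())
    Triple = sys::getHostTriple();
  std::string Error;
  const Target *T = TargetRegistry::lookupTarget(Triple, Error);
  if (!T) {
    errs() << "no target for '" << Triple << "': " << Error << "\n";
    return 0;
  }
  return T->createAsmInfo(Triple); // heap-allocated; the caller owns it
}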
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
index 8b26245..715bbda 100644
--- a/lib/Target/CBackend/CTargetMachine.h
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -20,23 +20,20 @@
namespace llvm {
struct CTargetMachine : public TargetMachine {
- const TargetData DataLayout; // Calculates type size & alignment
-
- CTargetMachine(const Module &M, const std::string &FS)
- : DataLayout(&M) {}
+ CTargetMachine(const Target &T, const std::string &TT, const std::string &FS)
+ : TargetMachine(T) {}
virtual bool WantsWholeFile() const { return true; }
- virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ virtual bool addPassesToEmitWholeFile(PassManager &PM,
+ formatted_raw_ostream &Out,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel);
-
- // This class always works, but must be requested explicitly on
- // llc command line.
- static unsigned getModuleMatchQuality(const Module &M) { return 0; }
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const TargetData *getTargetData() const { return 0; }
};
+extern Target TheCBackendTarget;
+
} // End llvm namespace
diff --git a/lib/Target/CBackend/Makefile b/lib/Target/CBackend/Makefile
index 336de0c..3b5ef0f 100644
--- a/lib/Target/CBackend/Makefile
+++ b/lib/Target/CBackend/Makefile
@@ -9,6 +9,9 @@
LEVEL = ../../..
LIBRARYNAME = LLVMCBackend
+
+DIRS = TargetInfo
+
include $(LEVEL)/Makefile.common
CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
new file mode 100644
index 0000000..f7e8ff2
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- CBackendTargetInfo.cpp - CBackend Target Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCBackendTarget;
+
+extern "C" void LLVMInitializeCBackendTargetInfo() {
+ RegisterTarget<> X(TheCBackendTarget, "c", "C backend");
+}
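For contrast with the argument-less RegisterTarget<> used here, a hedged sketch of how a TargetInfo stub for a real architecture registers itself: supplying the Triple::ArchType template argument is what lets TargetRegistry::lookupTarget resolve the target from a triple, whereas the C backend stays reachable only by name ("-march=c"). TheFooTarget and the arm arch choice are illustrative, not part of this patch.

#include "llvm/Target/TargetRegistry.h"
using namespace llvm;

Target TheFooTarget; // hypothetical target singleton

extern "C" void LLVMInitializeFooTargetInfo() {
  // Registered for Triple::arm, so triple-based lookups can resolve to it
  // in addition to the "-march=foo" name below.
  RegisterTarget<Triple::arm, /*HasJIT=*/true>
    X(TheFooTarget, "foo", "Foo backend (illustrative)");
}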
diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..5b35fa7
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCBackendInfo
+ CBackendTargetInfo.cpp
+ )
+
diff --git a/lib/Target/CBackend/TargetInfo/Makefile b/lib/Target/CBackend/TargetInfo/Makefile
new file mode 100644
index 0000000..d4d5e15
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CBackend/TargetInfo/Makefile -------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCBackendInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 7cffd0e..8769ee2 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,18 +1,14 @@
add_llvm_library(LLVMTarget
- DarwinTargetAsmInfo.cpp
- ELFTargetAsmInfo.cpp
SubtargetFeature.cpp
Target.cpp
- TargetAsmInfo.cpp
TargetData.cpp
TargetELFWriterInfo.cpp
TargetFrameInfo.cpp
TargetInstrInfo.cpp
+ TargetIntrinsicInfo.cpp
+ TargetLoweringObjectFile.cpp
TargetMachOWriterInfo.cpp
TargetMachine.cpp
- TargetMachineRegistry.cpp
TargetRegisterInfo.cpp
TargetSubtarget.cpp
)
-
-# TODO: Support other targets besides X86. See Makefile.
diff --git a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
index 9684e63..1e508fe 100644
--- a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
@@ -4,6 +4,6 @@ include_directories(
)
add_llvm_library(LLVMCellSPUAsmPrinter
- SPUAsmPrinter.cpp
+ SPUAsmPrinter.cpp
)
add_dependencies(LLVMCellSPUAsmPrinter CellSPUCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/CellSPU/AsmPrinter/Makefile b/lib/Target/CellSPU/AsmPrinter/Makefile
index dd56df7..69639ef 100644
--- a/lib/Target/CellSPU/AsmPrinter/Makefile
+++ b/lib/Target/CellSPU/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/CellSPU/Makefile -------------------------*- Makefile -*-===##
+##===- lib/Target/CellSPU/AsmPrinter/Makefile --------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
index 2847d0b..0f8d539 100644
--- a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
@@ -19,25 +19,29 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Support/Mangler.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
#include <set>
using namespace llvm;
@@ -49,8 +53,8 @@ namespace {
class VISIBILITY_HIDDEN SPUAsmPrinter : public AsmPrinter {
std::set<std::string> FnStubs, GVStubs;
public:
- explicit SPUAsmPrinter(raw_ostream &O, TargetMachine &TM,
- const TargetAsmInfo *T, bool V) :
+ explicit SPUAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V) :
AsmPrinter(O, TM, T, V) {}
virtual const char *getPassName() const {
@@ -62,10 +66,10 @@ namespace {
}
/// printInstruction - This method is automatically generated by tablegen
- /// from the instruction set description. This method returns true if the
- /// machine instruction was sufficiently described to print it, otherwise it
- /// returns false.
- bool printInstruction(const MachineInstr *MI);
+ /// from the instruction set description.
+ void printInstruction(const MachineInstr *MI);
+ static const char *getRegisterName(unsigned RegNo);
+
void printMachineInstruction(const MachineInstr *MI);
void printOp(const MachineOperand &MO);
@@ -76,14 +80,13 @@ namespace {
unsigned RegNo = MO.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(RegNo) &&
"Not physreg??");
- O << TM.getRegisterInfo()->get(RegNo).AsmName;
+ O << getRegisterName(RegNo);
}
void printOperand(const MachineInstr *MI, unsigned OpNo) {
const MachineOperand &MO = MI->getOperand(OpNo);
if (MO.isReg()) {
- assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physreg??");
- O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
+ O << getRegisterName(MO.getReg());
} else if (MO.isImm()) {
O << MO.getImm();
} else {
@@ -150,8 +153,7 @@ namespace {
// the value contained in the register. For this reason, the darwin
// assembler requires that we print r0 as 0 (no r) when used as the base.
const MachineOperand &MO = MI->getOperand(OpNo);
- O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
- O << ", ";
+ O << getRegisterName(MO.getReg()) << ", ";
printOperand(MI, OpNo+1);
}
@@ -264,7 +266,7 @@ namespace {
&& "Invalid negated immediate rotate 7-bit argument");
O << -value;
} else {
- assert(0 &&"Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+ llvm_unreachable("Invalid/non-immediate rotate amount in printRotateNeg7Imm");
}
}
@@ -275,31 +277,25 @@ namespace {
&& "Invalid negated immediate rotate 7-bit argument");
O << -value;
} else {
- assert(0 &&"Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+ llvm_unreachable("Invalid/non-immediate rotate amount in printRotateNeg7Imm");
}
}
virtual bool runOnMachineFunction(MachineFunction &F) = 0;
- //! Assembly printer cleanup after function has been emitted
- virtual bool doFinalization(Module &M) = 0;
};
/// LinuxAsmPrinter - SPU assembly printer, customized for Linux
class VISIBILITY_HIDDEN LinuxAsmPrinter : public SPUAsmPrinter {
- DwarfWriter *DW;
public:
- explicit LinuxAsmPrinter(raw_ostream &O, SPUTargetMachine &TM,
- const TargetAsmInfo *T, bool V)
- : SPUAsmPrinter(O, TM, T, V), DW(0) {}
+ explicit LinuxAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+ : SPUAsmPrinter(O, TM, T, V) {}
virtual const char *getPassName() const {
return "STI CBEA SPU Assembly Printer";
}
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
- //! Dump globals, perform cleanup after function emission
- bool doFinalization(Module &M);
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -309,7 +305,7 @@ namespace {
}
//! Emit a global variable according to its section and type
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void PrintGlobalVariable(const GlobalVariable* GVar);
};
} // end of anonymous namespace
@@ -319,35 +315,34 @@ namespace {
void SPUAsmPrinter::printOp(const MachineOperand &MO) {
switch (MO.getType()) {
case MachineOperand::MO_Immediate:
- cerr << "printOp() does not handle immediate values\n";
- abort();
+ llvm_report_error("printOp() does not handle immediate values");
return;
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
return;
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
return;
case MachineOperand::MO_ExternalSymbol:
// Computing the address of an external symbol, not calling it.
if (TM.getRelocationModel() != Reloc::Static) {
- std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+ std::string Name(MAI->getGlobalPrefix()); Name += MO.getSymbolName();
GVStubs.insert(Name);
O << "L" << Name << "$non_lazy_ptr";
return;
}
- O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ O << MAI->getGlobalPrefix() << MO.getSymbolName();
return;
case MachineOperand::MO_GlobalAddress: {
// Computing the address of a global symbol, not calling it.
GlobalValue *GV = MO.getGlobal();
- std::string Name = Mang->getValueName(GV);
+ std::string Name = Mang->getMangledName(GV);
// External or weakly linked global variables need non-lazily-resolved
// stubs
@@ -410,15 +405,18 @@ bool SPUAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
///
void SPUAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
++EmittedInsts;
+ processDebugLoc(MI, true);
printInstruction(MI);
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ processDebugLoc(MI, false);
+ O << '\n';
}
/// runOnMachineFunction - This uses the printMachineInstruction()
/// method to print assembly for each instruction.
///
-bool
-LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF)
-{
+bool LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
SetupMachineFunction(MF);
@@ -430,12 +428,13 @@ LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF)
// Print out labels for the function.
const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
EmitAlignment(MF.getAlignment(), F);
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::PrivateLinkage:
+ case Function::LinkerPrivateLinkage:
case Function::InternalLinkage: // Symbols default to internal.
break;
case Function::ExternalLinkage:
@@ -460,8 +459,7 @@ LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF)
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
@@ -483,29 +481,13 @@ LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF)
}
-bool LinuxAsmPrinter::doInitialization(Module &M) {
- bool Result = AsmPrinter::doInitialization(M);
- DW = getAnalysisIfAvailable<DwarfWriter>();
- SwitchToTextSection("\t.text");
- return Result;
-}
-
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \\n or \\0.
-static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
- for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
- Name != E; ++Name)
- if (isprint(*Name))
- OS << *Name;
-}
-
/*!
Emit a global variable according to its section, alignment, etc.
\note This code was shamelessly copied from the PowerPC's assembly printer,
which sort of screams for some kind of refactorization of common code.
*/
-void LinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void LinuxAsmPrinter::PrintGlobalVariable(const GlobalVariable *GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer())
@@ -515,18 +497,17 @@ void LinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (EmitSpecialLLVMGlobal(GVar))
return;
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
printVisibility(name, GVar->getVisibility());
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
const Type *Type = C->getType();
unsigned Size = TD->getTypeAllocSize(Type);
unsigned Align = TD->getPreferredAlignmentLog(GVar);
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
if (C->isNullValue() && /* FIXME: Verify correct */
!GVar->hasSection() &&
@@ -540,12 +521,12 @@ void LinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << name << ":\n";
O << "\t.zero " << Size << '\n';
} else if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << ',' << Size;
+ O << MAI->getLCOMMDirective() << name << ',' << Size;
} else {
O << ".comm " << name << ',' << Size;
}
- O << "\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'\n";
return;
}
@@ -570,48 +551,23 @@ void LinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
<< "\t.type " << name << ", @object\n";
// FALL THROUGH
case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
case GlobalValue::InternalLinkage:
break;
default:
- cerr << "Unknown linkage type!";
- abort();
+ llvm_report_error("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
- O << name << ":\t\t\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << name << ":\t\t\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'\n";
EmitGlobalConstant(C);
O << '\n';
}
-bool LinuxAsmPrinter::doFinalization(Module &M) {
- // Print out module-level global variables here.
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
-
- return AsmPrinter::doFinalization(M);
-}
-
-/// createSPUCodePrinterPass - Returns a pass that prints the Cell SPU
-/// assembly code for a MachineFunction to the given output stream, in a format
-/// that the Linux SPU assembler can deal with.
-///
-FunctionPass *llvm::createSPUAsmPrinterPass(raw_ostream &o,
- SPUTargetMachine &tm,
- bool verbose) {
- return new LinuxAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
// Force static initialization.
-extern "C" void LLVMInitializeCellSPUAsmPrinter() { }
-
-namespace {
- static struct Register {
- Register() {
- SPUTargetMachine::registerAsmPrinter(createSPUAsmPrinterPass);
- }
- } Registrator;
+extern "C" void LLVMInitializeCellSPUAsmPrinter() {
+ RegisterAsmPrinter<LinuxAsmPrinter> X(TheCellSPUTarget);
}
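Every printer constructor in this patch now takes a formatted_raw_ostream instead of a plain raw_ostream. A small sketch of why, assuming the 2.6-era FormattedStream API: the wrapper tracks the current output column, which is what lets the AsmPrinter pad verbose-asm comments to a fixed column.

#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void formattedDemo() {
  formatted_raw_ostream FOS(outs()); // wraps stdout, does not take ownership
  FOS << "\tnop";
  FOS.PadToColumn(40);               // column tracking the old class lacked
  FOS << "# aligned comment\n";
  FOS.flush();
}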
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
index 8a55845..0cb6676 100644
--- a/lib/Target/CellSPU/CMakeLists.txt
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -17,9 +17,9 @@ add_llvm_target(CellSPUCodeGen
SPUInstrInfo.cpp
SPUISelDAGToDAG.cpp
SPUISelLowering.cpp
+ SPUMCAsmInfo.cpp
SPURegisterInfo.cpp
SPUSubtarget.cpp
- SPUTargetAsmInfo.cpp
SPUTargetMachine.cpp
)
diff --git a/lib/Target/CellSPU/Makefile b/lib/Target/CellSPU/Makefile
index a460db3..8415168 100644
--- a/lib/Target/CellSPU/Makefile
+++ b/lib/Target/CellSPU/Makefile
@@ -17,6 +17,6 @@ BUILT_SOURCES = SPUGenInstrNames.inc SPUGenRegisterNames.inc \
SPUGenInstrInfo.inc SPUGenDAGISel.inc \
SPUGenSubtarget.inc SPUGenCallingConv.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h
index 10d1110..02713b5 100644
--- a/lib/Target/CellSPU/SPU.h
+++ b/lib/Target/CellSPU/SPU.h
@@ -21,12 +21,9 @@
namespace llvm {
class SPUTargetMachine;
class FunctionPass;
- class raw_ostream;
+ class formatted_raw_ostream;
FunctionPass *createSPUISelDag(SPUTargetMachine &TM);
- FunctionPass *createSPUAsmPrinterPass(raw_ostream &o,
- SPUTargetMachine &tm,
- bool verbose);
/*--== Utility functions/predicates/etc used all over the place: --==*/
//! Predicate test for a signed 10-bit value
@@ -92,6 +89,9 @@ namespace llvm {
inline bool isU10Constant(uint64_t Value) {
return (Value == (Value & 0x3ff));
}
+
+ extern Target TheCellSPUTarget;
+
}
// Defines symbolic names for the SPU instructions.
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
index caaa71a..9dbab1d 100644
--- a/lib/Target/CellSPU/SPUHazardRecognizers.cpp
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/Debug.h"
-
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -115,7 +115,8 @@ SPUHazardRecognizer::getHazardType(SUnit *SU)
if (mustBeOdd && !EvenOdd)
retval = Hazard;
- DOUT << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard " << retval << "\n";
+ DEBUG(errs() << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard "
+ << retval << "\n");
EvenOdd ^= 1;
return retval;
#else
@@ -129,7 +130,7 @@ void SPUHazardRecognizer::EmitInstruction(SUnit *SU)
void SPUHazardRecognizer::AdvanceCycle()
{
- DOUT << "SPUHazardRecognizer::AdvanceCycle\n";
+ DEBUG(errs() << "SPUHazardRecognizer::AdvanceCycle\n");
}
void SPUHazardRecognizer::EmitNoop()
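The DOUT stream used throughout this file becomes DEBUG(errs() << ...). A hedged sketch of the replacement idiom, with a made-up DEBUG_TYPE tag: the macro body compiles away in NDEBUG builds and is gated by -debug (or -debug-only=demo) at runtime.

#define DEBUG_TYPE "demo" // illustrative tag for -debug-only filtering
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void traceDemo(int Value) {
  // Emitted only in asserts builds when -debug is enabled.
  DEBUG(errs() << "traceDemo: Value = " << Value << '\n');
}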
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index 779d75d..1f9e5fc 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -30,9 +30,12 @@
#include "llvm/Constants.h"
#include "llvm/GlobalValue.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -100,7 +103,7 @@ namespace {
bool
isIntS16Immediate(ConstantSDNode *CN, short &Imm)
{
- MVT vt = CN->getValueType(0);
+ EVT vt = CN->getValueType(0);
Imm = (short) CN->getZExtValue();
if (vt.getSimpleVT() >= MVT::i1 && vt.getSimpleVT() <= MVT::i16) {
return true;
@@ -129,7 +132,7 @@ namespace {
static bool
isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm)
{
- MVT vt = FPN->getValueType(0);
+ EVT vt = FPN->getValueType(0);
if (vt == MVT::f32) {
int val = FloatToBits(FPN->getValueAPF().convertToFloat());
int sval = (int) ((val << 16) >> 16);
@@ -151,10 +154,10 @@ namespace {
}
//===------------------------------------------------------------------===//
- //! MVT to "useful stuff" mapping structure:
+ //! EVT to "useful stuff" mapping structure:
struct valtype_map_s {
- MVT VT;
+ EVT VT;
unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined)
bool ldresult_imm; /// LDRESULT instruction requires immediate?
unsigned lrinst; /// LR instruction
@@ -178,7 +181,7 @@ namespace {
const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
- const valtype_map_s *getValueTypeMapEntry(MVT VT)
+ const valtype_map_s *getValueTypeMapEntry(EVT VT)
{
const valtype_map_s *retval = 0;
for (size_t i = 0; i < n_valtype_map; ++i) {
@@ -191,10 +194,11 @@ namespace {
#ifndef NDEBUG
if (retval == 0) {
- cerr << "SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns NULL for "
- << VT.getMVTString()
- << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns NULL for "
+ << VT.getEVTString();
+ llvm_report_error(Msg.str());
}
#endif
@@ -249,10 +253,10 @@ namespace {
SPUtli(*tm.getTargetLowering())
{ }
- virtual bool runOnFunction(Function &Fn) {
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
// Make sure we re-emit a set of the global base reg if necessary
GlobalBaseReg = 0;
- SelectionDAGISel::runOnFunction(Fn);
+ SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
@@ -274,8 +278,8 @@ namespace {
}
SDNode *emitBuildVector(SDValue build_vec) {
- MVT vecVT = build_vec.getValueType();
- MVT eltVT = vecVT.getVectorElementType();
+ EVT vecVT = build_vec.getValueType();
+ EVT eltVT = vecVT.getVectorElementType();
SDNode *bvNode = build_vec.getNode();
DebugLoc dl = bvNode->getDebugLoc();
@@ -319,19 +323,19 @@ namespace {
SDNode *Select(SDValue Op);
//! Emit the instruction sequence for i64 shl
- SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);
+ SDNode *SelectSHLi64(SDValue &Op, EVT OpVT);
//! Emit the instruction sequence for i64 srl
- SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);
+ SDNode *SelectSRLi64(SDValue &Op, EVT OpVT);
//! Emit the instruction sequence for i64 sra
- SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);
+ SDNode *SelectSRAi64(SDValue &Op, EVT OpVT);
//! Emit the necessary sequence for loading i64 constants:
- SDNode *SelectI64Constant(SDValue &Op, MVT OpVT, DebugLoc dl);
+ SDNode *SelectI64Constant(SDValue &Op, EVT OpVT, DebugLoc dl);
//! Alternate instruction emit sequence for loading i64 constants
- SDNode *SelectI64Constant(uint64_t i64const, MVT OpVT, DebugLoc dl);
+ SDNode *SelectI64Constant(uint64_t i64const, EVT OpVT, DebugLoc dl);
//! Returns true if the address N is an A-form (local store) address
bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
@@ -375,7 +379,7 @@ namespace {
break;
case 'v': // not offsetable
#if 1
- assert(0 && "InlineAsmMemoryOperand 'v' constraint not handled.");
+ llvm_unreachable("InlineAsmMemoryOperand 'v' constraint not handled.");
#else
SelectAddrIdxOnly(Op, Op, Op0, Op1);
#endif
@@ -430,23 +434,21 @@ bool
SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
SDValue &Index) {
// These match the addr256k operand type:
- MVT OffsVT = MVT::i16;
+ EVT OffsVT = MVT::i16;
SDValue Zero = CurDAG->getTargetConstant(0, OffsVT);
switch (N.getOpcode()) {
case ISD::Constant:
case ISD::ConstantPool:
case ISD::GlobalAddress:
- cerr << "SPU SelectAFormAddr: Constant/Pool/Global not lowered.\n";
- abort();
+ llvm_report_error("SPU SelectAFormAddr: Constant/Pool/Global not lowered.");
/*NOTREACHED*/
case ISD::TargetConstant:
case ISD::TargetGlobalAddress:
case ISD::TargetJumpTable:
- cerr << "SPUSelectAFormAddr: Target Constant/Pool/Global not wrapped as "
- << "A-form address.\n";
- abort();
+ llvm_report_error("SPUSelectAFormAddr: Target Constant/Pool/Global "
+ "not wrapped as A-form address.");
/*NOTREACHED*/
case SPUISD::AFormAddr:
@@ -512,13 +514,13 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
SDValue &Index, int minOffset,
int maxOffset) {
unsigned Opc = N.getOpcode();
- MVT PtrTy = SPUtli.getPointerTy();
+ EVT PtrTy = SPUtli.getPointerTy();
if (Opc == ISD::FrameIndex) {
// Stack frame index must be less than 512 (divided by 16):
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N);
int FI = int(FIN->getIndex());
- DEBUG(cerr << "SelectDFormAddr: ISD::FrameIndex = "
+ DEBUG(errs() << "SelectDFormAddr: ISD::FrameIndex = "
<< FI << "\n");
if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
Base = CurDAG->getTargetConstant(0, PtrTy);
@@ -543,7 +545,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
if (Op0.getOpcode() == ISD::FrameIndex) {
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op0);
int FI = int(FIN->getIndex());
- DEBUG(cerr << "SelectDFormAddr: ISD::ADD offset = " << offset
+ DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset
<< " frame index = " << FI << "\n");
if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
@@ -564,7 +566,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
if (Op1.getOpcode() == ISD::FrameIndex) {
FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op1);
int FI = int(FIN->getIndex());
- DEBUG(cerr << "SelectDFormAddr: ISD::ADD offset = " << offset
+ DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset
<< " frame index = " << FI << "\n");
if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
@@ -690,7 +692,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
unsigned Opc = N->getOpcode();
int n_ops = -1;
unsigned NewOpc;
- MVT OpVT = Op.getValueType();
+ EVT OpVT = Op.getValueType();
SDValue Ops[8];
DebugLoc dl = N->getDebugLoc();
@@ -711,8 +713,9 @@ SPUDAGToDAGISel::Select(SDValue Op) {
} else {
NewOpc = SPU::Ar32;
Ops[0] = CurDAG->getRegister(SPU::R1, Op.getValueType());
- Ops[1] = SDValue(CurDAG->getTargetNode(SPU::ILAr32, dl, Op.getValueType(),
- TFI, Imm0), 0);
+ Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILAr32, dl,
+ Op.getValueType(), TFI, Imm0),
+ 0);
n_ops = 2;
}
} else if (Opc == ISD::Constant && OpVT == MVT::i64) {
@@ -723,17 +726,17 @@ SPUDAGToDAGISel::Select(SDValue Op) {
} else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
&& OpVT == MVT::i64) {
SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType();
- MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
- MVT OpVecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ EVT Op0VT = Op0.getValueType();
+ EVT Op0VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+ Op0VT, (128 / Op0VT.getSizeInBits()));
+ EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(),
+ OpVT, (128 / OpVT.getSizeInBits()));
SDValue shufMask;
- switch (Op0VT.getSimpleVT()) {
+ switch (Op0VT.getSimpleVT().SimpleTy) {
default:
- cerr << "CellSPU Select: Unhandled zero/any extend MVT\n";
- abort();
+ llvm_report_error("CellSPU Select: Unhandled zero/any extend EVT");
/*NOTREACHED*/
- break;
case MVT::i32:
shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
CurDAG->getConstant(0x80808080, MVT::i32),
@@ -811,8 +814,8 @@ SPUDAGToDAGISel::Select(SDValue Op) {
if (shift_amt >= 32) {
SDNode *hi32 =
- CurDAG->getTargetNode(SPU::ORr32_r64, dl, OpVT,
- Op0.getOperand(0));
+ CurDAG->getMachineNode(SPU::ORr32_r64, dl, OpVT,
+ Op0.getOperand(0));
shift_amt -= 32;
if (shift_amt > 0) {
@@ -823,8 +826,8 @@ SPUDAGToDAGISel::Select(SDValue Op) {
if (Op0.getOpcode() == ISD::SRL)
Opc = SPU::ROTMr32;
- hi32 = CurDAG->getTargetNode(Opc, dl, OpVT, SDValue(hi32, 0),
- shift);
+ hi32 = CurDAG->getMachineNode(Opc, dl, OpVT, SDValue(hi32, 0),
+ shift);
}
return hi32;
@@ -856,10 +859,10 @@ SPUDAGToDAGISel::Select(SDValue Op) {
if (OpVT == MVT::v2f64)
Opc = SPU::DFNMSv2f64;
- return CurDAG->getTargetNode(Opc, dl, OpVT,
- Op00.getOperand(0),
- Op00.getOperand(1),
- Op0.getOperand(1));
+ return CurDAG->getMachineNode(Opc, dl, OpVT,
+ Op00.getOperand(0),
+ Op00.getOperand(1),
+ Op0.getOperand(1));
}
}
@@ -876,43 +879,44 @@ SPUDAGToDAGISel::Select(SDValue Op) {
negConst, negConst));
}
- return CurDAG->getTargetNode(Opc, dl, OpVT,
- Op.getOperand(0), SDValue(signMask, 0));
+ return CurDAG->getMachineNode(Opc, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
} else if (Opc == ISD::FABS) {
if (OpVT == MVT::f64) {
SDNode *signMask = SelectI64Constant(0x7fffffffffffffffULL, MVT::i64, dl);
- return CurDAG->getTargetNode(SPU::ANDfabs64, dl, OpVT,
- Op.getOperand(0), SDValue(signMask, 0));
+ return CurDAG->getMachineNode(SPU::ANDfabs64, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
} else if (OpVT == MVT::v2f64) {
SDValue absConst = CurDAG->getConstant(0x7fffffffffffffffULL, MVT::i64);
SDValue absVec = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
absConst, absConst);
SDNode *signMask = emitBuildVector(absVec);
- return CurDAG->getTargetNode(SPU::ANDfabsvec, dl, OpVT,
- Op.getOperand(0), SDValue(signMask, 0));
+ return CurDAG->getMachineNode(SPU::ANDfabsvec, dl, OpVT,
+ Op.getOperand(0), SDValue(signMask, 0));
}
} else if (Opc == SPUISD::LDRESULT) {
// Custom select instructions for LDRESULT
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
SDValue Arg = N->getOperand(0);
SDValue Chain = N->getOperand(1);
SDNode *Result;
const valtype_map_s *vtm = getValueTypeMapEntry(VT);
if (vtm->ldresult_ins == 0) {
- cerr << "LDRESULT for unsupported type: "
- << VT.getMVTString()
- << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "LDRESULT for unsupported type: "
+ << VT.getEVTString();
+ llvm_report_error(Msg.str());
}
Opc = vtm->ldresult_ins;
if (vtm->ldresult_imm) {
SDValue Zero = CurDAG->getTargetConstant(0, VT);
- Result = CurDAG->getTargetNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain);
+ Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain);
} else {
- Result = CurDAG->getTargetNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain);
+ Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain);
}
return Result;
@@ -923,7 +927,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
// SPUInstrInfo catches the following patterns:
// (SPUindirect (SPUhi ...), (SPUlo ...))
// (SPUindirect $sp, imm)
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
RegisterSDNode *RN;
@@ -948,7 +952,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
if (N->hasOneUse())
return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops);
else
- return CurDAG->getTargetNode(NewOpc, dl, OpVT, Ops, n_ops);
+ return CurDAG->getMachineNode(NewOpc, dl, OpVT, Ops, n_ops);
} else
return SelectCode(Op);
}
@@ -966,24 +970,25 @@ SPUDAGToDAGISel::Select(SDValue Op) {
* @return The SDNode with the entire instruction sequence
*/
SDNode *
-SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
+SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, EVT OpVT) {
SDValue Op0 = Op.getOperand(0);
- MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+ OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
- MVT ShiftAmtVT = ShiftAmt.getValueType();
+ EVT ShiftAmtVT = ShiftAmt.getValueType();
SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0;
SDValue SelMaskVal;
DebugLoc dl = Op.getDebugLoc();
- VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+ VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16);
- SelMask = CurDAG->getTargetNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal);
- ZeroFill = CurDAG->getTargetNode(SPU::ILv2i64, dl, VecVT,
- CurDAG->getTargetConstant(0, OpVT));
- VecOp0 = CurDAG->getTargetNode(SPU::SELBv2i64, dl, VecVT,
- SDValue(ZeroFill, 0),
- SDValue(VecOp0, 0),
- SDValue(SelMask, 0));
+ SelMask = CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal);
+ ZeroFill = CurDAG->getMachineNode(SPU::ILv2i64, dl, VecVT,
+ CurDAG->getTargetConstant(0, OpVT));
+ VecOp0 = CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT,
+ SDValue(ZeroFill, 0),
+ SDValue(VecOp0, 0),
+ SDValue(SelMask, 0));
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
@@ -991,35 +996,35 @@ SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
if (bytes > 0) {
Shift =
- CurDAG->getTargetNode(SPU::SHLQBYIv2i64, dl, VecVT,
- SDValue(VecOp0, 0),
- CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::SHLQBYIv2i64, dl, VecVT,
+ SDValue(VecOp0, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
Shift =
- CurDAG->getTargetNode(SPU::SHLQBIIv2i64, dl, VecVT,
- SDValue((Shift != 0 ? Shift : VecOp0), 0),
- CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::SHLQBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : VecOp0), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *Bytes =
- CurDAG->getTargetNode(SPU::ROTMIr32, dl, ShiftAmtVT,
- ShiftAmt,
- CurDAG->getTargetConstant(3, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(3, ShiftAmtVT));
SDNode *Bits =
- CurDAG->getTargetNode(SPU::ANDIr32, dl, ShiftAmtVT,
- ShiftAmt,
- CurDAG->getTargetConstant(7, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(7, ShiftAmtVT));
Shift =
- CurDAG->getTargetNode(SPU::SHLQBYv2i64, dl, VecVT,
- SDValue(VecOp0, 0), SDValue(Bytes, 0));
+ CurDAG->getMachineNode(SPU::SHLQBYv2i64, dl, VecVT,
+ SDValue(VecOp0, 0), SDValue(Bytes, 0));
Shift =
- CurDAG->getTargetNode(SPU::SHLQBIv2i64, dl, VecVT,
- SDValue(Shift, 0), SDValue(Bits, 0));
+ CurDAG->getMachineNode(SPU::SHLQBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(Bits, 0));
}
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
}
/*!
@@ -1031,15 +1036,16 @@ SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) {
* @return The SDNode with the entire instruction sequence
*/
SDNode *
-SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
+SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, EVT OpVT) {
SDValue Op0 = Op.getOperand(0);
- MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+ OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
- MVT ShiftAmtVT = ShiftAmt.getValueType();
+ EVT ShiftAmtVT = ShiftAmt.getValueType();
SDNode *VecOp0, *Shift = 0;
DebugLoc dl = Op.getDebugLoc();
- VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+ VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
@@ -1047,45 +1053,45 @@ SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
if (bytes > 0) {
Shift =
- CurDAG->getTargetNode(SPU::ROTQMBYIv2i64, dl, VecVT,
- SDValue(VecOp0, 0),
- CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTQMBYIv2i64, dl, VecVT,
+ SDValue(VecOp0, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
Shift =
- CurDAG->getTargetNode(SPU::ROTQMBIIv2i64, dl, VecVT,
- SDValue((Shift != 0 ? Shift : VecOp0), 0),
- CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTQMBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : VecOp0), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *Bytes =
- CurDAG->getTargetNode(SPU::ROTMIr32, dl, ShiftAmtVT,
- ShiftAmt,
- CurDAG->getTargetConstant(3, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(3, ShiftAmtVT));
SDNode *Bits =
- CurDAG->getTargetNode(SPU::ANDIr32, dl, ShiftAmtVT,
- ShiftAmt,
- CurDAG->getTargetConstant(7, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT,
+ ShiftAmt,
+ CurDAG->getTargetConstant(7, ShiftAmtVT));
// Ensure that the shift amounts are negated!
- Bytes = CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
- SDValue(Bytes, 0),
- CurDAG->getTargetConstant(0, ShiftAmtVT));
+ Bytes = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+ SDValue(Bytes, 0),
+ CurDAG->getTargetConstant(0, ShiftAmtVT));
- Bits = CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
- SDValue(Bits, 0),
- CurDAG->getTargetConstant(0, ShiftAmtVT));
+ Bits = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+ SDValue(Bits, 0),
+ CurDAG->getTargetConstant(0, ShiftAmtVT));
Shift =
- CurDAG->getTargetNode(SPU::ROTQMBYv2i64, dl, VecVT,
- SDValue(VecOp0, 0), SDValue(Bytes, 0));
+ CurDAG->getMachineNode(SPU::ROTQMBYv2i64, dl, VecVT,
+ SDValue(VecOp0, 0), SDValue(Bytes, 0));
Shift =
- CurDAG->getTargetNode(SPU::ROTQMBIv2i64, dl, VecVT,
- SDValue(Shift, 0), SDValue(Bits, 0));
+ CurDAG->getMachineNode(SPU::ROTQMBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(Bits, 0));
}
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
}
/*!
@@ -1097,33 +1103,34 @@ SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) {
* @return The SDNode with the entire instruction sequence
*/
SDNode *
-SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
+SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, EVT OpVT) {
// Promote Op0 to vector
- MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+ OpVT, (128 / OpVT.getSizeInBits()));
SDValue ShiftAmt = Op.getOperand(1);
- MVT ShiftAmtVT = ShiftAmt.getValueType();
+ EVT ShiftAmtVT = ShiftAmt.getValueType();
DebugLoc dl = Op.getDebugLoc();
SDNode *VecOp0 =
- CurDAG->getTargetNode(SPU::ORv2i64_i64, dl, VecVT, Op.getOperand(0));
+ CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op.getOperand(0));
SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT);
SDNode *SignRot =
- CurDAG->getTargetNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64,
- SDValue(VecOp0, 0), SignRotAmt);
+ CurDAG->getMachineNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64,
+ SDValue(VecOp0, 0), SignRotAmt);
SDNode *UpperHalfSign =
- CurDAG->getTargetNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0));
+ CurDAG->getMachineNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0));
SDNode *UpperHalfSignMask =
- CurDAG->getTargetNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0));
+ CurDAG->getMachineNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0));
SDNode *UpperLowerMask =
- CurDAG->getTargetNode(SPU::FSMBIv2i64, dl, VecVT,
- CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
+ CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT,
+ CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
SDNode *UpperLowerSelect =
- CurDAG->getTargetNode(SPU::SELBv2i64, dl, VecVT,
- SDValue(UpperHalfSignMask, 0),
- SDValue(VecOp0, 0),
- SDValue(UpperLowerMask, 0));
+ CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT,
+ SDValue(UpperHalfSignMask, 0),
+ SDValue(VecOp0, 0),
+ SDValue(UpperLowerMask, 0));
SDNode *Shift = 0;
@@ -1134,46 +1141,46 @@ SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
if (bytes > 0) {
bytes = 31 - bytes;
Shift =
- CurDAG->getTargetNode(SPU::ROTQBYIv2i64, dl, VecVT,
- SDValue(UpperLowerSelect, 0),
- CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTQBYIv2i64, dl, VecVT,
+ SDValue(UpperLowerSelect, 0),
+ CurDAG->getTargetConstant(bytes, ShiftAmtVT));
}
if (bits > 0) {
bits = 8 - bits;
Shift =
- CurDAG->getTargetNode(SPU::ROTQBIIv2i64, dl, VecVT,
- SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
- CurDAG->getTargetConstant(bits, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::ROTQBIIv2i64, dl, VecVT,
+ SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
+ CurDAG->getTargetConstant(bits, ShiftAmtVT));
}
} else {
SDNode *NegShift =
- CurDAG->getTargetNode(SPU::SFIr32, dl, ShiftAmtVT,
- ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
+ CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+ ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
Shift =
- CurDAG->getTargetNode(SPU::ROTQBYBIv2i64_r32, dl, VecVT,
- SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
+ CurDAG->getMachineNode(SPU::ROTQBYBIv2i64_r32, dl, VecVT,
+ SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
Shift =
- CurDAG->getTargetNode(SPU::ROTQBIv2i64, dl, VecVT,
- SDValue(Shift, 0), SDValue(NegShift, 0));
+ CurDAG->getMachineNode(SPU::ROTQBIv2i64, dl, VecVT,
+ SDValue(Shift, 0), SDValue(NegShift, 0));
}
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
}
/*!
 Do the magic necessary to load an i64 constant
*/
-SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT,
+SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, EVT OpVT,
DebugLoc dl) {
ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
return SelectI64Constant(CN->getZExtValue(), OpVT, dl);
}
-SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, MVT OpVT,
+SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT,
DebugLoc dl) {
- MVT OpVecVT = MVT::getVectorVT(OpVT, 2);
+ EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, 2);
SDValue i64vec =
SPU::LowerV2I64Splat(OpVecVT, *CurDAG, Value64, dl);
@@ -1186,8 +1193,8 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, MVT OpVT,
SDValue Op0 = i64vec.getOperand(0);
ReplaceUses(i64vec, Op0);
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
- SDValue(emitBuildVector(Op0), 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(emitBuildVector(Op0), 0));
} else if (i64vec.getOpcode() == SPUISD::SHUFB) {
SDValue lhs = i64vec.getOperand(0);
SDValue rhs = i64vec.getOperand(1);
@@ -1225,14 +1232,14 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, MVT OpVT,
SDValue(lhsNode, 0), SDValue(rhsNode, 0),
SDValue(shufMaskNode, 0)));
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
- SDValue(shufNode, 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(shufNode, 0));
} else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) {
- return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
- SDValue(emitBuildVector(i64vec), 0));
+ return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+ SDValue(emitBuildVector(i64vec), 0));
} else {
- cerr << "SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec condition\n";
- abort();
+ llvm_report_error("SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec"
+ "condition");
}
}
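This file's conversion tracks the MVT/EVT split: MVT is reduced to the simple-type enum wrapper, and anything that may be an extended type goes through EVT with an LLVMContext. A minimal sketch of the new construction pattern (evtDemo is a made-up name):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/LLVMContext.h"
using namespace llvm;

void evtDemo(LLVMContext &Context) {
  // Extended VTs now need a context, exactly as in SelectSHLi64 above.
  EVT VecVT = EVT::getVectorVT(Context, MVT::i64, 2); // v2i64
  if (VecVT.isSimple()) {
    MVT Simple = VecVT.getSimpleVT(); // safe: v2i64 has a SimpleValueType
    (void)Simple.SimpleTy;            // the enum the switches above dispatch on
  }
}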
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index d8a7776..aaf0783 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -15,8 +15,9 @@
#include "SPUISelLowering.h"
#include "SPUTargetMachine.h"
#include "SPUFrameInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
#include "llvm/CallingConv.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -24,13 +25,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Intrinsics.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetOptions.h"
-
+#include "llvm/Support/raw_ostream.h"
#include <map>
using namespace llvm;
@@ -39,10 +40,10 @@ using namespace llvm;
namespace {
std::map<unsigned, const char *> node_names;
- //! MVT mapping to useful data for Cell SPU
+ //! EVT mapping to useful data for Cell SPU
struct valtype_map_s {
- const MVT valtype;
- const int prefslot_byte;
+ EVT valtype;
+ int prefslot_byte;
};
const valtype_map_s valtype_map[] = {
@@ -58,7 +59,7 @@ namespace {
const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
- const valtype_map_s *getValueTypeMapEntry(MVT VT) {
+ const valtype_map_s *getValueTypeMapEntry(EVT VT) {
const valtype_map_s *retval = 0;
for (size_t i = 0; i < n_valtype_map; ++i) {
@@ -70,10 +71,11 @@ namespace {
#ifndef NDEBUG
if (retval == 0) {
- cerr << "getValueTypeMapEntry returns NULL for "
- << VT.getMVTString()
- << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "getValueTypeMapEntry returns NULL for "
+ << VT.getEVTString();
+ llvm_report_error(Msg.str());
}
#endif
@@ -98,8 +100,8 @@ namespace {
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
- MVT ArgVT = Op.getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForMVT();
+ EVT ArgVT = Op.getOperand(i).getValueType();
+ const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op.getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
@@ -110,10 +112,13 @@ namespace {
TLI.getPointerTy());
// Splice the libcall in wherever FindInputOutputChains tells us to.
- const Type *RetTy = Op.getNode()->getValueType(0).getTypeForMVT();
+ const Type *RetTy =
+ Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
std::pair<SDValue, SDValue> CallInfo =
TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
- 0, CallingConv::C, false, Callee, Args, DAG,
+ 0, TLI.getLibcallCallingConv(LC), false,
+ /*isReturnValueUsed=*/true,
+ Callee, Args, DAG,
Op.getDebugLoc());
return CallInfo.first;
@@ -121,9 +126,8 @@ namespace {
}
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
- : TargetLowering(TM),
- SPUTM(TM)
-{
+ : TargetLowering(TM, new TargetLoweringObjectFileELF()),
+ SPUTM(TM) {
// Fold away setcc operations if possible.
setPow2DivIsCheap();
@@ -151,6 +155,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+ setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+ setTruncStoreAction(MVT::i128, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i128, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i128, MVT::i8, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
// SPU constant load actions are custom lowered:
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -158,7 +169,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// SPU's loads and stores have to be custom lowered:
for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
++sctype) {
- MVT VT = (MVT::SimpleValueType)sctype;
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
@@ -167,20 +178,20 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
- MVT StoreVT = (MVT::SimpleValueType) stype;
+ MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
setTruncStoreAction(VT, StoreVT, Expand);
}
}
for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
++sctype) {
- MVT VT = (MVT::SimpleValueType) sctype;
+ MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
- MVT StoreVT = (MVT::SimpleValueType) stype;
+ MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
setTruncStoreAction(VT, StoreVT, Expand);
}
}
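On the loop rewrites in this hunk: after the MVT/EVT split, these setXXXAction tables are keyed by the MVT::SimpleValueType enum, so the loop variable's cast target changes while the integer iteration stays the same. The idiom, as used in the constructor above:

    // Expand every truncating store from a wider integer type to each
    // narrower one; the enum values are contiguous, so plain unsigned
    // arithmetic walks the type ladder.
    for (unsigned s = (unsigned) MVT::i8; s < (unsigned) MVT::i128; ++s) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType) s;
      for (unsigned t = s - 1; t >= (unsigned) MVT::i8; --t)
        setTruncStoreAction(VT, (MVT::SimpleValueType) t, Expand);
    }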
@@ -199,11 +210,37 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// SPU has no intrinsics for these particular operations:
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
- // SPU has no SREM/UREM instructions
- setOperationAction(ISD::SREM, MVT::i32, Expand);
- setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
+ // SPU has no division/remainder instructions
+ setOperationAction(ISD::SREM, MVT::i8, Expand);
+ setOperationAction(ISD::UREM, MVT::i8, Expand);
+ setOperationAction(ISD::SDIV, MVT::i8, Expand);
+ setOperationAction(ISD::UDIV, MVT::i8, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i8, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Expand);
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i128, Expand);
+ setOperationAction(ISD::UREM, MVT::i128, Expand);
+ setOperationAction(ISD::SDIV, MVT::i128, Expand);
+ setOperationAction(ISD::UDIV, MVT::i128, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
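The run of Expand markings above admits a compact equivalent; a hedged rewrite (same effect, the loop structure is mine) that makes the intent, no hardware divide or remainder at any integer width, easier to scan:

    // Inside the SPUTargetLowering constructor: expand all integer
    // division/remainder operations so the legalizer emits libcalls.
    static const MVT::SimpleValueType IntTys[] =
      { MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::i128 };
    static const unsigned DivRemOps[] =
      { ISD::SREM, ISD::UREM, ISD::SDIV, ISD::UDIV,
        ISD::SDIVREM, ISD::UDIVREM };
    for (unsigned i = 0; i != 5; ++i)
      for (unsigned j = 0; j != 6; ++j)
        setOperationAction(DivRemOps[j], IntTys[i], Expand);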
// We don't support sin/cos/sqrt/fmod
setOperationAction(ISD::FSIN , MVT::f64, Expand);
@@ -283,11 +320,19 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::CTPOP, MVT::i16, Custom);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i128, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i8, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i16, Expand);
setOperationAction(ISD::CTTZ , MVT::i32, Expand);
setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ , MVT::i128, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i8, Promote);
+ setOperationAction(ISD::CTLZ , MVT::i16, Promote);
setOperationAction(ISD::CTLZ , MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i128, Expand);
// SPU has a version of select that implements (a&~c)|(b&c), just like
// select ought to work:
@@ -305,10 +350,21 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// Custom lower i128 -> i64 truncates
setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
+ // Custom lower i32/i64 -> i128 sign extend
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// SPU has a legal FP -> signed INT instruction for f32, but for f64, need
// to expand to a libcall, hence the custom lowering:
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
// FDIV on SPU requires custom lowering
setOperationAction(ISD::FDIV, MVT::f64, Expand); // to libcall
@@ -339,16 +395,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// appropriate instructions to materialize the address.
for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
++sctype) {
- MVT VT = (MVT::SimpleValueType)sctype;
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
setOperationAction(ISD::GlobalAddress, VT, Custom);
setOperationAction(ISD::ConstantPool, VT, Custom);
setOperationAction(ISD::JumpTable, VT, Custom);
}
- // RET must be custom lowered, to meet ABI requirements
- setOperationAction(ISD::RET, MVT::Other, Custom);
-
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -385,7 +438,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
@@ -461,9 +514,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
- node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
- node_names[(unsigned) SPUISD::VEC_SRL] = "SPUISD::VEC_SRL";
- node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
@@ -490,9 +540,11 @@ unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
// Return the Cell SPU's SETCC result type
//===----------------------------------------------------------------------===//
-MVT SPUTargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
// i16 and i32 are valid SETCC result types
- return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32);
+ return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ?
+ VT.getSimpleVT().SimpleTy :
+ MVT::i32);
}
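The hook above now takes an EVT and returns the simple enum; its logic restated standalone (assuming EVT's comparisons against simple types, as elsewhere in this patch):

    // i8/i16/i32 comparisons keep their own width as the SETCC result
    // type; everything else (f32/f64, i64, vectors) is forced to i32.
    static MVT::SimpleValueType spuSetCCResultType(EVT VT) {
      if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32)
        return VT.getSimpleVT().SimpleTy;
      return MVT::i32;
    }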
//===----------------------------------------------------------------------===//
@@ -525,9 +577,9 @@ static SDValue
LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
LoadSDNode *LN = cast<LoadSDNode>(Op);
SDValue the_chain = LN->getChain();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
- MVT InVT = LN->getMemoryVT();
- MVT OutVT = Op.getValueType();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT InVT = LN->getMemoryVT();
+ EVT OutVT = Op.getValueType();
ISD::LoadExtType ExtType = LN->getExtensionType();
unsigned alignment = LN->getAlignment();
const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
@@ -632,7 +684,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
// Convert the loaded v16i8 vector to the appropriate vector type
// specified by the operand:
- MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
+ EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT, (128 / InVT.getSizeInBits()));
result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
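EVT::getVectorVT now threads the LLVMContext, as this hunk shows; the quadword arithmetic recurs throughout the file and could be captured in a helper (the helper name is mine):

    // An SPU local-store access always moves a 128-bit quadword, so a
    // scalar of width W is modeled as a v(128/W) vector of that type.
    static EVT getQuadwordVT(SelectionDAG &DAG, EVT ScalarVT) {
      return EVT::getVectorVT(*DAG.getContext(), ScalarVT,
                              128 / ScalarVT.getSizeInBits());
    }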
@@ -665,11 +718,15 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
case ISD::POST_INC:
case ISD::POST_DEC:
case ISD::LAST_INDEXED_MODE:
- cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
+ {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
"UNINDEXED\n";
- cerr << (unsigned) LN->getAddressingMode() << "\n";
- abort();
- /*NOTREACHED*/
+ Msg << (unsigned) LN->getAddressingMode();
+ llvm_report_error(Msg.str());
+ /*NOTREACHED*/
+ }
}
return SDValue();
@@ -685,17 +742,19 @@ static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
StoreSDNode *SN = cast<StoreSDNode>(Op);
SDValue Value = SN->getValue();
- MVT VT = Value.getValueType();
- MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT VT = Value.getValueType();
+ EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
DebugLoc dl = Op.getDebugLoc();
unsigned alignment = SN->getAlignment();
switch (SN->getAddressingMode()) {
case ISD::UNINDEXED: {
// The vector type we really want to load from the 16-byte chunk.
- MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
- stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
+ EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+ VT, (128 / VT.getSizeInBits())),
+ stVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT, (128 / StVT.getSizeInBits()));
SDValue alignLoadVec;
SDValue basePtr = SN->getBasePtr();
@@ -790,9 +849,9 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
// to the stack pointer, which is always aligned.
#if !defined(NDEBUG)
if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
- cerr << "CellSPU LowerSTORE: basePtr = ";
+ errs() << "CellSPU LowerSTORE: basePtr = ";
basePtr.getNode()->dump(&DAG);
- cerr << "\n";
+ errs() << "\n";
}
#endif
@@ -815,9 +874,9 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
const SDValue &currentRoot = DAG.getRoot();
DAG.setRoot(result);
- cerr << "------- CellSPU:LowerStore result:\n";
+ errs() << "------- CellSPU:LowerStore result:\n";
DAG.dump();
- cerr << "-------\n";
+ errs() << "-------\n";
DAG.setRoot(currentRoot);
}
#endif
@@ -830,20 +889,24 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
case ISD::POST_INC:
case ISD::POST_DEC:
case ISD::LAST_INDEXED_MODE:
- cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
+ {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
"UNINDEXED\n";
- cerr << (unsigned) SN->getAddressingMode() << "\n";
- abort();
- /*NOTREACHED*/
+ Msg << (unsigned) SN->getAddressingMode();
+ llvm_report_error(Msg.str());
+ /*NOTREACHED*/
+ }
}
return SDValue();
}
//! Generate the address of a constant pool entry.
-SDValue
+static SDValue
LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
Constant *C = CP->getConstVal();
SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
@@ -863,9 +926,8 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
}
}
- assert(0 &&
- "LowerConstantPool: Relocation model other than static"
- " not supported.");
+ llvm_unreachable("LowerConstantPool: Relocation model other than static"
+ " not supported.");
return SDValue();
}
@@ -877,7 +939,7 @@ SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM
static SDValue
LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
SDValue Zero = DAG.getConstant(0, PtrVT);
@@ -895,14 +957,14 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
}
}
- assert(0 &&
- "LowerJumpTable: Relocation model other than static not supported.");
+ llvm_unreachable("LowerJumpTable: Relocation model other than static"
+ " not supported.");
return SDValue();
}
static SDValue
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
GlobalValue *GV = GSDN->getGlobal();
SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
@@ -920,9 +982,8 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
}
} else {
- cerr << "LowerGlobalAddress: Relocation model other than static not "
- << "supported.\n";
- abort();
+ llvm_report_error("LowerGlobalAddress: Relocation model other than static"
+ "not supported.");
/*NOTREACHED*/
}
@@ -932,7 +993,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
//! Custom lower double precision floating point constants
static SDValue
LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
// FIXME there is no actual debug info here
DebugLoc dl = Op.getDebugLoc();
@@ -952,16 +1013,17 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue
-LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
-{
+SDValue
+SPUTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- SmallVector<SDValue, 48> ArgValues;
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- DebugLoc dl = Op.getDebugLoc();
const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
@@ -970,24 +1032,24 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
unsigned ArgRegIdx = 0;
unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Add DAG nodes to load the arguments or copy them out of registers.
- for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
- ArgNo != e; ++ArgNo) {
- MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+ EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
SDValue ArgVal;
if (ArgRegIdx < NumArgRegs) {
const TargetRegisterClass *ArgRegClass;
- switch (ObjectVT.getSimpleVT()) {
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
default: {
- cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
- << ObjectVT.getMVTString()
- << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "LowerFormalArguments Unhandled argument type: "
+ << ObjectVT.getEVTString();
+ llvm_report_error(Msg.str());
}
case MVT::i8:
ArgRegClass = &SPU::R8CRegClass;
@@ -1022,7 +1084,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
- ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++ArgRegIdx;
} else {
// We need to load the argument to a virtual register if we determined
@@ -1030,13 +1092,13 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
// or we're forced to do vararg
int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
ArgOffset += StackSlotSize;
}
- ArgValues.push_back(ArgVal);
+ InVals.push_back(ArgVal);
// Update the chain
- Root = ArgVal.getOperand(0);
+ Chain = ArgVal.getOperand(0);
}
// vararg handling:
@@ -1051,23 +1113,19 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset);
SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8);
- SDValue Store = DAG.getStore(Root, dl, ArgVal, FIN, NULL, 0);
- Root = Store.getOperand(0);
+ SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0);
+ Chain = Store.getOperand(0);
MemOps.push_back(Store);
// Increment address by stack slot size for the next stored argument
ArgOffset += StackSlotSize;
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size());
+ return Chain;
}
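The rewrite above replaces pattern-matching a FORMAL_ARGUMENTS node with overriding a virtual hook that fills InVals and returns the chain. A skeletal override under the new contract (class name and placeholder body are illustrative only):

    SDValue
    MyTargetLowering::LowerFormalArguments(SDValue Chain,
        CallingConv::ID CallConv, bool isVarArg,
        const SmallVectorImpl<ISD::InputArg> &Ins,
        DebugLoc dl, SelectionDAG &DAG,
        SmallVectorImpl<SDValue> &InVals) {
      // One SDValue per incoming argument: copy it out of its argument
      // register or load it from a fixed stack object, then push it.
      for (unsigned i = 0, e = Ins.size(); i != e; ++i)
        InVals.push_back(DAG.getUNDEF(Ins[i].VT)); // placeholder only
      // Return the updated chain; there is no MERGE_VALUES node anymore.
      return Chain;
    }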
/// isLSAAddress - Return the immediate to use if the specified
@@ -1084,19 +1142,23 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
}
-static SDValue
-LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- unsigned NumOps = TheCall->getNumArgs();
+SDValue
+SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
+ unsigned NumOps = Outs.size();
unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
- DebugLoc dl = TheCall->getDebugLoc();
// Handy pointer type
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Accumulate how many bytes are to be pushed on the stack, including the
// linkage area, and parameter passing area. According to the SPU ABI,
@@ -1119,15 +1181,15 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
- SDValue Arg = TheCall->getArg(i);
+ SDValue Arg = Outs[i].Val;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
- switch (Arg.getValueType().getSimpleVT()) {
- default: assert(0 && "Unexpected ValueType for argument!");
+ switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i8:
case MVT::i16:
case MVT::i32:
@@ -1193,7 +1255,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
// node so that legalize doesn't hack it.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
GlobalValue *GV = G->getGlobal();
- MVT CalleeVT = Callee.getValueType();
+ EVT CalleeVT = Callee.getValueType();
SDValue Zero = DAG.getConstant(0, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT);
@@ -1217,7 +1279,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- MVT CalleeVT = Callee.getValueType();
+ EVT CalleeVT = Callee.getValueType();
SDValue Zero = DAG.getConstant(0, PtrVT);
SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
Callee.getValueType());
@@ -1251,50 +1313,46 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
DAG.getIntPtrConstant(0, true), InFlag);
- if (TheCall->getValueType(0) != MVT::Other)
+ if (!Ins.empty())
InFlag = Chain.getValue(1);
- SDValue ResultVals[3];
- unsigned NumResults = 0;
+ // If the function returns void, just return the chain.
+ if (Ins.empty())
+ return Chain;
// If the call has results, copy the values out of the ret val registers.
- switch (TheCall->getValueType(0).getSimpleVT()) {
- default: assert(0 && "Unexpected ret value!");
+ switch (Ins[0].VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected ret value!");
case MVT::Other: break;
case MVT::i32:
- if (TheCall->getValueType(1) == MVT::i32) {
+ if (Ins.size() > 1 && Ins[1].VT == MVT::i32) {
Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4,
MVT::i32, InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
+ InVals.push_back(Chain.getValue(0));
Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
Chain.getValue(2)).getValue(1);
- ResultVals[1] = Chain.getValue(0);
- NumResults = 2;
+ InVals.push_back(Chain.getValue(0));
} else {
Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
- NumResults = 1;
+ InVals.push_back(Chain.getValue(0));
}
break;
case MVT::i64:
Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64,
InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
- NumResults = 1;
+ InVals.push_back(Chain.getValue(0));
break;
case MVT::i128:
Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128,
InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
- NumResults = 1;
+ InVals.push_back(Chain.getValue(0));
break;
case MVT::f32:
case MVT::f64:
- Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT,
InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
- NumResults = 1;
+ InVals.push_back(Chain.getValue(0));
break;
case MVT::v2f64:
case MVT::v2i64:
@@ -1302,31 +1360,25 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
- Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
+ Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT,
InFlag).getValue(1);
- ResultVals[0] = Chain.getValue(0);
- NumResults = 1;
+ InVals.push_back(Chain.getValue(0));
break;
}
- // If the function returns void, just return the chain.
- if (NumResults == 0)
- return Chain;
-
- // Otherwise, merge everything together with a MERGE_VALUES node.
- ResultVals[NumResults++] = Chain;
- SDValue Res = DAG.getMergeValues(ResultVals, NumResults, dl);
- return Res.getValue(Op.getResNo());
+ return Chain;
}
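LowerCall's result handling above leans on the multi-result plumbing of CopyFromReg; an annotated excerpt of the pattern (register and type as in the patch):

    // CopyFromReg yields (value, chain, glue).  getValue(1) rebinds
    // Chain to the fresh chain result; the copied value is then read
    // back off the same node as result 0.
    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
                               InFlag).getValue(1);
    InVals.push_back(Chain.getValue(0));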
-static SDValue
-LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
+SDValue
+SPUTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
- CCState CCInfo(CC, isVarArg, TM, RVLocs);
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_SPU);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
@@ -1335,7 +1387,6 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- SDValue Chain = Op.getOperand(0);
SDValue Flag;
// Copy the result values into the output registers.
@@ -1343,7 +1394,7 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
Flag = Chain.getValue(1);
}
@@ -1384,7 +1435,7 @@ getVecImm(SDNode *N) {
/// and the value fits into an unsigned 18-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType) {
+ EVT ValueType) {
if (ConstantSDNode *CN = getVecImm(N)) {
uint64_t Value = CN->getZExtValue();
if (ValueType == MVT::i64) {
@@ -1406,7 +1457,7 @@ SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
/// and the value fits into a signed 16-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType) {
+ EVT ValueType) {
if (ConstantSDNode *CN = getVecImm(N)) {
int64_t Value = CN->getSExtValue();
if (ValueType == MVT::i64) {
@@ -1429,7 +1480,7 @@ SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
/// and the value fits into a signed 10-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType) {
+ EVT ValueType) {
if (ConstantSDNode *CN = getVecImm(N)) {
int64_t Value = CN->getSExtValue();
if (ValueType == MVT::i64) {
@@ -1455,7 +1506,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType) {
+ EVT ValueType) {
if (ConstantSDNode *CN = getVecImm(N)) {
int Value = (int) CN->getZExtValue();
if (ValueType == MVT::i16
@@ -1474,7 +1525,7 @@ SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
/// and the value fits into a signed 16-bit constant, and if so, return the
/// constant
SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType) {
+ EVT ValueType) {
if (ConstantSDNode *CN = getVecImm(N)) {
uint64_t Value = CN->getZExtValue();
if ((ValueType == MVT::i32
@@ -1505,10 +1556,10 @@ SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
}
//! Lower a BUILD_VECTOR instruction creatively:
-SDValue
+static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
- MVT EltVT = VT.getVectorElementType();
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
DebugLoc dl = Op.getDebugLoc();
BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
@@ -1528,13 +1579,15 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
uint64_t SplatBits = APSplatBits.getZExtValue();
- switch (VT.getSimpleVT()) {
- default:
- cerr << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
- << VT.getMVTString()
- << "\n";
- abort();
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
+ << VT.getEVTString();
+ llvm_report_error(Msg.str());
/*NOTREACHED*/
+ }
case MVT::v4f32: {
uint32_t Value32 = uint32_t(SplatBits);
assert(SplatBitSize == 32
@@ -1591,7 +1644,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
/*!
*/
SDValue
-SPU::LowerV2I64Splat(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
+SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
DebugLoc dl) {
uint32_t upper = uint32_t(SplatVal >> 32);
uint32_t lower = uint32_t(SplatVal);
@@ -1704,8 +1757,8 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// If we have a single element being moved from V1 to V2, this can be handled
// using the C*[DX] compute mask instructions, but the vector elements have
// to be monotonically increasing with one exception element.
- MVT VecVT = V1.getValueType();
- MVT EltVT = VecVT.getVectorElementType();
+ EVT VecVT = V1.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
unsigned EltsFromV2 = 0;
unsigned V2Elt = 0;
unsigned V2EltIdx0 = 0;
@@ -1725,7 +1778,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
} else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
V2EltIdx0 = 2;
} else
- assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE");
+ llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
for (unsigned i = 0; i != MaxElts; ++i) {
if (SVN->getMaskElt(i) < 0)
@@ -1770,7 +1823,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Initialize temporary register to 0
SDValue InitTempReg =
DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT));
@@ -1816,13 +1869,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
SmallVector<SDValue, 16> ConstVecValues;
- MVT VT;
+ EVT VT;
size_t n_copies;
// Create a constant vector:
- switch (Op.getValueType().getSimpleVT()) {
- default: assert(0 && "Unexpected constant value type in "
- "LowerSCALAR_TO_VECTOR");
+ switch (Op.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected constant value type in "
+ "LowerSCALAR_TO_VECTOR");
case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
@@ -1839,8 +1892,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
&ConstVecValues[0], ConstVecValues.size());
} else {
// Otherwise, copy the value from one register to another:
- switch (Op0.getValueType().getSimpleVT()) {
- default: assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR");
+ switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
case MVT::i8:
case MVT::i16:
case MVT::i32:
@@ -1855,7 +1908,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue N = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
DebugLoc dl = Op.getDebugLoc();
@@ -1867,13 +1920,13 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
// sanity checks:
if (VT == MVT::i8 && EltNo >= 16)
- assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
+ llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
else if (VT == MVT::i16 && EltNo >= 8)
- assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
+ llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
else if (VT == MVT::i32 && EltNo >= 4)
- assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
+ llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
else if (VT == MVT::i64 && EltNo >= 2)
- assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
+ llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
// i32 and i64: Element 0 is the preferred slot
@@ -1884,7 +1937,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
int prefslot_begin = -1, prefslot_end = -1;
int elt_byte = EltNo * VT.getSizeInBits() / 8;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default:
assert(false && "Invalid value type!");
case MVT::i8: {
@@ -1910,7 +1963,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
assert(prefslot_begin != -1 && prefslot_end != -1 &&
"LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
- unsigned int ShufBytes[16];
+ unsigned int ShufBytes[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
for (int i = 0; i < 16; ++i) {
      // zero fill upper part of preferred slot, don't care about the
// other slots:
@@ -1946,10 +2001,10 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
} else {
// Variable index: Rotate the requested element into slot 0, then replicate
// slot 0 across the vector
- MVT VecVT = N.getValueType();
+ EVT VecVT = N.getValueType();
if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
- cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n";
- abort();
+ llvm_report_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
+ "vector type!");
}
// Make life easier by making sure the index is zero-extended to i32
@@ -1974,10 +2029,10 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
// consistency with the notion of a unified register set)
SDValue replicate;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default:
- cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n";
- abort();
+ llvm_report_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
+ "type");
/*NOTREACHED*/
case MVT::i8: {
SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
@@ -2021,12 +2076,12 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
SDValue ValOp = Op.getOperand(1);
SDValue IdxOp = Op.getOperand(2);
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Use $sp ($1) because it's always 16-byte aligned and it's available:
SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
DAG.getRegister(SPU::R1, PtrVT),
@@ -2047,12 +2102,12 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
{
SDValue N0 = Op.getOperand(0); // Everything has at least one operand
DebugLoc dl = Op.getDebugLoc();
- MVT ShiftVT = TLI.getShiftAmountTy();
+ EVT ShiftVT = TLI.getShiftAmountTy();
assert(Op.getValueType() == MVT::i8);
switch (Opc) {
default:
- assert(0 && "Unhandled i8 math operator");
+ llvm_unreachable("Unhandled i8 math operator");
/*NOTREACHED*/
break;
case ISD::ADD: {
@@ -2078,7 +2133,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
case ISD::ROTR:
case ISD::ROTL: {
SDValue N1 = Op.getOperand(1);
- MVT N1VT = N1.getValueType();
+ EVT N1VT = N1.getValueType();
N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
if (!N1VT.bitsEq(ShiftVT)) {
@@ -2101,7 +2156,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
case ISD::SRL:
case ISD::SHL: {
SDValue N1 = Op.getOperand(1);
- MVT N1VT = N1.getValueType();
+ EVT N1VT = N1.getValueType();
N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
if (!N1VT.bitsEq(ShiftVT)) {
@@ -2118,7 +2173,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
}
case ISD::SRA: {
SDValue N1 = Op.getOperand(1);
- MVT N1VT = N1.getValueType();
+ EVT N1VT = N1.getValueType();
N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
if (!N1VT.bitsEq(ShiftVT)) {
@@ -2151,7 +2206,7 @@ static SDValue
LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
SDValue ConstVec;
SDValue Arg;
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
ConstVec = Op.getOperand(0);
@@ -2202,11 +2257,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
ones per byte, which then have to be accumulated.
*/
static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
- MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+ EVT VT = Op.getValueType();
+ EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+ VT, (128 / VT.getSizeInBits()));
DebugLoc dl = Op.getDebugLoc();
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default:
assert(false && "Invalid value type!");
case MVT::i8: {
@@ -2312,9 +2368,9 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
*/
static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
SPUTargetLowering &TLI) {
- MVT OpVT = Op.getValueType();
+ EVT OpVT = Op.getValueType();
SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType();
+ EVT Op0VT = Op0.getValueType();
if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
|| OpVT == MVT::i64) {
@@ -2338,9 +2394,9 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
*/
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
SPUTargetLowering &TLI) {
- MVT OpVT = Op.getValueType();
+ EVT OpVT = Op.getValueType();
SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType();
+ EVT Op0VT = Op0.getValueType();
if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
|| Op0VT == MVT::i64) {
@@ -2369,12 +2425,12 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
SDValue lhs = Op.getOperand(0);
SDValue rhs = Op.getOperand(1);
- MVT lhsVT = lhs.getValueType();
+ EVT lhsVT = lhs.getValueType();
  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::f64\n");
- MVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
+ EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
- MVT IntVT(MVT::i64);
+ EVT IntVT(MVT::i64);
// Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
// selected to a NOP:
@@ -2458,9 +2514,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
case ISD::SETONE:
compareOp = ISD::SETNE; break;
default:
- cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
- abort();
- break;
+ llvm_report_error("CellSPU ISel Select: unimplemented f64 condition");
}
SDValue result =
@@ -2497,7 +2551,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue lhs = Op.getOperand(0);
SDValue rhs = Op.getOperand(1);
SDValue trueval = Op.getOperand(2);
@@ -2526,14 +2580,15 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
{
// Type to truncate to
- MVT VT = Op.getValueType();
- MVT::SimpleValueType simpleVT = VT.getSimpleVT();
- MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+ EVT VT = Op.getValueType();
+ MVT simpleVT = VT.getSimpleVT();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
+ VT, (128 / VT.getSizeInBits()));
DebugLoc dl = Op.getDebugLoc();
// Type to truncate from
SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = Op0.getValueType();
+ EVT Op0VT = Op0.getValueType();
if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
// Create shuffle mask, least significant doubleword of quadword
@@ -2555,6 +2610,61 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
return SDValue(); // Leave the truncate unmolested
}
+/*!
+ * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
+ * algorithm is to duplicate the sign bit using rotmai to generate at
+ * least one byte full of sign bits. Then propagate the "sign-byte" into
+ * the leftmost words and the i64/i32 into the rightmost words using shufb.
+ *
+ * @param Op The sext operand
+ * @param DAG The current DAG
+ * @return The SDValue with the entire instruction sequence
+ */
+static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
+{
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Type to extend to
+ MVT OpVT = Op.getValueType().getSimpleVT();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
+ OpVT, (128 / OpVT.getSizeInBits()));
+
+ // Type to extend from
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = Op0.getValueType().getSimpleVT();
+
+ // The type to extend to needs to be a i128 and
+ // the type to extend from needs to be i64 or i32.
+ assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
+ "LowerSIGN_EXTEND: input and/or output operand have wrong size");
+
+ // Create shuffle mask
+ unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
+ unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte 8 - 11
+ unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
+ SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ DAG.getConstant(mask1, MVT::i32),
+ DAG.getConstant(mask1, MVT::i32),
+ DAG.getConstant(mask2, MVT::i32),
+ DAG.getConstant(mask3, MVT::i32));
+
+ // Word wise arithmetic right shift to generate at least one byte
+ // that contains sign bits.
+ MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
+ SDValue sraVal = DAG.getNode(ISD::SRA,
+ dl,
+ mvt,
+ DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
+ DAG.getConstant(31, MVT::i32));
+
+ // Shuffle bytes - Copy the sign bits into the upper 64 bits
+ // and the input value into the lower 64 bits.
+ SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
+ DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask);
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle);
+}
+
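The comment block above describes the rotate-then-shuffle scheme; the shuffle control words can be sanity-checked without LLVM. A self-contained demo of the selector constants (my reading of SPU shufb: selector bytes 0x00-0x0f pull from the first operand, here the extended input in its preferred slot, and 0x10-0x1f from the second, the sign-replicated vector):

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int is64 = 0; is64 <= 1; ++is64) {
        // Result bytes 0-7 (also 8-11 in the i32 case) take replicated
        // sign bytes from the second operand; the trailing bytes take
        // the input value from the first operand's preferred slot.
        uint32_t m1 = 0x10101010;
        uint32_t m2 = is64 ? 0x00010203 : 0x10101010;
        uint32_t m3 = is64 ? 0x04050607 : 0x00010203;
        std::printf("i%d -> i128 shufb mask: %08x %08x %08x %08x\n",
                    is64 ? 64 : 32, m1, m1, m2, m3);
      }
      return 0;
    }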
//! Custom (target-specific) lowering entry point
/*!
This is where LLVM's DAG selection process calls to do target-specific
@@ -2564,15 +2674,17 @@ SDValue
SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
{
unsigned Opc = (unsigned) Op.getOpcode();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
switch (Opc) {
default: {
- cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
- cerr << "Op.getOpcode() = " << Opc << "\n";
- cerr << "*Op.getNode():\n";
+#ifndef NDEBUG
+ errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
+ errs() << "Op.getOpcode() = " << Opc << "\n";
+ errs() << "*Op.getNode():\n";
Op.getNode()->dump();
- abort();
+#endif
+ llvm_unreachable(0);
}
case ISD::LOAD:
case ISD::EXTLOAD:
@@ -2589,12 +2701,6 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
case ISD::ConstantFP:
return LowerConstantFP(Op, DAG);
- case ISD::FORMAL_ARGUMENTS:
- return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
- case ISD::CALL:
- return LowerCALL(Op, DAG, SPUTM.getSubtargetImpl());
- case ISD::RET:
- return LowerRET(Op, DAG, getTargetMachine());
// i8, i64 math ops:
case ISD::ADD:
@@ -2651,6 +2757,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
case ISD::TRUNCATE:
return LowerTRUNCATE(Op, DAG);
+
+ case ISD::SIGN_EXTEND:
+ return LowerSIGN_EXTEND(Op, DAG);
}
return SDValue();
@@ -2662,13 +2771,13 @@ void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
{
#if 0
unsigned Opc = (unsigned) N->getOpcode();
- MVT OpVT = N->getValueType(0);
+ EVT OpVT = N->getValueType(0);
switch (Opc) {
default: {
- cerr << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
- cerr << "Op.getOpcode() = " << Opc << "\n";
- cerr << "*Op.getNode():\n";
+ errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
+ errs() << "Op.getOpcode() = " << Opc << "\n";
+ errs() << "*Op.getNode():\n";
N->dump();
abort();
/*NOTREACHED*/
@@ -2692,8 +2801,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
SelectionDAG &DAG = DCI.DAG;
SDValue Op0 = N->getOperand(0); // everything has at least one operand
- MVT NodeVT = N->getValueType(0); // The node's value type
- MVT Op0VT = Op0.getValueType(); // The first operand's result
+ EVT NodeVT = N->getValueType(0); // The node's value type
+ EVT Op0VT = Op0.getValueType(); // The first operand's result
SDValue Result; // Initially, empty result
DebugLoc dl = N->getDebugLoc();
@@ -2722,7 +2831,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
#if !defined(NDEBUG)
if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
- cerr << "\n"
+ errs() << "\n"
<< "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
<< "With: (SPUindirect <arg>, <arg>)\n";
}
@@ -2738,7 +2847,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
#if !defined(NDEBUG)
if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
- cerr << "\n"
+ errs() << "\n"
<< "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
<< "), " << CN0->getSExtValue() << ")\n"
<< "With: (SPUindirect <arg>, "
@@ -2762,11 +2871,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
// Types must match, however...
#if !defined(NDEBUG)
if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
- cerr << "\nReplace: ";
+ errs() << "\nReplace: ";
N->dump(&DAG);
- cerr << "\nWith: ";
+ errs() << "\nWith: ";
Op0.getNode()->dump(&DAG);
- cerr << "\n";
+ errs() << "\n";
}
#endif
@@ -2781,11 +2890,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
// (SPUindirect (SPUaform <addr>, 0), 0) ->
// (SPUaform <addr>, 0)
- DEBUG(cerr << "Replace: ");
+ DEBUG(errs() << "Replace: ");
DEBUG(N->dump(&DAG));
- DEBUG(cerr << "\nWith: ");
+ DEBUG(errs() << "\nWith: ");
DEBUG(Op0.getNode()->dump(&DAG));
- DEBUG(cerr << "\n");
+ DEBUG(errs() << "\n");
return Op0;
}
@@ -2798,7 +2907,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
#if !defined(NDEBUG)
if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
- cerr << "\n"
+ errs() << "\n"
<< "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
<< "With: (SPUindirect <arg>, <arg>)\n";
}
@@ -2813,9 +2922,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
}
case SPUISD::SHLQUAD_L_BITS:
case SPUISD::SHLQUAD_L_BYTES:
- case SPUISD::VEC_SHL:
- case SPUISD::VEC_SRL:
- case SPUISD::VEC_SRA:
case SPUISD::ROTBYTES_LEFT: {
SDValue Op1 = N->getOperand(1);
@@ -2860,11 +2966,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
// Otherwise, return unchanged.
#ifndef NDEBUG
if (Result.getNode()) {
- DEBUG(cerr << "\nReplace.SPU: ");
+ DEBUG(errs() << "\nReplace.SPU: ");
DEBUG(N->dump(&DAG));
- DEBUG(cerr << "\nWith: ");
+ DEBUG(errs() << "\nWith: ");
DEBUG(Result.getNode()->dump(&DAG));
- DEBUG(cerr << "\n");
+ DEBUG(errs() << "\n");
}
#endif
@@ -2895,7 +3001,7 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const
std::pair<unsigned, const TargetRegisterClass*>
SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const
+ EVT VT) const
{
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
@@ -2943,9 +3049,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
case SPUISD::VEC2PREFSLOT:
case SPUISD::SHLQUAD_L_BITS:
case SPUISD::SHLQUAD_L_BYTES:
- case SPUISD::VEC_SHL:
- case SPUISD::VEC_SRL:
- case SPUISD::VEC_SRA:
case SPUISD::VEC_ROTL:
case SPUISD::VEC_ROTR:
case SPUISD::ROTBYTES_LEFT:
@@ -2963,7 +3066,7 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
case ISD::SETCC: {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
VT = MVT::i32;
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index b1583f4..ab349bb 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -43,9 +43,6 @@ namespace llvm {
VEC2PREFSLOT, ///< Extract element 0
SHLQUAD_L_BITS, ///< Rotate quad left, by bits
SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes
- VEC_SHL, ///< Vector shift left
- VEC_SRL, ///< Vector shift right (logical)
- VEC_SRA, ///< Vector shift right (arithmetic)
VEC_ROTL, ///< Vector rotate left
VEC_ROTR, ///< Vector rotate right
ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI)
@@ -64,22 +61,22 @@ namespace llvm {
//! Utility functions specific to CellSPU:
namespace SPU {
SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType);
+ EVT ValueType);
SDValue get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType);
+ EVT ValueType);
SDValue get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType);
+ EVT ValueType);
SDValue get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType);
+ EVT ValueType);
SDValue get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
- MVT ValueType);
+ EVT ValueType);
SDValue get_v4i32_imm(SDNode *N, SelectionDAG &DAG);
SDValue get_v2i64_imm(SDNode *N, SelectionDAG &DAG);
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
const SPUTargetMachine &TM);
- //! Simplify a MVT::v2i64 constant splat to CellSPU-ready form
- SDValue LowerV2I64Splat(MVT OpVT, SelectionDAG &DAG, uint64_t splat,
+    //! Simplify an EVT::v2i64 constant splat to CellSPU-ready form
+ SDValue LowerV2I64Splat(EVT OpVT, SelectionDAG &DAG, uint64_t splat,
DebugLoc dl);
}
@@ -109,7 +106,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ValueType for ISD::SETCC
- virtual MVT getSetCCResultType(MVT VT) const;
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
//! Custom lowering hooks
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
@@ -134,7 +131,7 @@ namespace llvm {
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
bool hasMemory,
@@ -150,6 +147,28 @@ namespace llvm {
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
};
}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index e629c8d..ecce8e3 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -17,8 +17,9 @@
#include "SPUTargetMachine.h"
#include "SPUGenInstrInfo.inc"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/Streams.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -313,8 +314,7 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
} else if (RC == SPU::VECREGRegisterClass) {
opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8;
} else {
- assert(0 && "Unknown regclass!");
- abort();
+ llvm_unreachable("Unknown regclass!");
}
DebugLoc DL = DebugLoc::getUnknownLoc();
@@ -323,43 +323,6 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
}
-void SPUInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- cerr << "storeRegToAddr() invoked!\n";
- abort();
-
- if (Addr[0].isFI()) {
- /* do what storeRegToStackSlot does here */
- } else {
- unsigned Opc = 0;
- if (RC == SPU::GPRCRegisterClass) {
- /* Opc = PPC::STW; */
- } else if (RC == SPU::R16CRegisterClass) {
- /* Opc = PPC::STD; */
- } else if (RC == SPU::R32CRegisterClass) {
- /* Opc = PPC::STFD; */
- } else if (RC == SPU::R32FPRegisterClass) {
- /* Opc = PPC::STFD; */
- } else if (RC == SPU::R64FPRegisterClass) {
- /* Opc = PPC::STFS; */
- } else if (RC == SPU::VECREGRegisterClass) {
- /* Opc = PPC::STVX; */
- } else {
- assert(0 && "Unknown regclass!");
- abort();
- }
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- }
-}
-
void
SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
@@ -385,8 +348,7 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
} else if (RC == SPU::VECREGRegisterClass) {
opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8;
} else {
- assert(0 && "Unknown regclass in loadRegFromStackSlot!");
- abort();
+ llvm_unreachable("Unknown regclass in loadRegFromStackSlot!");
}
DebugLoc DL = DebugLoc::getUnknownLoc();
@@ -394,47 +356,6 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DL, get(opc), DestReg), FrameIdx);
}
-/*!
- \note We are really pessimistic here about what kind of a load we're doing.
- */
-void SPUInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs)
- const {
- cerr << "loadRegToAddr() invoked!\n";
- abort();
-
- if (Addr[0].isFI()) {
- /* do what loadRegFromStackSlot does here... */
- } else {
- unsigned Opc = 0;
- if (RC == SPU::R8CRegisterClass) {
- /* do brilliance here */
- } else if (RC == SPU::R16CRegisterClass) {
- /* Opc = PPC::LWZ; */
- } else if (RC == SPU::R32CRegisterClass) {
- /* Opc = PPC::LD; */
- } else if (RC == SPU::R32FPRegisterClass) {
- /* Opc = PPC::LFD; */
- } else if (RC == SPU::R64FPRegisterClass) {
- /* Opc = PPC::LFS; */
- } else if (RC == SPU::VECREGRegisterClass) {
- /* Opc = PPC::LVX; */
- } else if (RC == SPU::GPRCRegisterClass) {
- /* Opc = something else! */
- } else {
- assert(0 && "Unknown regclass!");
- abort();
- }
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- }
-}
-
//! Return true if the specified load or store can be folded
bool
SPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
@@ -543,7 +464,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
} else if (isCondBranch(LastInst)) {
// Block ends with fall-through condbranch.
TBB = LastInst->getOperand(1).getMBB();
- DEBUG(cerr << "Pushing LastInst: ");
+ DEBUG(errs() << "Pushing LastInst: ");
DEBUG(LastInst->dump());
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
Cond.push_back(LastInst->getOperand(0));
@@ -564,7 +485,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// If the block ends with a conditional and unconditional branch, handle it.
if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
TBB = SecondLastInst->getOperand(1).getMBB();
- DEBUG(cerr << "Pushing SecondLastInst: ");
+ DEBUG(errs() << "Pushing SecondLastInst: ");
DEBUG(SecondLastInst->dump());
Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
Cond.push_back(SecondLastInst->getOperand(0));
@@ -596,7 +517,7 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 0;
// Remove the first branch.
- DEBUG(cerr << "Removing branch: ");
+ DEBUG(errs() << "Removing branch: ");
DEBUG(I->dump());
I->eraseFromParent();
I = MBB.end();
@@ -608,7 +529,7 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 1;
// Remove the second branch.
- DEBUG(cerr << "Removing second branch: ");
+ DEBUG(errs() << "Removing second branch: ");
DEBUG(I->dump());
I->eraseFromParent();
return 2;
@@ -632,14 +553,14 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(SPU::BR));
MIB.addMBB(TBB);
- DEBUG(cerr << "Inserted one-way uncond branch: ");
+ DEBUG(errs() << "Inserted one-way uncond branch: ");
DEBUG((*MIB).dump());
} else {
// Conditional branch
MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm()));
MIB.addReg(Cond[1].getReg()).addMBB(TBB);
- DEBUG(cerr << "Inserted one-way cond branch: ");
+ DEBUG(errs() << "Inserted one-way cond branch: ");
DEBUG((*MIB).dump());
}
return 1;
@@ -651,9 +572,9 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MIB.addReg(Cond[1].getReg()).addMBB(TBB);
MIB2.addMBB(FBB);
- DEBUG(cerr << "Inserted conditional branch: ");
+ DEBUG(errs() << "Inserted conditional branch: ");
DEBUG((*MIB).dump());
- DEBUG(cerr << "part 2: ");
+ DEBUG(errs() << "part 2: ");
DEBUG((*MIB2).dump());
return 2;
}
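
The SPUInstrInfo hunks above follow one pattern: assert(0 && ...) plus
abort() becomes llvm_unreachable(...), and cerr becomes errs() inside
DEBUG(...). A minimal sketch of that idiom, assuming the 2.6-era Support
headers; selectOpcode/trace are illustrative names, not from the patch:

    #define DEBUG_TYPE "sketch"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/ErrorHandling.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static unsigned selectOpcode(bool isVector) {
      if (isVector)
        return 0; // stand-in opcode value
      // Replaces 'assert(0 && "..."); abort();' -- documents the dead path
      // and, in this era, still aborts even when asserts are compiled out.
      llvm_unreachable("Unknown regclass!");
    }

    static void trace() {
      // errs() is the raw_ostream replacement for cerr; DEBUG(...) only
      // fires in asserts builds when -debug is given.
      DEBUG(errs() << "Pushing LastInst: ");
    }
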
diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h
index ffb4087..c644a11 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.h
+++ b/lib/Target/CellSPU/SPUInstrInfo.h
@@ -68,24 +68,12 @@ namespace llvm {
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- //! Store a register to an address, based on its register class
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
//! Load a register from a stack slot, based on its register class.
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
- //! Load a register from an address, based on its register class
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
//! Return true if the specified load or store can be folded
virtual
bool canFoldMemoryOperand(const MachineInstr *MI,
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index 63eb85a..09849da 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -4431,13 +4431,6 @@ def : Pat<(i8 imm:$imm),
(ILHr8 imm:$imm)>;
//===----------------------------------------------------------------------===//
-// Call instruction patterns:
-//===----------------------------------------------------------------------===//
-// Return void
-def : Pat<(ret),
- (RET)>;
-
-//===----------------------------------------------------------------------===//
// Zero/Any/Sign extensions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/CellSPU/SPUMCAsmInfo.cpp b/lib/Target/CellSPU/SPUMCAsmInfo.cpp
new file mode 100644
index 0000000..1c921ab
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMCAsmInfo.cpp
@@ -0,0 +1,40 @@
+//===-- SPUMCAsmInfo.cpp - Cell SPU asm properties ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SPUMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUMCAsmInfo.h"
+using namespace llvm;
+
+SPULinuxMCAsmInfo::SPULinuxMCAsmInfo(const Target &T, const StringRef &TT) {
+ ZeroDirective = "\t.space\t";
+ SetDirective = "\t.set";
+ Data64bitsDirective = "\t.quad\t";
+ AlignmentIsInBytes = false;
+ LCOMMDirective = "\t.lcomm\t";
+
+ PCSymbol = ".";
+ CommentString = "#";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+
+ // Has leb128, .loc and .file
+ HasLEB128 = true;
+ HasDotLocAndDotFile = true;
+
+ SupportsDebugInformation = true;
+ NeedsSet = true;
+
+ // Exception handling is not supported on CellSPU (think about it: you only
+ // have 256K for code+data. Would you support exception handling?)
+ ExceptionsType = ExceptionHandling::None;
+}
+
diff --git a/lib/Target/CellSPU/SPUMCAsmInfo.h b/lib/Target/CellSPU/SPUMCAsmInfo.h
new file mode 100644
index 0000000..8d75ea8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMCAsmInfo.h
@@ -0,0 +1,28 @@
+//===-- SPUMCAsmInfo.h - Cell SPU asm properties ---------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SPUMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUTARGETASMINFO_H
+#define SPUTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ struct SPULinuxMCAsmInfo : public MCAsmInfo {
+ explicit SPULinuxMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+} // namespace llvm
+
+#endif /* SPUTARGETASMINFO_H */
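
With SPULinuxMCAsmInfo registered through the target registry (see the
SPUTargetMachine.cpp hunk below), clients obtain asm properties from the
Target rather than from a TargetMachine. A hedged sketch, assuming
Target::createAsmInfo has this shape in this revision; getSPUAsmInfo is an
illustrative wrapper:

    #include "llvm/MC/MCAsmInfo.h"
    #include "llvm/Target/TargetRegistry.h"
    #include <string>
    using namespace llvm;

    // Assumes the CellSPU initializers have already run, so the
    // RegisterAsmInfo<SPULinuxMCAsmInfo> constructor is registered.
    const MCAsmInfo *getSPUAsmInfo(const Target &T, const std::string &TT) {
      return T.createAsmInfo(TT); // invokes SPULinuxMCAsmInfo(T, TT)
    }
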
diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td
index 87c4115..c722e4b 100644
--- a/lib/Target/CellSPU/SPUNodes.td
+++ b/lib/Target/CellSPU/SPUNodes.td
@@ -87,9 +87,9 @@ def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
// Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only):
-def SPUvec_shl: SDNode<"SPUISD::VEC_SHL", SPUvecshift_type, []>;
-def SPUvec_srl: SDNode<"SPUISD::VEC_SRL", SPUvecshift_type, []>;
-def SPUvec_sra: SDNode<"SPUISD::VEC_SRA", SPUvecshift_type, []>;
+def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>;
+def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>;
+def SPUvec_sra: SDNode<"ISD::SRA", SPUvecshift_type, []>;
def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>;
def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>;
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index e031048..8412006 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -35,7 +35,9 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include <cstdlib>
@@ -176,8 +178,7 @@ unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) {
case SPU::R126: return 126;
case SPU::R127: return 127;
default:
- cerr << "Unhandled reg in SPURegisterInfo::getRegisterNumbering!\n";
- abort();
+ llvm_report_error("Unhandled reg in SPURegisterInfo::getRegisterNumbering");
}
}
@@ -218,8 +219,8 @@ SPURegisterInfo::getNumArgRegs()
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
-const TargetRegisterClass * SPURegisterInfo::getPointerRegClass() const
-{
+const TargetRegisterClass *
+SPURegisterInfo::getPointerRegClass(unsigned Kind) const {
return &SPU::R32CRegClass;
}
@@ -325,9 +326,9 @@ SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF,
MBB.erase(I);
}
-void
+unsigned
SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
- RegScavenger *RS) const
+ int *Value, RegScavenger *RS) const
{
unsigned i = 0;
MachineInstr &MI = *II;
@@ -364,12 +365,13 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
SPOp.ChangeToRegister(SPU::R1, false);
if (Offset > SPUFrameInfo::maxFrameOffset()
|| Offset < SPUFrameInfo::minFrameOffset()) {
- cerr << "Large stack adjustment ("
+ errs() << "Large stack adjustment ("
<< Offset
<< ") in SPURegisterInfo::eliminateFrameIndex.";
} else {
MO.ChangeToImmediate(Offset);
}
+ return 0;
}
/// determineFrameLayout - Determine the size of the frame and maximum call
@@ -485,8 +487,10 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
.addReg(SPU::R2)
.addReg(SPU::R1);
} else {
- cerr << "Unhandled frame size: " << FrameSize << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Unhandled frame size: " << FrameSize;
+ llvm_report_error(Msg.str());
}
if (hasDebugInfo) {
@@ -577,8 +581,10 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
.addReg(SPU::R2)
.addReg(SPU::R1);
} else {
- cerr << "Unhandled frame size: " << FrameSize << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Unhandled frame size: " << FrameSize;
+ llvm_report_error(Msg.str());
}
}
}
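
Two of the hunks above replace cerr-then-abort with a formatted
llvm_report_error. The same idiom in isolation, as a sketch
(reportBadFrameSize is an illustrative wrapper, not from the patch):

    #include "llvm/Support/ErrorHandling.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    using namespace llvm;

    static void reportBadFrameSize(int FrameSize) {
      // Build the message with raw_string_ostream, then report it;
      // llvm_report_error exits through any installed error handler
      // instead of calling abort() directly.
      std::string msg;
      raw_string_ostream Msg(msg);
      Msg << "Unhandled frame size: " << FrameSize;
      llvm_report_error(Msg.str());
    }
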
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index 5b6e9ec..1d9d07e 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -43,7 +43,8 @@ namespace llvm {
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
- virtual const TargetRegisterClass *getPointerRegClass() const;
+ virtual const TargetRegisterClass *
+ getPointerRegClass(unsigned Kind = 0) const;
//! Return the array of callee-saved registers
virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const;
@@ -62,8 +63,9 @@ namespace llvm {
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
//! Convert frame indices into machine operands
- void eliminateFrameIndex(MachineBasicBlock::iterator II, int,
- RegScavenger *RS) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ int *Value = NULL,
+ RegScavenger *RS = NULL) const;
//! Determine the frame's layout
void determineFrameLayout(MachineFunction &MF) const;
diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp
index 0a1c2f7..0f18b7f 100644
--- a/lib/Target/CellSPU/SPUSubtarget.cpp
+++ b/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -13,15 +13,11 @@
#include "SPUSubtarget.h"
#include "SPU.h"
-#include "llvm/Module.h"
-#include "llvm/Target/TargetMachine.h"
#include "SPUGenSubtarget.inc"
using namespace llvm;
-SPUSubtarget::SPUSubtarget(const TargetMachine &tm, const Module &M,
- const std::string &FS) :
- TM(tm),
+SPUSubtarget::SPUSubtarget(const std::string &TT, const std::string &FS) :
StackAlignment(16),
ProcDirective(SPU::DEFAULT_PROC),
UseLargeMem(false)
diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h
index b6a3409..94ac73c 100644
--- a/lib/Target/CellSPU/SPUSubtarget.h
+++ b/lib/Target/CellSPU/SPUSubtarget.h
@@ -20,9 +20,7 @@
#include <string>
namespace llvm {
- class Module;
class GlobalValue;
- class TargetMachine;
namespace SPU {
enum {
@@ -33,8 +31,6 @@ namespace llvm {
class SPUSubtarget : public TargetSubtarget {
protected:
- const TargetMachine &TM;
-
/// stackAlignment - The minimum alignment known to hold of the stack frame
/// on entry to the function and which must be maintained by every function.
unsigned StackAlignment;
@@ -52,10 +48,9 @@ namespace llvm {
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- SPUSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS);
+ SPUSubtarget(const std::string &TT, const std::string &FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 2470972..6500067 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -13,62 +13,36 @@
#include "SPU.h"
#include "SPURegisterNames.h"
-#include "SPUTargetAsmInfo.h"
+#include "SPUMCAsmInfo.h"
#include "SPUTargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-namespace {
- // Register the targets
- RegisterTarget<SPUTargetMachine>
- CELLSPU("cellspu", "STI CBEA Cell SPU [experimental]");
+extern "C" void LLVMInitializeCellSPUTarget() {
+ // Register the target.
+ RegisterTargetMachine<SPUTargetMachine> X(TheCellSPUTarget);
+ RegisterAsmInfo<SPULinuxMCAsmInfo> Y(TheCellSPUTarget);
}
-// No assembler printer by default
-SPUTargetMachine::AsmPrinterCtorFn SPUTargetMachine::AsmPrinterCtor = 0;
-
-// Force static initialization.
-extern "C" void LLVMInitializeCellSPUTarget() { }
-
const std::pair<unsigned, int> *
SPUFrameInfo::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
NumEntries = 1;
return &LR[0];
}
-const TargetAsmInfo *
-SPUTargetMachine::createTargetAsmInfo() const
-{
- return new SPULinuxTargetAsmInfo(*this);
-}
-
-unsigned
-SPUTargetMachine::getModuleMatchQuality(const Module &M)
-{
- // We strongly match "spu-*" or "cellspu-*".
- std::string TT = M.getTargetTriple();
- if ((TT.size() == 3 && std::string(TT.begin(), TT.begin()+3) == "spu")
- || (TT.size() == 7 && std::string(TT.begin(), TT.begin()+7) == "cellspu")
- || (TT.size() >= 4 && std::string(TT.begin(), TT.begin()+4) == "spu-")
- || (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "cellspu-"))
- return 20;
-
- return 0; // No match at all...
-}
-
-SPUTargetMachine::SPUTargetMachine(const Module &M, const std::string &FS)
- : Subtarget(*this, M, FS),
+SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS),
DataLayout(Subtarget.getTargetDataString()),
InstrInfo(*this),
FrameInfo(*this),
TLInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData())
-{
+ InstrItins(Subtarget.getInstrItineraryData()) {
// For the time being, use static relocations, since there's really no
// support for PIC yet.
setRelocationModel(Reloc::Static);
@@ -78,22 +52,9 @@ SPUTargetMachine::SPUTargetMachine(const Module &M, const std::string &FS)
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
-bool
-SPUTargetMachine::addInstSelector(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel)
-{
+bool SPUTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
// Install an instruction selector.
PM.add(createSPUISelDag(*this));
return false;
}
-
-bool SPUTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
- return false;
-}
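
SPUTargetMachine is now reached through the registry instead of the old
RegisterTarget/getModuleMatchQuality scheme. A hedged sketch of the
construction path a driver would use, assuming the 2.6-era
Target::createTargetMachine(Triple, Features) signature; createSPUTM is an
illustrative wrapper:

    #include "llvm/Target/TargetMachine.h"
    #include "llvm/Target/TargetRegistry.h"
    #include <string>
    using namespace llvm;

    // Assumes LLVMInitializeCellSPUTargetInfo() and
    // LLVMInitializeCellSPUTarget() have run. The registry ends up calling
    // new SPUTargetMachine(T, TT, FS), which builds the Subtarget from the
    // triple/feature strings alone -- no Module is involved anymore.
    TargetMachine *createSPUTM(const Target &T, const std::string &TT) {
      return T.createTargetMachine(TT, /*Features=*/"");
    }
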
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index 4c28521..9fdcfe9 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -35,19 +35,9 @@ class SPUTargetMachine : public LLVMTargetMachine {
SPUFrameInfo FrameInfo;
SPUTargetLowering TLInfo;
InstrItineraryData InstrItins;
-
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- SPUTargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- SPUTargetMachine(const Module &M, const std::string &FS);
+ SPUTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
/// Return the subtarget implementation object
virtual const SPUSubtarget *getSubtargetImpl() const {
@@ -66,12 +56,6 @@ public:
virtual TargetJITInfo *getJITInfo() {
return NULL;
}
-
- //! Module match function
- /*!
- Module matching function called by TargetMachineRegistry().
- */
- static unsigned getModuleMatchQuality(const Module &M);
virtual SPUTargetLowering *getTargetLowering() const {
return const_cast<SPUTargetLowering*>(&TLInfo);
@@ -92,13 +76,6 @@ public:
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
-
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
};
} // end namespace llvm
diff --git a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..928d0fe
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCellSPUInfo
+ CellSPUTargetInfo.cpp
+ )
+
+add_dependencies(LLVMCellSPUInfo CellSPUCodeGenTable_gen)
diff --git a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
new file mode 100644
index 0000000..049ea23
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- CellSPUTargetInfo.cpp - CellSPU Target Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCellSPUTarget;
+
+extern "C" void LLVMInitializeCellSPUTargetInfo() {
+ RegisterTarget<Triple::cellspu>
+ X(TheCellSPUTarget, "cellspu", "STI CBEA Cell SPU [experimental]");
+}
diff --git a/lib/Target/CellSPU/TargetInfo/Makefile b/lib/Target/CellSPU/TargetInfo/Makefile
new file mode 100644
index 0000000..9cb6827
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CellSPU/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCellSPUInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 28f58e8..14ad451 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -23,13 +23,12 @@
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/TypeSymbolTable.h"
-#include "llvm/Target/TargetMachineRegistry.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Streams.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
#include <algorithm>
#include <set>
@@ -71,19 +70,10 @@ static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
cl::desc("Specify the name of the thing to generate"),
cl::init("!bad!"));
-/// CppBackendTargetMachineModule - Note that this is used on hosts
-/// that cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int CppBackendTargetMachineModule;
-int CppBackendTargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<CPPTargetMachine> X("cpp", "C++ backend");
-
-// Force static initialization.
-extern "C" void LLVMInitializeCppBackendTarget() { }
+extern "C" void LLVMInitializeCppBackendTarget() {
+ // Register the target.
+ RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
+}
namespace {
typedef std::vector<const Type*> TypeList;
@@ -97,7 +87,7 @@ namespace {
/// CppWriter - This class is the main chunk of code that converts an LLVM
/// module to a C++ translation unit.
class CppWriter : public ModulePass {
- raw_ostream &Out;
+ formatted_raw_ostream &Out;
const Module *TheModule;
uint64_t uniqueNum;
TypeMap TypeNames;
@@ -112,7 +102,7 @@ namespace {
public:
static char ID;
- explicit CppWriter(raw_ostream &o) :
+ explicit CppWriter(formatted_raw_ostream &o) :
ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false) {}
virtual const char *getPassName() const { return "C++ backend"; }
@@ -133,7 +123,7 @@ namespace {
private:
void printLinkageType(GlobalValue::LinkageTypes LT);
void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
- void printCallingConv(unsigned cc);
+ void printCallingConv(CallingConv::ID cc);
void printEscapedString(const std::string& str);
void printCFP(const ConstantFP* CFP);
@@ -165,7 +155,7 @@ namespace {
};
static unsigned indent_level = 0;
- inline raw_ostream& nl(raw_ostream& Out, int delta = 0) {
+ inline formatted_raw_ostream& nl(formatted_raw_ostream& Out, int delta = 0) {
Out << "\n";
if (delta >= 0 || indent_level >= unsigned(-delta))
indent_level += delta;
@@ -220,8 +210,7 @@ namespace {
}
void CppWriter::error(const std::string& msg) {
- cerr << msg << "\n";
- exit(2);
+ llvm_report_error(msg);
}
// printCFP - Print a floating point constant .. very carefully :)
@@ -230,9 +219,9 @@ namespace {
void CppWriter::printCFP(const ConstantFP *CFP) {
bool ignored;
APFloat APF = APFloat(CFP->getValueAPF()); // copy
- if (CFP->getType() == Type::FloatTy)
+ if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
- Out << "ConstantFP::get(";
+ Out << "ConstantFP::get(getGlobalContext(), ";
Out << "APFloat(";
#if HAVE_PRINTF_A
char Buffer[100];
@@ -241,7 +230,7 @@ namespace {
!strncmp(Buffer, "-0x", 3) ||
!strncmp(Buffer, "+0x", 3)) &&
APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
- if (CFP->getType() == Type::DoubleTy)
+ if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
Out << "BitsToDouble(" << Buffer << ")";
else
Out << "BitsToFloat((float)" << Buffer << ")";
@@ -259,11 +248,11 @@ namespace {
((StrVal[0] == '-' || StrVal[0] == '+') &&
(StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
(CFP->isExactlyValue(atof(StrVal.c_str())))) {
- if (CFP->getType() == Type::DoubleTy)
+ if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
Out << StrVal;
else
Out << StrVal << "f";
- } else if (CFP->getType() == Type::DoubleTy)
+ } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
Out << "BitsToDouble(0x"
<< utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
<< "ULL) /* " << StrVal << " */";
@@ -279,7 +268,7 @@ namespace {
Out << ")";
}
- void CppWriter::printCallingConv(unsigned cc){
+ void CppWriter::printCallingConv(CallingConv::ID cc){
// Print the calling convention.
switch (cc) {
case CallingConv::C: Out << "CallingConv::C"; break;
@@ -296,6 +285,8 @@ namespace {
Out << "GlobalValue::InternalLinkage"; break;
case GlobalValue::PrivateLinkage:
Out << "GlobalValue::PrivateLinkage"; break;
+ case GlobalValue::LinkerPrivateLinkage:
+ Out << "GlobalValue::LinkerPrivateLinkage"; break;
case GlobalValue::AvailableExternallyLinkage:
Out << "GlobalValue::AvailableExternallyLinkage "; break;
case GlobalValue::LinkOnceAnyLinkage:
@@ -325,7 +316,7 @@ namespace {
void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
switch (VisType) {
- default: assert(0 && "Unknown GVar visibility");
+ default: llvm_unreachable("Unknown GVar visibility");
case GlobalValue::DefaultVisibility:
Out << "GlobalValue::DefaultVisibility";
break;
@@ -357,20 +348,21 @@ namespace {
// First, handle the primitive types .. easy
if (Ty->isPrimitiveType() || Ty->isInteger()) {
switch (Ty->getTypeID()) {
- case Type::VoidTyID: return "Type::VoidTy";
+ case Type::VoidTyID: return "Type::getVoidTy(getGlobalContext())";
case Type::IntegerTyID: {
unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
- return "IntegerType::get(" + utostr(BitWidth) + ")";
+ return "IntegerType::get(getGlobalContext(), " + utostr(BitWidth) + ")";
}
- case Type::X86_FP80TyID: return "Type::X86_FP80Ty";
- case Type::FloatTyID: return "Type::FloatTy";
- case Type::DoubleTyID: return "Type::DoubleTy";
- case Type::LabelTyID: return "Type::LabelTy";
+ case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(getGlobalContext())";
+ case Type::FloatTyID: return "Type::getFloatTy(getGlobalContext())";
+ case Type::DoubleTyID: return "Type::getDoubleTy(getGlobalContext())";
+ case Type::LabelTyID: return "Type::getLabelTy(getGlobalContext())";
default:
error("Invalid primitive type");
break;
}
- return "Type::VoidTy"; // shouldn't be returned, but make it sensible
+ // shouldn't be returned, but make it sensible
+ return "Type::getVoidTy(getGlobalContext())";
}
// Now, see if we've seen the type before and return that
@@ -436,7 +428,10 @@ namespace {
} else {
name = getTypePrefix(val->getType());
}
- name += (val->hasName() ? val->getName() : utostr(uniqueNum++));
+ if (val->hasName())
+ name += val->getName();
+ else
+ name += utostr(uniqueNum++);
sanitize(name);
NameSet::iterator NI = UsedNames.find(name);
if (NI != UsedNames.end())
@@ -477,6 +472,7 @@ namespace {
HANDLE_ATTR(Nest);
HANDLE_ATTR(ReadNone);
HANDLE_ATTR(ReadOnly);
+ HANDLE_ATTR(InlineHint);
HANDLE_ATTR(NoInline);
HANDLE_ATTR(AlwaysInline);
HANDLE_ATTR(OptimizeForSize);
@@ -519,7 +515,8 @@ namespace {
if (TI != TypeStack.end()) {
TypeMap::const_iterator I = UnresolvedTypes.find(Ty);
if (I == UnresolvedTypes.end()) {
- Out << "PATypeHolder " << typeName << "_fwd = OpaqueType::get();";
+ Out << "PATypeHolder " << typeName;
+ Out << "_fwd = OpaqueType::get(getGlobalContext());";
nl(Out);
UnresolvedTypes[Ty] = typeName;
}
@@ -579,6 +576,7 @@ namespace {
nl(Out);
}
Out << "StructType* " << typeName << " = StructType::get("
+ << "mod->getContext(), "
<< typeName << "_fields, /*isPacked=*/"
<< (ST->isPacked() ? "true" : "false") << ");";
nl(Out);
@@ -618,7 +616,8 @@ namespace {
break;
}
case Type::OpaqueTyID: {
- Out << "OpaqueType* " << typeName << " = OpaqueType::get();";
+ Out << "OpaqueType* " << typeName;
+ Out << " = OpaqueType::get(getGlobalContext());";
nl(Out);
break;
}
@@ -753,9 +752,10 @@ namespace {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
std::string constValue = CI->getValue().toString(10, true);
- Out << "ConstantInt* " << constName << " = ConstantInt::get(APInt("
- << cast<IntegerType>(CI->getType())->getBitWidth() << ", \""
- << constValue << "\", " << constValue.length() << ", 10));";
+ Out << "ConstantInt* " << constName
+ << " = ConstantInt::get(getGlobalContext(), APInt("
+ << cast<IntegerType>(CI->getType())->getBitWidth()
+ << ", StringRef(\"" << constValue << "\"), 10));";
} else if (isa<ConstantAggregateZero>(CV)) {
Out << "ConstantAggregateZero* " << constName
<< " = ConstantAggregateZero::get(" << typeName << ");";
@@ -767,8 +767,11 @@ namespace {
printCFP(CFP);
Out << ";";
} else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
- if (CA->isString() && CA->getType()->getElementType() == Type::Int8Ty) {
- Out << "Constant* " << constName << " = ConstantArray::get(\"";
+ if (CA->isString() &&
+ CA->getType()->getElementType() ==
+ Type::getInt8Ty(CA->getContext())) {
+ Out << "Constant* " << constName <<
+ " = ConstantArray::get(getGlobalContext(), \"";
std::string tmp = CA->getAsString();
bool nullTerminate = false;
if (tmp[tmp.length()-1] == 0) {
@@ -839,12 +842,12 @@ namespace {
<< getCppName(CE->getOperand(0)) << ", "
<< "&" << constName << "_indices[0], "
<< constName << "_indices.size()"
- << " );";
+ << ");";
} else if (CE->isCast()) {
printConstant(CE->getOperand(0));
Out << "Constant* " << constName << " = ConstantExpr::getCast(";
switch (CE->getOpcode()) {
- default: assert(0 && "Invalid cast opcode");
+ default: llvm_unreachable("Invalid cast opcode");
case Instruction::Trunc: Out << "Instruction::Trunc"; break;
case Instruction::ZExt: Out << "Instruction::ZExt"; break;
case Instruction::SExt: Out << "Instruction::SExt"; break;
@@ -995,13 +998,13 @@ namespace {
void CppWriter::printVariableHead(const GlobalVariable *GV) {
nl(Out) << "GlobalVariable* " << getCppName(GV);
if (is_inline) {
- Out << " = mod->getGlobalVariable(";
+ Out << " = mod->getGlobalVariable(getGlobalContext(), ";
printEscapedString(GV->getName());
Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)";
nl(Out) << "if (!" << getCppName(GV) << ") {";
in(); nl(Out) << getCppName(GV);
}
- Out << " = new GlobalVariable(";
+ Out << " = new GlobalVariable(/*Module=*/*mod, ";
nl(Out) << "/*Type=*/";
printCppName(GV->getType()->getElementType());
Out << ",";
@@ -1016,8 +1019,7 @@ namespace {
}
nl(Out) << "/*Name=*/\"";
printEscapedString(GV->getName());
- Out << "\",";
- nl(Out) << "mod);";
+ Out << "\");";
nl(Out);
if (GV->hasSection()) {
@@ -1095,7 +1097,7 @@ namespace {
case Instruction::Ret: {
const ReturnInst* ret = cast<ReturnInst>(I);
- Out << "ReturnInst::Create("
+ Out << "ReturnInst::Create(getGlobalContext(), "
<< (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
break;
}
@@ -1159,8 +1161,9 @@ namespace {
<< bbname << ");";
break;
}
- case Instruction::Unreachable:{
+ case Instruction::Unreachable: {
Out << "new UnreachableInst("
+ << "getGlobalContext(), "
<< bbname << ");";
break;
}
@@ -1210,7 +1213,7 @@ namespace {
break;
}
case Instruction::FCmp: {
- Out << "FCmpInst* " << iName << " = new FCmpInst(";
+ Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", ";
switch (cast<FCmpInst>(I)->getPredicate()) {
case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
case FCmpInst::FCMP_OEQ : Out << "FCmpInst::FCMP_OEQ"; break;
@@ -1232,11 +1235,11 @@ namespace {
}
Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
printEscapedString(I->getName());
- Out << "\", " << bbname << ");";
+ Out << "\");";
break;
}
case Instruction::ICmp: {
- Out << "ICmpInst* " << iName << " = new ICmpInst(";
+ Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", ";
switch (cast<ICmpInst>(I)->getPredicate()) {
case ICmpInst::ICMP_EQ: Out << "ICmpInst::ICMP_EQ"; break;
case ICmpInst::ICMP_NE: Out << "ICmpInst::ICMP_NE"; break;
@@ -1252,7 +1255,7 @@ namespace {
}
Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
printEscapedString(I->getName());
- Out << "\", " << bbname << ");";
+ Out << "\");";
break;
}
case Instruction::Malloc: {
@@ -1680,7 +1683,8 @@ namespace {
for (Function::const_iterator BI = F->begin(), BE = F->end();
BI != BE; ++BI) {
std::string bbname(getCppName(BI));
- Out << "BasicBlock* " << bbname << " = BasicBlock::Create(\"";
+ Out << "BasicBlock* " << bbname <<
+ " = BasicBlock::Create(getGlobalContext(), \"";
if (BI->hasName())
printEscapedString(BI->getName());
Out << "\"," << getCppName(BI->getParent()) << ",0);";
@@ -1799,6 +1803,7 @@ namespace {
void CppWriter::printProgram(const std::string& fname,
const std::string& mName) {
+ Out << "#include <llvm/LLVMContext.h>\n";
Out << "#include <llvm/Module.h>\n";
Out << "#include <llvm/DerivedTypes.h>\n";
Out << "#include <llvm/Constants.h>\n";
@@ -1808,8 +1813,8 @@ namespace {
Out << "#include <llvm/BasicBlock.h>\n";
Out << "#include <llvm/Instructions.h>\n";
Out << "#include <llvm/InlineAsm.h>\n";
+ Out << "#include <llvm/Support/FormattedStream.h>\n";
Out << "#include <llvm/Support/MathExtras.h>\n";
- Out << "#include <llvm/Support/raw_ostream.h>\n";
Out << "#include <llvm/Pass.h>\n";
Out << "#include <llvm/PassManager.h>\n";
Out << "#include <llvm/ADT/SmallVector.h>\n";
@@ -1821,7 +1826,6 @@ namespace {
Out << "int main(int argc, char**argv) {\n";
Out << " Module* Mod = " << fname << "();\n";
Out << " verifyModule(*Mod, PrintMessageAction);\n";
- Out << " outs().flush();\n";
Out << " PassManager PM;\n";
Out << " PM.add(createPrintModulePass(&outs()));\n";
Out << " PM.run(*Mod);\n";
@@ -1836,7 +1840,7 @@ namespace {
nl(Out,1) << "// Module Construction";
nl(Out) << "Module* mod = new Module(\"";
printEscapedString(mName);
- Out << "\");";
+ Out << "\", getGlobalContext());";
if (!TheModule->getTargetTriple().empty()) {
nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");";
}
@@ -2014,7 +2018,7 @@ char CppWriter::ID = 0;
//===----------------------------------------------------------------------===//
bool CPPTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
- raw_ostream &o,
+ formatted_raw_ostream &o,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel) {
if (FileType != TargetMachine::AssemblyFile) return true;
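
Most of the CPPBackend churn above is the LLVMContext migration: the global
type singletons are replaced by per-context getters, and the generated C++
now threads getGlobalContext() through. A short sketch of the new API
surface, assuming 2.6-era DerivedTypes.h; typeDemo is an illustrative name:

    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    void typeDemo(LLVMContext &Ctx) {
      const Type *F = Type::getFloatTy(Ctx);            // was Type::FloatTy
      const IntegerType *I8 = IntegerType::get(Ctx, 8); // was IntegerType::get(8)
      (void)F; (void)I8; // silence unused warnings in this sketch
    }
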
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index db4bc0e..1f74f76 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -19,25 +19,24 @@
namespace llvm {
-class raw_ostream;
+class formatted_raw_ostream;
struct CPPTargetMachine : public TargetMachine {
- const TargetData DataLayout; // Calculates type size & alignment
-
- CPPTargetMachine(const Module &M, const std::string &FS)
- : DataLayout(&M) {}
+ CPPTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : TargetMachine(T) {}
virtual bool WantsWholeFile() const { return true; }
- virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ virtual bool addPassesToEmitWholeFile(PassManager &PM,
+ formatted_raw_ostream &Out,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel);
- // This class always works, but shouldn't be the default in most cases.
- static unsigned getModuleMatchQuality(const Module &M) { return 1; }
-
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const TargetData *getTargetData() const { return 0; }
};
+extern Target TheCppBackendTarget;
+
} // End llvm namespace
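
addPassesToEmitWholeFile now takes a formatted_raw_ostream, which layers
column tracking on top of any raw_ostream. A sketch of how a caller adapts
an existing stream (emitModule is an illustrative wrapper, not an API from
the patch):

    #include "llvm/Support/FormattedStream.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void emitModule(/* CPPTargetMachine &TM, PassManager &PM, ... */) {
      // Wrap stdout; with the default Delete=false the wrapper does not
      // take ownership of outs().
      formatted_raw_ostream FOS(outs());
      // TM.addPassesToEmitWholeFile(PM, FOS, TargetMachine::AssemblyFile,
      //                             CodeGenOpt::Default);
    }
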
diff --git a/lib/Target/CppBackend/Makefile b/lib/Target/CppBackend/Makefile
index ca7e1a8..dc9cf48 100644
--- a/lib/Target/CppBackend/Makefile
+++ b/lib/Target/CppBackend/Makefile
@@ -9,6 +9,9 @@
LEVEL = ../../..
LIBRARYNAME = LLVMCppBackend
+
+DIRS = TargetInfo
+
include $(LEVEL)/Makefile.common
CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..edaf5d3
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCppBackendInfo
+ CppBackendTargetInfo.cpp
+ )
+
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
new file mode 100644
index 0000000..d0aeb12
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- CppBackendTargetInfo.cpp - CppBackend Target Implementation -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCppBackendTarget;
+
+static unsigned CppBackend_TripleMatchQuality(const std::string &TT) {
+ // This class always works, but shouldn't be the default in most cases.
+ return 1;
+}
+
+extern "C" void LLVMInitializeCppBackendTargetInfo() {
+ TargetRegistry::RegisterTarget(TheCppBackendTarget, "cpp",
+ "C++ backend",
+ &CppBackend_TripleMatchQuality);
+}
diff --git a/lib/Target/CppBackend/TargetInfo/Makefile b/lib/Target/CppBackend/TargetInfo/Makefile
new file mode 100644
index 0000000..6e68283
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CppBackend/TargetInfo/Makefile -----------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCppBackendInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
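
CppBackend (and MSIL, below) register with an explicit triple-match-quality
callback returning 1, so the registry never prefers them over a real
architecture; they are only selected by name. A hedged sketch of the lookup
a driver performs, assuming the 2.6-era TargetRegistry::lookupTarget
signature; resolveTarget is an illustrative wrapper:

    #include "llvm/Target/TargetRegistry.h"
    #include <string>
    using namespace llvm;

    const Target *resolveTarget(const std::string &TT) {
      std::string Error;
      // Returns the registered target with the best match quality for TT,
      // or null with an explanation left in Error.
      return TargetRegistry::lookupTarget(TT, Error);
    }
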
diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp
index ee73c38..26d637b 100644
--- a/lib/Target/MSIL/MSILWriter.cpp
+++ b/lib/Target/MSIL/MSILWriter.cpp
@@ -19,44 +19,35 @@
#include "llvm/TypeSymbolTable.h"
#include "llvm/Analysis/ConstantsScanner.h"
#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/Passes.h"
+using namespace llvm;
-namespace {
+namespace llvm {
// TargetMachine for the MSIL
struct VISIBILITY_HIDDEN MSILTarget : public TargetMachine {
- const TargetData DataLayout; // Calculates type size & alignment
-
- MSILTarget(const Module &M, const std::string &FS)
- : DataLayout(&M) {}
+ MSILTarget(const Target &T, const std::string &TT, const std::string &FS)
+ : TargetMachine(T) {}
virtual bool WantsWholeFile() const { return true; }
- virtual bool addPassesToEmitWholeFile(PassManager &PM, raw_ostream &Out,
+ virtual bool addPassesToEmitWholeFile(PassManager &PM,
+ formatted_raw_ostream &Out,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel);
- // This class always works, but shouldn't be the default in most cases.
- static unsigned getModuleMatchQuality(const Module &M) { return 1; }
-
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const TargetData *getTargetData() const { return 0; }
};
}
-/// MSILTargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int MSILTargetMachineModule;
-int MSILTargetMachineModule = 0;
-
-static RegisterTarget<MSILTarget> X("msil", "MSIL backend");
-
-// Force static initialization.
-extern "C" void LLVMInitializeMSILTarget() { }
+extern "C" void LLVMInitializeMSILTarget() {
+ // Register the target.
+ RegisterTargetMachine<MSILTarget> X(TheMSILTarget);
+}
bool MSILModule::runOnModule(Module &M) {
ModulePtr = &M;
@@ -239,8 +230,17 @@ bool MSILWriter::isZeroValue(const Value* V) {
std::string MSILWriter::getValueName(const Value* V) {
+ std::string Name;
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+ Name = Mang->getMangledName(GV);
+ else {
+ unsigned &No = AnonValueNumbers[V];
+ if (No == 0) No = ++NextAnonValueNumber;
+ Name = "tmp" + utostr(No);
+ }
+
  // Quoting the name allows control and space characters.
- return "'"+Mang->getValueName(V)+"'";
+ return "'"+Name+"'";
}
@@ -257,11 +257,20 @@ std::string MSILWriter::getLabelName(const std::string& Name) {
std::string MSILWriter::getLabelName(const Value* V) {
- return getLabelName(Mang->getValueName(V));
+ std::string Name;
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+ Name = Mang->getMangledName(GV);
+ else {
+ unsigned &No = AnonValueNumbers[V];
+ if (No == 0) No = ++NextAnonValueNumber;
+ Name = "tmp" + utostr(No);
+ }
+
+ return getLabelName(Name);
}
-std::string MSILWriter::getConvModopt(unsigned CallingConvID) {
+std::string MSILWriter::getConvModopt(CallingConv::ID CallingConvID) {
switch (CallingConvID) {
case CallingConv::C:
case CallingConv::Cold:
@@ -272,8 +281,8 @@ std::string MSILWriter::getConvModopt(unsigned CallingConvID) {
case CallingConv::X86_StdCall:
return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) ";
default:
- cerr << "CallingConvID = " << CallingConvID << '\n';
- assert(0 && "Unsupported calling convention");
+ errs() << "CallingConvID = " << CallingConvID << '\n';
+ llvm_unreachable("Unsupported calling convention");
}
return ""; // Not reached
}
@@ -318,8 +327,8 @@ std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) {
case Type::DoubleTyID:
return "float64 ";
default:
- cerr << "Type = " << *Ty << '\n';
- assert(0 && "Invalid primitive type");
+ errs() << "Type = " << *Ty << '\n';
+ llvm_unreachable("Invalid primitive type");
}
return ""; // Not reached
}
@@ -346,8 +355,8 @@ std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned,
return getArrayTypeName(Ty->getTypeID(),Ty);
return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
default:
- cerr << "Type = " << *Ty << '\n';
- assert(0 && "Invalid type in getTypeName()");
+ errs() << "Type = " << *Ty << '\n';
+ llvm_unreachable("Invalid type in getTypeName()");
}
return ""; // Not reached
}
@@ -390,8 +399,8 @@ std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand,
case Type::PointerTyID:
return "i"+utostr(TD->getTypeAllocSize(Ty));
default:
- cerr << "TypeID = " << Ty->getTypeID() << '\n';
- assert(0 && "Invalid type in TypeToPostfix()");
+ errs() << "TypeID = " << Ty->getTypeID() << '\n';
+ llvm_unreachable("Invalid type in TypeToPostfix()");
}
return ""; // Not reached
}
@@ -406,7 +415,7 @@ void MSILWriter::printConvToPtr() {
printSimpleInstruction("conv.u8");
break;
default:
- assert(0 && "Module use not supporting pointer size");
+ llvm_unreachable("Module use not supporting pointer size");
}
}
@@ -417,15 +426,15 @@ void MSILWriter::printPtrLoad(uint64_t N) {
printSimpleInstruction("ldc.i4",utostr(N).c_str());
// FIXME: Need overflow test?
if (!isUInt32(N)) {
- cerr << "Value = " << utostr(N) << '\n';
- assert(0 && "32-bit pointer overflowed");
+ errs() << "Value = " << utostr(N) << '\n';
+ llvm_unreachable("32-bit pointer overflowed");
}
break;
case Module::Pointer64:
printSimpleInstruction("ldc.i8",utostr(N).c_str());
break;
default:
- assert(0 && "Module use not supporting pointer size");
+ llvm_unreachable("Module use not supporting pointer size");
}
}
@@ -460,8 +469,8 @@ void MSILWriter::printConstLoad(const Constant* C) {
// Undefined constant value = NULL.
printPtrLoad(0);
} else {
- cerr << "Constant = " << *C << '\n';
- assert(0 && "Invalid constant value");
+ errs() << "Constant = " << *C << '\n';
+ llvm_unreachable("Invalid constant value");
}
Out << '\n';
}
@@ -509,8 +518,8 @@ void MSILWriter::printValueLoad(const Value* V) {
printConstantExpr(cast<ConstantExpr>(V));
break;
default:
- cerr << "Value = " << *V << '\n';
- assert(0 && "Invalid value location");
+ errs() << "Value = " << *V << '\n';
+ llvm_unreachable("Invalid value location");
}
}
@@ -524,8 +533,8 @@ void MSILWriter::printValueSave(const Value* V) {
printSimpleInstruction("stloc",getValueName(V).c_str());
break;
default:
- cerr << "Value = " << *V << '\n';
- assert(0 && "Invalid value location");
+ errs() << "Value = " << *V << '\n';
+ llvm_unreachable("Invalid value location");
}
}
@@ -651,12 +660,19 @@ void MSILWriter::printIndirectSave(const Type* Ty) {
void MSILWriter::printCastInstruction(unsigned int Op, const Value* V,
- const Type* Ty) {
+ const Type* Ty, const Type* SrcTy) {
std::string Tmp("");
printValueLoad(V);
switch (Op) {
// Signed
case Instruction::SExt:
+    // If sign-extending an int, first convert from unsigned to signed
+    // with the same bit size; otherwise we would lose the sign.
+ if (SrcTy) {
+ Tmp = "conv."+getTypePostfix(SrcTy,false,true);
+ printSimpleInstruction(Tmp.c_str());
+ }
+ // FALLTHROUGH
case Instruction::SIToFP:
case Instruction::FPToSI:
Tmp = "conv."+getTypePostfix(Ty,false,true);
@@ -679,8 +695,8 @@ void MSILWriter::printCastInstruction(unsigned int Op, const Value* V,
// FIXME: meaning that ld*/st* instruction do not change data format.
break;
default:
- cerr << "Opcode = " << Op << '\n';
- assert(0 && "Invalid conversion instruction");
+ errs() << "Opcode = " << Op << '\n';
+ llvm_unreachable("Invalid conversion instruction");
}
}
@@ -770,8 +786,8 @@ void MSILWriter::printFunctionCall(const Value* FnVal,
else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst))
Name = getConvModopt(Invoke->getCallingConv());
else {
- cerr << "Instruction = " << Inst->getName() << '\n';
- assert(0 && "Need \"Invoke\" or \"Call\" instruction only");
+ errs() << "Instruction = " << Inst->getName() << '\n';
+ llvm_unreachable("Need \"Invoke\" or \"Call\" instruction only");
}
if (const Function* F = dyn_cast<Function>(FnVal)) {
// Direct call.
@@ -804,7 +820,8 @@ void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) {
// Save as pointer type "void*"
printValueLoad(Inst->getOperand(1));
printSimpleInstruction("ldloca",Name.c_str());
- printIndirectSave(PointerType::getUnqual(IntegerType::get(8)));
+ printIndirectSave(PointerType::getUnqual(
+ IntegerType::get(Inst->getContext(), 8)));
break;
case Intrinsic::vaend:
// Close argument list handle.
@@ -818,8 +835,8 @@ void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) {
printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator");
break;
default:
- cerr << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n';
- assert(0 && "Invalid intrinsic function");
+ errs() << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n';
+ llvm_unreachable("Invalid intrinsic function");
}
}
@@ -877,12 +894,13 @@ void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left,
break;
case ICmpInst::ICMP_UGT:
printBinaryInstruction("cgt.un",Left,Right);
+ break;
case ICmpInst::ICMP_SGT:
printBinaryInstruction("cgt",Left,Right);
break;
default:
- cerr << "Predicate = " << Predicate << '\n';
- assert(0 && "Invalid icmp predicate");
+ errs() << "Predicate = " << Predicate << '\n';
+ llvm_unreachable("Invalid icmp predicate");
}
}
@@ -976,7 +994,7 @@ void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left,
printSimpleInstruction("or");
break;
default:
- assert(0 && "Illegal FCmp predicate");
+ llvm_unreachable("Illegal FCmp predicate");
}
}
@@ -1024,7 +1042,8 @@ void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) {
"instance typedref [mscorlib]System.ArgIterator::GetNextArg()");
printSimpleInstruction("refanyval","void*");
std::string Name =
- "ldind."+getTypePostfix(PointerType::getUnqual(IntegerType::get(8)),false);
+ "ldind."+getTypePostfix(PointerType::getUnqual(
+ IntegerType::get(Inst->getContext(), 8)),false);
printSimpleInstruction(Name.c_str());
}
@@ -1132,9 +1151,13 @@ void MSILWriter::printInstruction(const Instruction* Inst) {
case Instruction::Store:
printIndirectSave(Inst->getOperand(1), Inst->getOperand(0));
break;
+ case Instruction::SExt:
+ printCastInstruction(Inst->getOpcode(),Left,
+ cast<CastInst>(Inst)->getDestTy(),
+ cast<CastInst>(Inst)->getSrcTy());
+ break;
case Instruction::Trunc:
case Instruction::ZExt:
- case Instruction::SExt:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::UIToFP:
@@ -1169,10 +1192,10 @@ void MSILWriter::printInstruction(const Instruction* Inst) {
printAllocaInstruction(cast<AllocaInst>(Inst));
break;
case Instruction::Malloc:
- assert(0 && "LowerAllocationsPass used");
+ llvm_unreachable("LowerAllocationsPass used");
break;
case Instruction::Free:
- assert(0 && "LowerAllocationsPass used");
+ llvm_unreachable("LowerAllocationsPass used");
break;
case Instruction::Unreachable:
printSimpleInstruction("ldstr", "\"Unreachable instruction\"");
@@ -1184,8 +1207,8 @@ void MSILWriter::printInstruction(const Instruction* Inst) {
printVAArgInstruction(cast<VAArgInst>(Inst));
break;
default:
- cerr << "Instruction = " << Inst->getName() << '\n';
- assert(0 && "Unsupported instruction");
+ errs() << "Instruction = " << Inst->getName() << '\n';
+ llvm_unreachable("Unsupported instruction");
}
}
@@ -1216,7 +1239,7 @@ void MSILWriter::printBasicBlock(const BasicBlock* BB) {
// Print instruction
printInstruction(Inst);
// Save result
- if (Inst->getType()!=Type::VoidTy) {
+ if (Inst->getType()!=Type::getVoidTy(BB->getContext())) {
      // Do not save the value after an invoke; that is done in the "try" block
if (Inst->getOpcode()==Instruction::Invoke) continue;
printValueSave(Inst);
@@ -1245,7 +1268,7 @@ void MSILWriter::printLocalVariables(const Function& F) {
Ty = PointerType::getUnqual(AI->getAllocatedType());
Name = getValueName(AI);
Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
- } else if (I->getType()!=Type::VoidTy) {
+ } else if (I->getType()!=Type::getVoidTy(F.getContext())) {
// Operation result.
Ty = I->getType();
Name = getValueName(&*I);
@@ -1372,8 +1395,8 @@ void MSILWriter::printConstantExpr(const ConstantExpr* CE) {
printBinaryInstruction("shr",left,right);
break;
default:
- cerr << "Expression = " << *CE << "\n";
- assert(0 && "Invalid constant expression");
+ errs() << "Expression = " << *CE << "\n";
+ llvm_unreachable("Invalid constant expression");
}
}
@@ -1406,8 +1429,8 @@ void MSILWriter::printStaticInitializerList() {
postfix = "stind."+postfix;
printSimpleInstruction(postfix.c_str());
} else {
- cerr << "Constant = " << *I->constant << '\n';
- assert(0 && "Invalid static initializer");
+ errs() << "Constant = " << *I->constant << '\n';
+ llvm_unreachable("Invalid static initializer");
}
}
}
@@ -1470,8 +1493,8 @@ unsigned int MSILWriter::getBitWidth(const Type* Ty) {
case 64:
return N;
default:
- cerr << "Bits = " << N << '\n';
- assert(0 && "Unsupported integer width");
+ errs() << "Bits = " << N << '\n';
+ llvm_unreachable("Unsupported integer width");
}
return 0; // Not reached
}
@@ -1528,12 +1551,12 @@ void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) {
// Null pointer initialization
if (TySize==4) Out << "int32 (0)";
else if (TySize==8) Out << "int64 (0)";
- else assert(0 && "Invalid pointer size");
+ else llvm_unreachable("Invalid pointer size");
}
break;
default:
- cerr << "TypeID = " << Ty->getTypeID() << '\n';
- assert(0 && "Invalid type in printStaticConstant()");
+ errs() << "TypeID = " << Ty->getTypeID() << '\n';
+ llvm_unreachable("Invalid type in printStaticConstant()");
}
// Increase offset.
Offset += TySize;
@@ -1555,8 +1578,8 @@ void MSILWriter::printStaticInitializer(const Constant* C,
Out << getTypeName(C->getType());
break;
default:
- cerr << "Type = " << *C << "\n";
- assert(0 && "Invalid constant type");
+ errs() << "Type = " << *C << "\n";
+ llvm_unreachable("Invalid constant type");
}
// Print initializer
std::string label = Name;
@@ -1595,17 +1618,18 @@ void MSILWriter::printGlobalVariables() {
const char* MSILWriter::getLibraryName(const Function* F) {
- return getLibraryForSymbol(F->getName().c_str(), true, F->getCallingConv());
+ return getLibraryForSymbol(F->getName(), true, F->getCallingConv());
}
const char* MSILWriter::getLibraryName(const GlobalVariable* GV) {
- return getLibraryForSymbol(Mang->getValueName(GV).c_str(), false, 0);
+ return getLibraryForSymbol(Mang->getMangledName(GV), false, CallingConv::C);
}
-const char* MSILWriter::getLibraryForSymbol(const char* Name, bool isFunction,
- unsigned CallingConv) {
+const char* MSILWriter::getLibraryForSymbol(const StringRef &Name,
+ bool isFunction,
+ CallingConv::ID CallingConv) {
// TODO: Read *.def file with function and libraries definitions.
return "MSVCRT.DLL";
}
@@ -1654,11 +1678,10 @@ void MSILWriter::printExternals() {
E = ModulePtr->global_end(); I!=E; ++I) {
if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue;
    // Use "LoadLibrary"/"GetProcAddress" to receive the variable's address.
- std::string Label = "not_null$_"+utostr(getUniqID());
std::string Tmp = getTypeName(I->getType())+getValueName(&*I);
printSimpleInstruction("ldsflda",Tmp.c_str());
Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n";
- Out << "\tldstr\t\"" << Mang->getValueName(&*I) << "\"\n";
+ Out << "\tldstr\t\"" << Mang->getMangledName(&*I) << "\"\n";
printSimpleInstruction("call","void* $MSIL_Import(string,string)");
printIndirectSave(I->getType());
}
@@ -1671,7 +1694,8 @@ void MSILWriter::printExternals() {
// External Interface declaration
//===----------------------------------------------------------------------===//
-bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM, raw_ostream &o,
+bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM,
+ formatted_raw_ostream &o,
CodeGenFileType FileType,
CodeGenOpt::Level OptLevel)
{
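
Mangler::getValueName is gone, so MSILWriter now mangles GlobalValues with
getMangledName and numbers everything else lazily. The numbering idiom from
the hunks above in isolation (Namer is an illustrative stand-in for the
writer's new members):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/StringExtras.h"
    #include <string>
    using namespace llvm;

    struct Namer {
      DenseMap<const void*, unsigned> AnonValueNumbers;
      unsigned NextAnonValueNumber;
      Namer() : NextAnonValueNumber(0) {}

      std::string name(const void *V) {
        // operator[] default-constructs the slot to 0, so the first query
        // assigns the next number and later queries return the same name.
        unsigned &No = AnonValueNumbers[V];
        if (No == 0) No = ++NextAnonValueNumber;
        return "tmp" + utostr(No);
      }
    };
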
diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h
index 45f5579..2280a3b 100644
--- a/lib/Target/MSIL/MSILWriter.h
+++ b/lib/Target/MSIL/MSILWriter.h
@@ -13,24 +13,24 @@
#ifndef MSILWRITER_H
#define MSILWRITER_H
+#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/Module.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/FindUsedTypes.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetMachineRegistry.h"
#include "llvm/Support/Mangler.h"
-#include <ios>
-using namespace llvm;
-namespace {
+namespace llvm {
+ extern Target TheMSILTarget;
class MSILModule : public ModulePass {
Module *ModulePtr;
@@ -56,7 +56,7 @@ namespace {
};
- class MSILWriter : public FunctionPass {
+ class MSILWriter : public FunctionPass {
struct StaticInitializer {
const Constant* constant;
uint64_t offset;
@@ -75,7 +75,7 @@ namespace {
}
public:
- raw_ostream &Out;
+ formatted_raw_ostream &Out;
Module* ModulePtr;
const TargetData* TD;
Mangler* Mang;
@@ -85,7 +85,11 @@ namespace {
StaticInitList;
const std::set<const Type *>* UsedTypes;
static char ID;
- MSILWriter(raw_ostream &o) : FunctionPass(&ID), Out(o) {
+ DenseMap<const Value*, unsigned> AnonValueNumbers;
+ unsigned NextAnonValueNumber;
+
+ MSILWriter(formatted_raw_ostream &o) : FunctionPass(&ID), Out(o),
+ NextAnonValueNumber(0) {
UniqID = 0;
}
@@ -130,7 +134,7 @@ namespace {
std::string getLabelName(const std::string& Name);
- std::string getConvModopt(unsigned CallingConvID);
+ std::string getConvModopt(CallingConv::ID CallingConvID);
std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty);
@@ -183,7 +187,7 @@ namespace {
void printIndirectSave(const Type* Ty);
void printCastInstruction(unsigned int Op, const Value* V,
- const Type* Ty);
+ const Type* Ty, const Type* SrcTy=0);
void printGepInstruction(const Value* V, gep_type_iterator I,
gep_type_iterator E);
@@ -244,11 +248,12 @@ namespace {
const char* getLibraryName(const GlobalVariable* GV);
- const char* getLibraryForSymbol(const char* Name, bool isFunction,
- unsigned CallingConv);
+ const char* getLibraryForSymbol(const StringRef &Name, bool isFunction,
+ CallingConv::ID CallingConv);
void printExternals();
};
+
}
#endif
diff --git a/lib/Target/MSIL/Makefile b/lib/Target/MSIL/Makefile
index 94265ed..8057cc7 100644
--- a/lib/Target/MSIL/Makefile
+++ b/lib/Target/MSIL/Makefile
@@ -9,6 +9,9 @@
LEVEL = ../../..
LIBRARYNAME = LLVMMSIL
+
+DIRS = TargetInfo
+
include $(LEVEL)/Makefile.common
CompileCommonOpts := $(CompileCommonOpts) -Wno-format
diff --git a/lib/Target/MSIL/TargetInfo/CMakeLists.txt b/lib/Target/MSIL/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..9f0c3a0
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSILInfo
+ MSILTargetInfo.cpp
+ )
+
diff --git a/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp b/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp
new file mode 100644
index 0000000..dfd4281
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MSILTargetInfo.cpp - MSIL Target Implementation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSILWriter.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSILTarget;
+
+static unsigned MSIL_TripleMatchQuality(const std::string &TT) {
+ // This target always works, but shouldn't be the default in most cases.
+ return 1;
+}
+
+extern "C" void LLVMInitializeMSILTargetInfo() {
+ TargetRegistry::RegisterTarget(TheMSILTarget, "msil",
+ "MSIL backend",
+ &MSIL_TripleMatchQuality);
+}
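
For orientation, a minimal sketch (not part of the patch) of how a client could locate the target registered above, assuming the TargetRegistry iterator interface of this period; the helper name is hypothetical:

#include "llvm/Target/TargetRegistry.h"
#include <cstring>

// Hypothetical helper: walk the registry and match the "msil" name that
// RegisterTarget installed above. Returns null if
// LLVMInitializeMSILTargetInfo() was never called.
static const llvm::Target *findMSILTarget() {
  for (llvm::TargetRegistry::iterator I = llvm::TargetRegistry::begin(),
                                      E = llvm::TargetRegistry::end();
       I != E; ++I)
    if (std::strcmp(I->getName(), "msil") == 0)
      return &*I;
  return 0; // not registered
}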
diff --git a/lib/Target/MSIL/TargetInfo/Makefile b/lib/Target/MSIL/TargetInfo/Makefile
new file mode 100644
index 0000000..30b0950
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSIL/TargetInfo/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSILInfo
+
+# Hack: we need to include the 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/AsmPrinter/CMakeLists.txt b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..6e66887
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSP430AsmPrinter
+ MSP430AsmPrinter.cpp
+ )
+add_dependencies(LLVMMSP430AsmPrinter MSP430CodeGenTable_gen)
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
new file mode 100644
index 0000000..852019f
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
@@ -0,0 +1,281 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MCAsmInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN MSP430AsmPrinter : public AsmPrinter {
+ public:
+ MSP430AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *MAI, bool V)
+ : AsmPrinter(O, TM, MAI, V) {}
+
+ virtual const char *getPassName() const {
+ return "MSP430 Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printCCOperand(const MachineInstr *MI, int OpNum);
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printMachineInstruction(const MachineInstr * MI);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &F);
+
+ virtual void PrintGlobalVariable(const GlobalVariable *GV) {
+ // FIXME: No support for global variables?
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ }
+ };
+} // end of anonymous namespace
+
+#include "MSP430GenAsmWriter.inc"
+
+
+void MSP430AsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+
+ unsigned FnAlign = MF.getAlignment();
+ EmitAlignment(FnAlign, F);
+
+ switch (F->getLinkage()) {
+ default: llvm_unreachable("Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ case Function::LinkerPrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ",@function\n"
+ << CurrentFnName << ":\n";
+}
+
+bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print the 'header' of the function.
+ emitFunctionHeader(MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ EmitBasicBlockStart(I);
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II)
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ // We didn't modify anything
+ return false;
+}
+
+void MSP430AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ processDebugLoc(MI, true);
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+
+ processDebugLoc(MI, false);
+}
+
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << getRegisterName(MO.getReg());
+ return;
+ case MachineOperand::MO_Immediate:
+ if (!Modifier || strcmp(Modifier, "nohash"))
+ O << '#';
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ std::string Name = Mang->getMangledName(MO.getGlobal());
+ uint64_t Offset = MO.getOffset();
+
+ O << (isMemOp ? '&' : '#');
+ if (Offset)
+ O << '(' << Offset << '+';
+
+ O << Name;
+ if (Offset)
+ O << ')';
+
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ std::string Name(MAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+
+ O << (isMemOp ? '&' : '#') << Name;
+
+ return;
+ }
+ default:
+ llvm_unreachable("Not implemented yet!");
+ }
+}
+
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &Base = MI->getOperand(OpNum);
+ const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+ if (Base.isGlobal())
+ printOperand(MI, OpNum, "mem");
+ else if (Disp.isImm() && !Base.getReg())
+ printOperand(MI, OpNum);
+ else if (Base.getReg()) {
+ if (Disp.getImm()) {
+ printOperand(MI, OpNum + 1, "nohash");
+ O << '(';
+ printOperand(MI, OpNum);
+ O << ')';
+ } else {
+ O << '@';
+ printOperand(MI, OpNum);
+ }
+ } else
+ llvm_unreachable("Unsupported memory operand");
+}
+
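The cases above correspond to MSP430 addressing-mode syntax: absolute '&sym' for globals, indexed 'disp(Rn)' for base plus displacement, and register-indirect '@Rn'. As an illustration only, a standalone C++ sketch of the same dispatch with hypothetical inputs:

#include <iostream>
#include <string>

// Mirrors printSrcMemOperand: global -> '&sym', bare immediate -> '#imm',
// base register plus displacement -> 'disp(Rn)', base alone -> '@Rn'.
static void printSrcMemSketch(bool isGlobal, unsigned baseReg, int disp,
                              const std::string &sym) {
  if (isGlobal)
    std::cout << '&' << sym << '\n';                // e.g. &foo
  else if (baseReg == 0)
    std::cout << '#' << disp << '\n';               // plain immediate address
  else if (disp != 0)
    std::cout << disp << "(r" << baseReg << ")\n";  // e.g. 4(r12)
  else
    std::cout << "@r" << baseReg << '\n';           // e.g. @r12
}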
+void MSP430AsmPrinter::printCCOperand(const MachineInstr *MI, int OpNum) {
+ unsigned CC = MI->getOperand(OpNum).getImm();
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported CC code");
+ break;
+ case MSP430::COND_E:
+ O << "eq";
+ break;
+ case MSP430::COND_NE:
+ O << "ne";
+ break;
+ case MSP430::COND_HS:
+ O << "hs";
+ break;
+ case MSP430::COND_LO:
+ O << "lo";
+ break;
+ case MSP430::COND_GE:
+ O << "ge";
+ break;
+ case MSP430::COND_L:
+ O << 'l';
+ break;
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ return true; // Unknown modifier.
+ }
+ printSrcMemOperand(MI, OpNo);
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+ RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
+}
diff --git a/lib/Target/MSP430/AsmPrinter/Makefile b/lib/Target/MSP430/AsmPrinter/Makefile
new file mode 100644
index 0000000..4f340c6
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSP430/AsmPrinter/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSP430AsmPrinter
+
+# Hack: we need to include the 'main' MSP430 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 6701773..60e0bb1 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -10,14 +10,14 @@ tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
tablegen(MSP430GenCallingConv.inc -gen-callingconv)
tablegen(MSP430GenSubtarget.inc -gen-subtarget)
-add_llvm_target(MSP430
- MSP430AsmPrinter.cpp
- MSP430FrameInfo.cpp
+add_llvm_target(MSP430CodeGen
MSP430InstrInfo.cpp
MSP430ISelDAGToDAG.cpp
MSP430ISelLowering.cpp
+ MSP430MCAsmInfo.cpp
MSP430RegisterInfo.cpp
MSP430Subtarget.cpp
- MSP430TargetAsmInfo.cpp
MSP430TargetMachine.cpp
)
+
+target_link_libraries (LLVMMSP430CodeGen LLVMSelectionDAG)
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index fc13c9e..d9f5f86 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h
@@ -20,13 +20,13 @@
namespace llvm {
class MSP430TargetMachine;
class FunctionPass;
- class raw_ostream;
+ class formatted_raw_ostream;
FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
CodeGenOpt::Level OptLevel);
- FunctionPass *createMSP430CodePrinterPass(raw_ostream &o,
- MSP430TargetMachine &tm,
- bool verbose);
+
+ extern Target TheMSP430Target;
+
} // end namespace llvm;
// Defines symbolic names for MSP430 registers.
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index bf49ec0..4195a88 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -28,8 +28,14 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+
using namespace llvm;
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
/// instructions for SelectionDAG operations.
///
@@ -50,10 +56,15 @@ namespace {
return "MSP430 DAG->DAG Pattern Instruction Selection";
}
+ virtual bool
+ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
// Include the pieces autogenerated from the target description.
#include "MSP430GenDAGISel.inc"
private:
+ void PreprocessForRMW();
SDNode *Select(SDValue Op);
bool SelectAddr(SDValue Op, SDValue Addr, SDValue &Base, SDValue &Disp);
@@ -120,21 +131,155 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue Op, SDValue Addr,
}
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(Op, Op, Op0, Op1))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+}
+
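With SelectInlineAsmMemoryOperand above in place, inline asm 'm' operands are selected through SelectAddr into a base/displacement pair. A hedged source-level sketch of what this enables; BIC (bit clear) is a real MSP430 instruction, but the exact template and output are assumptions:

// 'm' binds g as a memory operand; the hook above lowers it to the
// base+displacement form the asm printer can emit directly.
unsigned short g;

void clearLowBit() {
  asm("bic.w #1, %0" : "+m"(g)); // read-modify-write straight in memory
}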
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.
+/// Shamelessly stolen from X86.
+static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Store, SDValue TF) {
+ SmallVector<SDValue, 4> Ops;
+ bool isRMW = false;
+ SDValue TF0, TF1, NewTF;
+ for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
+ if (Load.getNode() == TF.getOperand(i).getNode()) {
+ TF0 = Load.getOperand(0);
+ Ops.push_back(TF0);
+ } else {
+ TF1 = TF.getOperand(i);
+ Ops.push_back(TF1);
+ if (LoadSDNode* LD = dyn_cast<LoadSDNode>(TF1))
+ isRMW = !LD->isVolatile();
+ }
+
+ if (isRMW && TF1.getOperand(0).getNode() == TF0.getNode())
+ NewTF = TF0;
+ else
+ NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+
+ SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
+ Load.getOperand(1),
+ Load.getOperand(2));
+ CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
+ Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isRMWLoad - Return true if N is a load that's part of an RMW sub-DAG. The chain
+/// produced by the load must only be used by the store's chain operand,
+/// otherwise this may produce a cycle in the DAG.
+/// Shamelessly stolen from X86. FIXME: Should we make this function common?
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+ SDValue &Load) {
+ if (N.getOpcode() == ISD::BIT_CONVERT)
+ N = N.getOperand(0);
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD || LD->isVolatile())
+ return false;
+ if (LD->getAddressingMode() != ISD::UNINDEXED)
+ return false;
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
+ return false;
+
+ if (N.hasOneUse() &&
+ LD->hasNUsesOfValue(1, 1) &&
+ N.getOperand(1) == Address &&
+ LD->isOperandOf(Chain.getNode())) {
+ Load = N;
+ return true;
+ }
+ return false;
+}
+
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// Shamelessly stolen from X86.
+void MSP430DAGToDAGISel::PreprocessForRMW() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
+ if (!ISD::isNON_TRUNCStore(I))
+ continue;
+
+ SDValue Chain = I->getOperand(0);
+ if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
+ continue;
+
+ SDValue N1 = I->getOperand(1); // Value to store
+ SDValue N2 = I->getOperand(2); // Address of store
+
+ if (!N1.hasOneUse())
+ continue;
+
+ bool RModW = false;
+ SDValue Load;
+ unsigned Opcode = N1.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADDC:
+ case ISD::ADDE: {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+
+ if (!RModW && isRMWLoad(N11, Chain, N2, Load)) {
+ // Swap the operands; making the RMW load the first operand seems
+ // to help selection and prevent token chain loops.
+ N1 = CurDAG->UpdateNodeOperands(N1, N11, N10);
+ RModW = true;
+ }
+ break;
+ }
+ case ISD::SUB:
+ case ISD::SUBC:
+ case ISD::SUBE: {
+ SDValue N10 = N1.getOperand(0);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+ break;
+ }
+ }
+
+ if (RModW) {
+ MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
+ ++NumLoadMoved;
+ }
+ }
+}
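The payoff of this preprocessing: a read-modify-write on memory can be selected as one memory-destination instruction instead of a separate load, op, and store. A hedged source-level example; the instruction named is an assumption, and volatile accesses are deliberately rejected by isRMWLoad above:

unsigned short flags;           // ordinary memory: eligible for folding
volatile unsigned short hwReg;  // volatile: isRMWLoad refuses it

void setBits(unsigned short mask) {
  flags |= mask; // load+or+store can fold into a 'bis.w'-style RMW op
  hwReg |= mask; // remains an explicit load, or, store sequence
}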
/// InstructionSelect - This callback is invoked by
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
void MSP430DAGToDAGISel::InstructionSelect() {
+ PreprocessForRMW();
+
+ DEBUG(errs() << "Selection DAG after RMW preprocessing:\n");
+ DEBUG(CurDAG->dump());
+
DEBUG(BB->dump());
// Codegen the basic block.
-#ifndef NDEBUG
- DOUT << "===== Instruction selection begins:\n";
- Indent = 0;
-#endif
+ DEBUG(errs() << "===== Instruction selection begins:\n");
+ DEBUG(Indent = 0);
SelectRoot(*CurDAG);
-#ifndef NDEBUG
- DOUT << "===== Instruction selection ends:\n";
-#endif
+ DEBUG(errs() << "===== Instruction selection ends:\n");
CurDAG->RemoveDeadNodes();
}
@@ -144,21 +289,17 @@ SDNode *MSP430DAGToDAGISel::Select(SDValue Op) {
DebugLoc dl = Op.getDebugLoc();
// Dump information about the Node being selected
- #ifndef NDEBUG
- DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(errs().indent(Indent) << "Selecting: ");
DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
- Indent += 2;
- #endif
+ DEBUG(errs() << "\n");
+ DEBUG(Indent += 2);
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- #ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "== ";
- DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
- Indent -= 2;
- #endif
+ DEBUG(errs().indent(Indent-2) << "== ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ DEBUG(Indent -= 2);
return NULL;
}
@@ -172,23 +313,21 @@ SDNode *MSP430DAGToDAGISel::Select(SDValue Op) {
if (Node->hasOneUse())
return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16,
TFI, CurDAG->getTargetConstant(0, MVT::i16));
- return CurDAG->getTargetNode(MSP430::ADD16ri, dl, MVT::i16,
- TFI, CurDAG->getTargetConstant(0, MVT::i16));
+ return CurDAG->getMachineNode(MSP430::ADD16ri, dl, MVT::i16,
+ TFI, CurDAG->getTargetConstant(0, MVT::i16));
}
}
// Select the default instruction
SDNode *ResNode = SelectCode(Op);
- #ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(errs() << std::string(Indent-2, ' ') << "=> ");
if (ResNode == NULL || ResNode == Op.getNode())
DEBUG(Op.getNode()->dump(CurDAG));
else
DEBUG(ResNode->dump(CurDAG));
- DOUT << "\n";
- Indent -= 2;
- #endif
+ DEBUG(errs() << "\n");
+ DEBUG(Indent -= 2);
return ResNode;
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 91a8663..b56f069 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -31,12 +31,16 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/VectorExtras.h"
using namespace llvm;
MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
- TargetLowering(tm), Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+ TargetLowering(tm, new TargetLoweringObjectFileELF()),
+ Subtarget(*tm.getSubtargetImpl()), TM(tm) {
// Set up the register classes.
addRegisterClass(MVT::i8, MSP430::GR8RegisterClass);
@@ -77,7 +81,6 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
setOperationAction(ISD::ROTR, MVT::i8, Expand);
setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::RET, MVT::Other, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
@@ -92,6 +95,24 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+ setOperationAction(ISD::CTTZ, MVT::i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i8, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i16, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i8, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i16, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i8, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// FIXME: Implement efficiently multiplication by a constant
setOperationAction(ISD::MUL, MVT::i16, Expand);
@@ -110,19 +131,16 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
SDValue MSP430TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::SHL: // FALLTHROUGH
case ISD::SRL:
case ISD::SRA: return LowerShifts(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
default:
- assert(0 && "unimplemented operand");
+ llvm_unreachable("unimplemented operand");
return SDValue();
}
}
@@ -133,32 +151,84 @@ unsigned MSP430TargetLowering::getFunctionAlignment(const Function *F) const {
}
//===----------------------------------------------------------------------===//
+// MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return C_RegisterClass;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+MSP430TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, MSP430::GR8RegisterClass);
+
+ return std::make_pair(0U, MSP430::GR16RegisterClass);
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
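A usage sketch for the 'r' constraint handled above: i8 operands are placed in GR8, everything else in GR16. SWPB (swap bytes) is a real MSP430 instruction, but the template itself is illustrative:

unsigned short byteSwap(unsigned short v) {
  unsigned short r;
  // 'r' selects a GR16 register for this 16-bit value; the "0" constraint
  // ties the input to the same register as the output.
  asm("swpb %0" : "=r"(r) : "0"(v));
  return r;
}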
+//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "MSP430GenCallingConv.inc"
-SDValue MSP430TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
- SelectionDAG &DAG) {
- unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch (CC) {
+SDValue
+MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ switch (CallConv) {
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
- return LowerCCCArguments(Op, DAG);
+ return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
}
}
-SDValue MSP430TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- unsigned CallingConv = TheCall->getCallingConv();
- switch (CallingConv) {
+SDValue
+MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ switch (CallConv) {
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::Fast:
case CallingConv::C:
- return LowerCCCCallTo(Op, DAG, CallingConv);
+ return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+ Outs, Ins, dl, DAG, InVals);
}
}
@@ -166,40 +236,46 @@ SDValue MSP430TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
/// generate load operations for arguments placed on the stack.
// FIXME: struct return stuff
// FIXME: varargs
-SDValue MSP430TargetLowering::LowerCCCArguments(SDValue Op,
- SelectionDAG &DAG) {
+SDValue
+MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = MF.getFunction()->getCallingConv();
- DebugLoc dl = Op.getDebugLoc();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_MSP430);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
assert(!isVarArg && "Varargs not supported yet");
- SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isRegLoc()) {
// Arguments passed in registers
- MVT RegVT = VA.getLocVT();
- switch (RegVT.getSimpleVT()) {
- default:
- cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
- << RegVT.getSimpleVT()
- << "\n";
- abort();
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
+ default:
+ {
+#ifndef NDEBUG
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+ llvm_unreachable(0);
+ }
case MVT::i16:
unsigned VReg =
RegInfo.createVirtualRegister(MSP430::GR16RegisterClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
- SDValue ArgValue = DAG.getCopyFromReg(Root, dl, VReg, RegVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
// If this is an 8-bit value, it is really passed promoted to 16
// bits. Insert an assert[sz]ext to capture this, then truncate to the
@@ -214,7 +290,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments(SDValue Op,
if (VA.getLocInfo() != CCValAssign::Full)
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
- ArgValues.push_back(ArgValue);
+ InVals.push_back(ArgValue);
}
} else {
// Sanity check
@@ -222,8 +298,8 @@ SDValue MSP430TargetLowering::LowerCCCArguments(SDValue Op,
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
if (ObjSize > 2) {
- cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
- << VA.getLocVT().getSimpleVT()
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << VA.getLocVT().getSimpleVT().SimpleTy
<< "\n";
}
// Create the frame index object for this incoming parameter...
@@ -232,30 +308,29 @@ SDValue MSP430TargetLowering::LowerCCCArguments(SDValue Op,
// Create the SelectionDAG nodes corresponding to a load
// from this parameter
SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
- ArgValues.push_back(DAG.getLoad(VA.getLocVT(), dl, Root, FIN,
- PseudoSourceValue::getFixedStack(FI), 0));
+ InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0));
}
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
-SDValue MSP430TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
// CCValAssign - represents the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- // Analize return values of ISD::RET
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_MSP430);
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
@@ -265,8 +340,6 @@ SDValue MSP430TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- // The chain is always operand #0
- SDValue Chain = Op.getOperand(0);
SDValue Flag;
// Copy the result values into the output registers.
@@ -274,10 +347,8 @@ SDValue MSP430TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
// Guarantee that all emitted copies are stuck together,
// avoiding something bad.
@@ -294,19 +365,21 @@ SDValue MSP430TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
/// LowerCCCCallTo - function arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
/// TODO: sret.
-SDValue MSP430TargetLowering::LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
- unsigned CC) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
-
+SDValue
+MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg>
+ &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeCallOperands(TheCall, CC_MSP430);
+ CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -322,12 +395,11 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- // Arguments start after the 5 first operands of ISD::CALL
- SDValue Arg = TheCall->getArg(i);
+ SDValue Arg = Outs[i].Val;
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
@@ -412,50 +484,43 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
- Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+ DAG, InVals);
}
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. Returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode*
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallSDNode *TheCall,
- unsigned CallingConv,
- SelectionDAG &DAG) {
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = TheCall->getDebugLoc();
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- CCInfo.AnalyzeCallResult(TheCall, RetCC_MSP430);
- SmallVector<SDValue, 8> ResultVals;
+ CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
RVLocs[i].getValVT(), InFlag).getValue(1);
InFlag = Chain.getValue(2);
- ResultVals.push_back(Chain.getValue(0));
+ InVals.push_back(Chain.getValue(0));
}
- ResultVals.push_back(Chain);
-
- // Merge everything together with a MERGE_VALUES node.
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
SelectionDAG &DAG) {
unsigned Opc = Op.getOpcode();
SDNode* N = Op.getNode();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = N->getDebugLoc();
// We currently only lower shifts by a constant amount.
@@ -511,7 +576,7 @@ static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, unsigned &TargetCC,
// FIXME: Handle jump negative someday
TargetCC = MSP430::COND_INVALID;
switch (CC) {
- default: assert(0 && "Invalid integer condition!");
+ default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ:
TargetCC = MSP430::COND_E; // aka COND_Z
break;
@@ -585,7 +650,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
SelectionDAG &DAG) {
SDValue Val = Op.getOperand(0);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
assert(VT == MVT::i16 && "Only support i16 for now!");
@@ -616,7 +681,8 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
MachineBasicBlock*
MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
assert((MI->getOpcode() == MSP430::Select16 ||
@@ -646,6 +712,10 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addImm(MI->getOperand(3).getImm());
F->insert(I, copy0MBB);
F->insert(I, copy1MBB);
+ // Inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ EM->insert(std::make_pair(*SI, copy1MBB));
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi node for the select.
copy1MBB->transferSuccessors(BB);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 4a90a0e..fdbc384 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -33,7 +33,7 @@ namespace llvm {
/// Y = RRC X, rotate right via carry
RRC,
- /// CALL/TAILCALL - These operations represent an abstract call
+ /// CALL - These operations represent an abstract call
/// instruction, which includes a bunch of information.
CALL,
@@ -77,10 +77,6 @@ namespace llvm {
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCCCArguments(SDValue Op, SelectionDAG &DAG);
SDValue LowerShifts(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
@@ -88,16 +84,58 @@ namespace llvm {
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCCCCallTo(SDValue Op, SelectionDAG &DAG,
- unsigned CC);
- SDNode* LowerCallResult(SDValue Chain, SDValue InFlag,
- CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
+ TargetLowering::ConstraintType
+ getConstraintType(const std::string &Constraint) const;
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
private:
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ SDValue LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
const MSP430Subtarget &Subtarget;
const MSP430TargetMachine &TM;
};
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 91112c3..37fbb6d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -44,7 +45,7 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIdx).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
else
- assert(0 && "Cannot store this register to stack slot!");
+ llvm_unreachable("Cannot store this register to stack slot!");
}
void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -61,7 +62,7 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
.addReg(DestReg).addFrameIndex(FrameIdx).addImm(0);
else
- assert(0 && "Cannot store this register to stack slot!");
+ llvm_unreachable("Cannot store this register to stack slot!");
}
bool MSP430InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -171,7 +172,7 @@ MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
// Conditional branch.
unsigned Count = 0;
- assert(0 && "Implement conditional branches!");
+ llvm_unreachable("Implement conditional branches!");
return Count;
}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 39c08e4..f7e0d2b 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -155,7 +155,7 @@ let isCall = 1 in
let Defs = [R12W, R13W, R14W, R15W, SRW],
Uses = [SPW] in {
def CALLi : Pseudo<(outs), (ins i16imm:$dst, variable_ops),
- "call\t${dst:call}", [(MSP430call imm:$dst)]>;
+ "call\t$dst", [(MSP430call imm:$dst)]>;
def CALLr : Pseudo<(outs), (ins GR16:$dst, variable_ops),
"call\t$dst", [(MSP430call GR16:$dst)]>;
def CALLm : Pseudo<(outs), (ins memsrc:$dst, variable_ops),
@@ -243,6 +243,13 @@ def MOV16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
"mov.w\t{$src, $dst}",
[(store GR16:$src, addr:$dst)]>;
+def MOV8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "mov.b\t{$src, $dst}",
+ [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
+ "mov.w\t{$src, $dst}",
+ [(store (i16 (load addr:$src)), addr:$dst)]>;
+
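These memory-to-memory moves complement the RMW preprocessing added in MSP430ISelDAGToDAG.cpp: a plain copy between two memory locations can now match one instruction instead of a load into a register followed by a store. A hedged example; the emitted assembly is an assumption:

unsigned short src, dst;

void copyWord() {
  dst = src; // intended to match MOV16mm, e.g. a single 'mov.w &src, &dst'
}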
//===----------------------------------------------------------------------===//
// Arithmetic Instructions
@@ -671,30 +678,26 @@ def OR16rm : Pseudo<(outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
let isTwoAddress = 0 in {
def OR8mr : Pseudo<(outs), (ins memdst:$dst, GR8:$src),
"bis.b\t{$src, $dst}",
- [(store (or (load addr:$dst), GR8:$src), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
def OR16mr : Pseudo<(outs), (ins memdst:$dst, GR16:$src),
"bis.w\t{$src, $dst}",
- [(store (or (load addr:$dst), GR16:$src), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
def OR8mi : Pseudo<(outs), (ins memdst:$dst, i8imm:$src),
"bis.b\t{$src, $dst}",
- [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def OR16mi : Pseudo<(outs), (ins memdst:$dst, i16imm:$src),
"bis.w\t{$src, $dst}",
- [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
def OR8mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
"bis.b\t{$src, $dst}",
- [(store (or (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (i8 (load addr:$dst)),
+ (i8 (load addr:$src))), addr:$dst)]>;
def OR16mm : Pseudo<(outs), (ins memdst:$dst, memsrc:$src),
"bis.w\t{$src, $dst}",
- [(store (or (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
- (implicit SRW)]>;
+ [(store (or (i16 (load addr:$dst)),
+ (i16 (load addr:$src))), addr:$dst)]>;
}
} // isTwoAddress = 1
@@ -722,59 +725,6 @@ def CMP16im : Pseudo<(outs), (ins i16imm:$src1, memsrc:$src2),
"cmp.w\t{$src1, $src2}",
[(MSP430cmp (i16 imm:$src1), (load addr:$src2)), (implicit SRW)]>;
-// FIXME: imm is allowed only on src operand, not on dst.
-
-//def CMP8ri : Pseudo<(outs), (ins GR8:$src1, i8imm:$src2),
-// "cmp.b\t{$src1, $src2}",
-// [(MSP430cmp GR8:$src1, imm:$src2), (implicit SRW)]>;
-//def CMP16ri : Pseudo<(outs), (ins GR16:$src1, i16imm:$src2),
-// "cmp.w\t{$src1, $src2}",
-// [(MSP430cmp GR16:$src1, imm:$src2), (implicit SRW)]>;
-
-//def CMP8mi : Pseudo<(outs), (ins memsrc:$src1, i8imm:$src2),
-// "cmp.b\t{$src1, $src2}",
-// [(MSP430cmp (load addr:$src1), (i8 imm:$src2)), (implicit SRW)]>;
-//def CMP16mi : Pseudo<(outs), (ins memsrc:$src1, i16imm:$src2),
-// "cmp.w\t{$src1, $src2}",
-// [(MSP430cmp (load addr:$src1), (i16 imm:$src2)), (implicit SRW)]>;
-
-
-// Imm 0, +1, +2, +4, +8 are encoded via constant generator registers.
-// That's why we can use them as dest operands.
-// We don't define new class for them, since they would need special encoding
-// in the future.
-
-def CMP8ri0 : Pseudo<(outs), (ins GR8:$src1),
- "cmp.b\t{$src1, #0}",
- [(MSP430cmp GR8:$src1, 0), (implicit SRW)]>;
-def CMP16ri0: Pseudo<(outs), (ins GR16:$src1),
- "cmp.w\t{$src1, #0}",
- [(MSP430cmp GR16:$src1, 0), (implicit SRW)]>;
-def CMP8ri1 : Pseudo<(outs), (ins GR8:$src1),
- "cmp.b\t{$src1, #1}",
- [(MSP430cmp GR8:$src1, 1), (implicit SRW)]>;
-def CMP16ri1: Pseudo<(outs), (ins GR16:$src1),
- "cmp.w\t{$src1, #1}",
- [(MSP430cmp GR16:$src1, 1), (implicit SRW)]>;
-def CMP8ri2 : Pseudo<(outs), (ins GR8:$src1),
- "cmp.b\t{$src1, #2}",
- [(MSP430cmp GR8:$src1, 2), (implicit SRW)]>;
-def CMP16ri2: Pseudo<(outs), (ins GR16:$src1),
- "cmp.w\t{$src1, #2}",
- [(MSP430cmp GR16:$src1, 2), (implicit SRW)]>;
-def CMP8ri4 : Pseudo<(outs), (ins GR8:$src1),
- "cmp.b\t{$src1, #4}",
- [(MSP430cmp GR8:$src1, 4), (implicit SRW)]>;
-def CMP16ri4: Pseudo<(outs), (ins GR16:$src1),
- "cmp.w\t{$src1, #4}",
- [(MSP430cmp GR16:$src1, 4), (implicit SRW)]>;
-def CMP8ri8 : Pseudo<(outs), (ins GR8:$src1),
- "cmp.b\t{$src1, #8}",
- [(MSP430cmp GR8:$src1, 8), (implicit SRW)]>;
-def CMP16ri8: Pseudo<(outs), (ins GR16:$src1),
- "cmp.w\t{$src1, #8}",
- [(MSP430cmp GR16:$src1, 8), (implicit SRW)]>;
-
def CMP8rm : Pseudo<(outs), (ins GR8:$src1, memsrc:$src2),
"cmp.b\t{$src1, $src2}",
[(MSP430cmp GR8:$src1, (load addr:$src2)), (implicit SRW)]>;
diff --git a/lib/Target/MSP430/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MSP430MCAsmInfo.cpp
new file mode 100644
index 0000000..069313e
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MCAsmInfo.cpp
@@ -0,0 +1,20 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+using namespace llvm;
+
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, const StringRef &TT) {
+ AlignmentIsInBytes = false;
+ AllowNameToStartWithDigit = true;
+}
diff --git a/lib/Target/MSP430/MSP430MCAsmInfo.h b/lib/Target/MSP430/MSP430MCAsmInfo.h
new file mode 100644
index 0000000..8318029
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MCAsmInfo.h
@@ -0,0 +1,28 @@
+//=====-- MSP430MCAsmInfo.h - MSP430 asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430TARGETASMINFO_H
+#define MSP430TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+ struct MSP430MCAsmInfo : public MCAsmInfo {
+ explicit MSP430MCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index d40bac7..1a5893e 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -45,7 +46,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CalleeSavedRegs;
}
-const TargetRegisterClass* const*
+const TargetRegisterClass *const *
MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
&MSP430::GR16RegClass, &MSP430::GR16RegClass,
@@ -58,8 +59,7 @@ MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
return CalleeSavedRegClasses;
}
-BitVector
-MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
// Mark 4 special registers as reserved.
@@ -75,7 +75,8 @@ MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-const TargetRegisterClass* MSP430RegisterInfo::getPointerRegClass() const {
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const {
return &MSP430::GR16RegClass;
}
@@ -146,9 +147,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void
+unsigned
MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
@@ -186,7 +188,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(i).ChangeToRegister(BasePtr, false);
if (Offset == 0)
- return;
+ return 0;
// We need to materialize the offset via an add instruction.
unsigned DstReg = MI.getOperand(0).getReg();
@@ -197,11 +199,12 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
BuildMI(MBB, next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
.addReg(DstReg).addImm(Offset);
- return;
+ return 0;
}
MI.getOperand(i).ChangeToRegister(BasePtr, false);
MI.getOperand(i+1).ChangeToImmediate(Offset);
+ return 0;
}
void
@@ -291,7 +294,7 @@ void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
switch (RetOpcode) {
case MSP430::RET: break; // These are ok
default:
- assert(0 && "Can only insert epilog into returning blocks");
+ llvm_unreachable("Can only insert epilog into returning blocks");
}
// Get the number of bytes to allocate from the FrameInfo
@@ -310,7 +313,6 @@ void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
NumBytes = StackSize - CSSize;
// Skip the callee-saved pop instructions.
- MachineBasicBlock::iterator LastCSPop = MBBI;
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = prior(MBBI);
unsigned Opc = PI->getOpcode();
@@ -327,7 +329,16 @@ void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
// mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
if (MFI->hasVarSizedObjects()) {
- assert(0 && "Not implemented yet!");
+ BuildMI(MBB, MBBI, DL,
+ TII.get(MSP430::MOV16rr), MSP430::SPW).addReg(MSP430::FPW);
+ if (CSSize) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(MSP430::SUB16ri), MSP430::SPW)
+ .addReg(MSP430::SPW).addImm(CSSize);
+ // The SRW implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
} else {
// adjust stack pointer back: SPW += numbytes
if (NumBytes) {
@@ -349,7 +360,7 @@ unsigned MSP430RegisterInfo::getFrameRegister(MachineFunction &MF) const {
}
int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "Not implemented yet!");
+ llvm_unreachable("Not implemented yet!");
return 0;
}
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index a210e36..5f3a216 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -40,7 +40,7 @@ public:
getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
- const TargetRegisterClass* getPointerRegClass() const;
+ const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
bool hasFP(const MachineFunction &MF) const;
bool hasReservedCallFrame(MachineFunction &MF) const;
@@ -49,8 +49,9 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index ef9e103..1346cb9 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -14,12 +14,10 @@
#include "MSP430Subtarget.h"
#include "MSP430.h"
#include "MSP430GenSubtarget.inc"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-MSP430Subtarget::MSP430Subtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS) {
+MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &FS) {
std::string CPU = "generic";
// Parse features string.
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 96c8108..1070544 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -19,17 +19,14 @@
#include <string>
namespace llvm {
-class Module;
-class TargetMachine;
class MSP430Subtarget : public TargetSubtarget {
bool ExtendedInsts;
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- MSP430Subtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS);
+ MSP430Subtarget(const std::string &TT, const std::string &FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index dd09d43..5e21f8e 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -12,43 +12,30 @@
//===----------------------------------------------------------------------===//
#include "MSP430.h"
-#include "MSP430TargetAsmInfo.h"
+#include "MSP430MCAsmInfo.h"
#include "MSP430TargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetMachineRegistry.h"
-
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-/// MSP430TargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int MSP430TargetMachineModule;
-int MSP430TargetMachineModule = 0;
-
-
-// Register the targets
-static RegisterTarget<MSP430TargetMachine>
-X("msp430", "MSP430 [experimental]");
-
-// Force static initialization.
-extern "C" void LLVMInitializeMSP430Target() { }
+extern "C" void LLVMInitializeMSP430Target() {
+ // Register the target.
+ RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
+ RegisterAsmInfo<MSP430MCAsmInfo> Z(TheMSP430Target);
+}
-MSP430TargetMachine::MSP430TargetMachine(const Module &M,
+MSP430TargetMachine::MSP430TargetMachine(const Target &T,
+ const std::string &TT,
const std::string &FS) :
- Subtarget(*this, M, FS),
+ LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS),
// FIXME: Check TargetData string.
DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"),
InstrInfo(*this), TLInfo(*this),
FrameInfo(TargetFrameInfo::StackGrowsDown, 2, -2) { }
-const TargetAsmInfo *MSP430TargetMachine::createTargetAsmInfo() const {
- return new MSP430TargetAsmInfo(*this);
-}
bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
@@ -57,23 +44,3 @@ bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM,
return false;
}
-bool MSP430TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- PM.add(createMSP430CodePrinterPass(Out, *this, Verbose));
- return false;
-}
-
-unsigned MSP430TargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
-
- // We strongly match msp430
- if (TT.size() >= 6 && TT[0] == 'm' && TT[1] == 's' && TT[2] == 'p' &&
- TT[3] == '4' && TT[4] == '3' && TT[5] == '0')
- return 20;
-
- return 0;
-}
-
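
This file carries the heart of the TargetRegistry migration: the static RegisterTarget object, the Win32 linker-anchor hack, and getModuleMatchQuality all disappear. A target now registers factory functions against its global Target object, and a client selects a target by triple through the registry rather than by scoring TargetMachines against a Module. Roughly what the client side looks like under the new scheme (error handling elided; the triple is only an example):

    #include "llvm/Target/TargetRegistry.h"
    using namespace llvm;

    std::string Err;
    const Target *T = TargetRegistry::lookupTarget("msp430-unknown-unknown", Err);
    if (!T) { /* target not linked in; Err explains why */ }
    // Invokes the factory registered by LLVMInitializeMSP430Target().
    TargetMachine *TM = T->createTargetMachine("msp430-unknown-unknown", "");
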
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index d9ffa2b..d386140 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -37,11 +37,9 @@ class MSP430TargetMachine : public LLVMTargetMachine {
// any MSP430 specific FrameInfo class.
TargetFrameInfo FrameInfo;
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
public:
- MSP430TargetMachine(const Module &M, const std::string &FS);
+ MSP430TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
virtual const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
@@ -57,10 +55,6 @@ public:
}
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel, bool Verbose,
- raw_ostream &Out);
- static unsigned getModuleMatchQuality(const Module &M);
}; // MSP430TargetMachine.
} // end namespace llvm
diff --git a/lib/Target/MSP430/Makefile b/lib/Target/MSP430/Makefile
index 45cb3aa..4b18bc9 100644
--- a/lib/Target/MSP430/Makefile
+++ b/lib/Target/MSP430/Makefile
@@ -7,7 +7,7 @@
#
##===----------------------------------------------------------------------===##
LEVEL = ../../..
-LIBRARYNAME = LLVMMSP430
+LIBRARYNAME = LLVMMSP430CodeGen
TARGET = MSP430
# Make sure that tblgen is run, first thing.
@@ -17,5 +17,7 @@ BUILT_SOURCES = MSP430GenRegisterInfo.h.inc MSP430GenRegisterNames.inc \
MSP430GenDAGISel.inc MSP430GenCallingConv.inc \
MSP430GenSubtarget.inc
+DIRS = AsmPrinter TargetInfo
+
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..1d408d0
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSP430Info
+ MSP430TargetInfo.cpp
+ )
+
+add_dependencies(LLVMMSP430Info MSP430Table_gen)
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 0000000..f9ca5c4
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSP430Target;
+
+extern "C" void LLVMInitializeMSP430TargetInfo() {
+ RegisterTarget<Triple::msp430>
+ X(TheMSP430Target, "msp430", "MSP430 [experimental]");
+}
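
The reason TargetInfo is its own tiny library: the Target object and its name and description can be linked without pulling in the whole code generator, so tools that only enumerate or look up targets can depend on LLVMMSP430Info alone. A sketch of such an enumeration, assuming the registry's iterator interface of this era:

    #include "llvm/Target/TargetRegistry.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void listTargets() {
      // Visits every Target registered by an LLVMInitialize*TargetInfo() call.
      for (TargetRegistry::iterator I = TargetRegistry::begin(),
             E = TargetRegistry::end(); I != E; ++I)
        errs() << I->getName() << " - " << I->getShortDescription() << "\n";
    }
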
diff --git a/lib/Target/MSP430/TargetInfo/Makefile b/lib/Target/MSP430/TargetInfo/Makefile
new file mode 100644
index 0000000..abb08f2
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSP430/TargetInfo/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSP430Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/AsmPrinter/CMakeLists.txt b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
index 197cc29..56c68a6 100644
--- a/lib/Target/Mips/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
@@ -4,6 +4,6 @@ include_directories(
)
add_llvm_library(LLVMMipsAsmPrinter
- MipsAsmPrinter.cpp
+ MipsAsmPrinter.cpp
)
add_dependencies(LLVMMipsAsmPrinter MipsCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
index cb40479..ccf9ee5 100644
--- a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
@@ -22,24 +22,28 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Mangler.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include <cctype>
using namespace llvm;
@@ -50,8 +54,8 @@ namespace {
class VISIBILITY_HIDDEN MipsAsmPrinter : public AsmPrinter {
const MipsSubtarget *Subtarget;
public:
- explicit MipsAsmPrinter(raw_ostream &O, MipsTargetMachine &TM,
- const TargetAsmInfo *T, bool V)
+ explicit MipsAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
: AsmPrinter(O, TM, T, V) {
Subtarget = &TM.getSubtarget<MipsSubtarget>();
}
@@ -68,34 +72,25 @@ namespace {
const char *Modifier = 0);
void printFCCOperand(const MachineInstr *MI, int opNum,
const char *Modifier = 0);
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void PrintGlobalVariable(const GlobalVariable *GVar);
void printSavedRegsBitmask(MachineFunction &MF);
void printHex32(unsigned int Value);
- const char *emitCurrentABIString(void);
+ const char *emitCurrentABIString();
void emitFunctionStart(MachineFunction &MF);
void emitFunctionEnd(MachineFunction &MF);
void emitFrameDirective(MachineFunction &MF);
- bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
+ void EmitStartOfAsmFile(Module &M);
};
} // end of anonymous namespace
#include "MipsGenAsmWriter.inc"
-/// createMipsCodePrinterPass - Returns a pass that prints the MIPS
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description. This should work
-/// regardless of whether the function is in SSA form.
-FunctionPass *llvm::createMipsCodePrinterPass(raw_ostream &o,
- MipsTargetMachine &tm,
- bool verbose) {
- return new MipsAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
//===----------------------------------------------------------------------===//
//
// Mips Asm Directives
@@ -186,9 +181,7 @@ printHex32(unsigned int Value)
//===----------------------------------------------------------------------===//
/// Frame Directive
-void MipsAsmPrinter::
-emitFrameDirective(MachineFunction &MF)
-{
+void MipsAsmPrinter::emitFrameDirective(MachineFunction &MF) {
const TargetRegisterInfo &RI = *TM.getRegisterInfo();
unsigned stackReg = RI.getFrameRegister(MF);
@@ -196,16 +189,14 @@ emitFrameDirective(MachineFunction &MF)
unsigned stackSize = MF.getFrameInfo()->getStackSize();
- O << "\t.frame\t" << '$' << LowercaseString(RI.get(stackReg).AsmName)
+ O << "\t.frame\t" << '$' << LowercaseString(getRegisterName(stackReg))
<< ',' << stackSize << ','
- << '$' << LowercaseString(RI.get(returnReg).AsmName)
+ << '$' << LowercaseString(getRegisterName(returnReg))
<< '\n';
}
/// Emit Set directives.
-const char * MipsAsmPrinter::
-emitCurrentABIString(void)
-{
+const char *MipsAsmPrinter::emitCurrentABIString() {
switch(Subtarget->getTargetABI()) {
case MipsSubtarget::O32: return "abi32";
case MipsSubtarget::O64: return "abiO64";
@@ -215,17 +206,15 @@ emitCurrentABIString(void)
default: break;
}
- assert(0 && "Unknown Mips ABI");
+ llvm_unreachable("Unknown Mips ABI");
return NULL;
}
/// Emit the directives used by GAS on the start of functions
-void MipsAsmPrinter::
-emitFunctionStart(MachineFunction &MF)
-{
+void MipsAsmPrinter::emitFunctionStart(MachineFunction &MF) {
// Print out the label for the function.
const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
// 2 bits aligned
EmitAlignment(MF.getAlignment(), F);
@@ -235,7 +224,7 @@ emitFunctionStart(MachineFunction &MF)
printVisibility(CurrentFnName, F->getVisibility());
- if ((TAI->hasDotTypeDotSizeDirective()) && Subtarget->isLinux())
+ if ((MAI->hasDotTypeDotSizeDirective()) && Subtarget->isLinux())
O << "\t.type\t" << CurrentFnName << ", @function\n";
O << CurrentFnName << ":\n";
@@ -247,9 +236,7 @@ emitFunctionStart(MachineFunction &MF)
}
/// Emit the directives used by GAS on the end of functions
-void MipsAsmPrinter::
-emitFunctionEnd(MachineFunction &MF)
-{
+void MipsAsmPrinter::emitFunctionEnd(MachineFunction &MF) {
  // There are instructions for these macros, but they must
  // always be at the function end, and we can't emit them
  // earlier without breaking the BB logic.
@@ -257,15 +244,13 @@ emitFunctionEnd(MachineFunction &MF)
O << "\t.set\treorder\n";
O << "\t.end\t" << CurrentFnName << '\n';
- if (TAI->hasDotTypeDotSizeDirective() && !Subtarget->isLinux())
+ if (MAI->hasDotTypeDotSizeDirective() && !Subtarget->isLinux())
O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
}
/// runOnMachineFunction - This uses the printMachineInstruction()
/// method to print assembly for each instruction.
-bool MipsAsmPrinter::
-runOnMachineFunction(MachineFunction &MF)
-{
+bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
SetupMachineFunction(MF);
@@ -287,14 +272,21 @@ runOnMachineFunction(MachineFunction &MF)
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
+ processDebugLoc(II, true);
+
// Print the assembly for the instruction.
printInstruction(II);
+
+ if (VerboseAsm && !II->getDebugLoc().isUnknown())
+ EmitComments(*II);
+ O << '\n';
+
+ processDebugLoc(II, false);
++EmittedInsts;
}
@@ -310,10 +302,8 @@ runOnMachineFunction(MachineFunction &MF)
}
// Print out an operand for an inline asm expression.
-bool MipsAsmPrinter::
-PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode)
-{
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0])
return true; // Unknown modifier.
@@ -322,57 +312,33 @@ PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
-void MipsAsmPrinter::
-printOperand(const MachineInstr *MI, int opNum)
-{
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
const MachineOperand &MO = MI->getOperand(opNum);
- const TargetRegisterInfo &RI = *TM.getRegisterInfo();
bool closeP = false;
- bool isPIC = (TM.getRelocationModel() == Reloc::PIC_);
- bool isCodeLarge = (TM.getCodeModel() == CodeModel::Large);
-
- // %hi and %lo used on mips gas to load global addresses on
- // static code. %got is used to load global addresses when
- // using PIC_. %call16 is used to load direct call targets
- // on PIC_ and small code size. %call_lo and %call_hi load
- // direct call targets on PIC_ and large code size.
- if (MI->getOpcode() == Mips::LUi && !MO.isReg() && !MO.isImm()) {
- if ((isPIC) && (isCodeLarge))
- O << "%call_hi(";
- else
- O << "%hi(";
+
+ if (MO.getTargetFlags())
closeP = true;
- } else if ((MI->getOpcode() == Mips::ADDiu) && !MO.isReg() && !MO.isImm()) {
- const MachineOperand &firstMO = MI->getOperand(opNum-1);
- if (firstMO.getReg() == Mips::GP)
- O << "%gp_rel(";
+
+ switch(MO.getTargetFlags()) {
+ case MipsII::MO_GPREL: O << "%gp_rel("; break;
+ case MipsII::MO_GOT_CALL: O << "%call16("; break;
+ case MipsII::MO_GOT:
+ if (MI->getOpcode() == Mips::LW)
+ O << "%got(";
else
O << "%lo(";
- closeP = true;
- } else if ((isPIC) && (MI->getOpcode() == Mips::LW) &&
- (!MO.isReg()) && (!MO.isImm())) {
- const MachineOperand &firstMO = MI->getOperand(opNum-1);
- const MachineOperand &lastMO = MI->getOperand(opNum+1);
- if ((firstMO.isReg()) && (lastMO.isReg())) {
- if ((firstMO.getReg() == Mips::T9) && (lastMO.getReg() == Mips::GP)
- && (!isCodeLarge))
- O << "%call16(";
- else if ((firstMO.getReg() != Mips::T9) && (lastMO.getReg() == Mips::GP))
- O << "%got(";
- else if ((firstMO.getReg() == Mips::T9) && (lastMO.getReg() != Mips::GP)
- && (isCodeLarge))
- O << "%call_lo(";
- closeP = true;
- }
+ break;
+ case MipsII::MO_ABS_HILO:
+ if (MI->getOpcode() == Mips::LUi)
+ O << "%hi(";
+ else
+ O << "%lo(";
+ break;
}
-
- switch (MO.getType())
- {
+
+ switch (MO.getType()) {
case MachineOperand::MO_Register:
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- O << '$' << LowercaseString (RI.get(MO.getReg()).AsmName);
- else
- O << '$' << MO.getReg();
+ O << '$' << LowercaseString(getRegisterName(MO.getReg()));
break;
case MachineOperand::MO_Immediate:
@@ -380,14 +346,11 @@ printOperand(const MachineInstr *MI, int opNum)
break;
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress:
- {
- const GlobalValue *GV = MO.getGlobal();
- O << Mang->getValueName(GV);
- }
+ O << Mang->getMangledName(MO.getGlobal());
break;
case MachineOperand::MO_ExternalSymbol:
@@ -395,25 +358,23 @@ printOperand(const MachineInstr *MI, int opNum)
break;
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI"
+ O << MAI->getPrivateGlobalPrefix() << "CPI"
<< getFunctionNumber() << "_" << MO.getIndex();
break;
default:
- O << "<unknown operand type>"; abort (); break;
+ llvm_unreachable("<unknown operand type>");
}
if (closeP) O << ")";
}
-void MipsAsmPrinter::
-printUnsignedImm(const MachineInstr *MI, int opNum)
-{
+void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum) {
const MachineOperand &MO = MI->getOperand(opNum);
if (MO.getType() == MachineOperand::MO_Immediate)
O << (unsigned short int)MO.getImm();
@@ -422,8 +383,7 @@ printUnsignedImm(const MachineInstr *MI, int opNum)
}
void MipsAsmPrinter::
-printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier)
-{
+printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier) {
  // When using stack locations for non-load/store instructions,
  // print them the same way as all normal 3-operand instructions.
if (Modifier && !strcmp(Modifier, "stackloc")) {
@@ -443,17 +403,14 @@ printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier)
}
void MipsAsmPrinter::
-printFCCOperand(const MachineInstr *MI, int opNum, const char *Modifier)
-{
+printFCCOperand(const MachineInstr *MI, int opNum, const char *Modifier) {
const MachineOperand& MO = MI->getOperand(opNum);
O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
}
-bool MipsAsmPrinter::
-doInitialization(Module &M)
-{
- Mang = new Mangler(M, "", TAI->getPrivateGlobalPrefix());
-
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ // FIXME: Use SwitchSection.
+
// Tell the assembler which ABI we are using
O << "\t.section .mdebug." << emitCurrentABIString() << '\n';
@@ -464,12 +421,9 @@ doInitialization(Module &M)
// return to previous section
O << "\t.previous" << '\n';
-
- return false; // success
}
-void MipsAsmPrinter::
-printModuleLevelGV(const GlobalVariable* GVar) {
+void MipsAsmPrinter::PrintGlobalVariable(const GlobalVariable *GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer())
@@ -480,10 +434,8 @@ printModuleLevelGV(const GlobalVariable* GVar) {
return;
O << "\n\n";
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
const Type *CTy = C->getType();
unsigned Size = TD->getTypeAllocSize(CTy);
const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
@@ -503,7 +455,8 @@ printModuleLevelGV(const GlobalVariable* GVar) {
printVisibility(name, GVar->getVisibility());
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
if (C->isNullValue() && !GVar->hasSection()) {
if (!GVar->isThreadLocal() &&
@@ -513,8 +466,8 @@ printModuleLevelGV(const GlobalVariable* GVar) {
if (GVar->hasLocalLinkage())
O << "\t.local\t" << name << '\n';
- O << TAI->getCOMMDirective() << name << ',' << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
+ O << MAI->getCOMMDirective() << name << ',' << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
O << ',' << (1 << Align);
O << '\n';
@@ -536,29 +489,27 @@ printModuleLevelGV(const GlobalVariable* GVar) {
// or something. For now, just emit them as external.
case GlobalValue::ExternalLinkage:
// If external or appending, declare as a global symbol
- O << TAI->getGlobalDirective() << name << '\n';
+ O << MAI->getGlobalDirective() << name << '\n';
// Fall Through
case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
case GlobalValue::InternalLinkage:
if (CVA && CVA->isCString())
printSizeAndType = false;
break;
case GlobalValue::GhostLinkage:
- cerr << "Should not have any unmaterialized functions!\n";
- abort();
+ llvm_unreachable("Should not have any unmaterialized functions!");
case GlobalValue::DLLImportLinkage:
- cerr << "DLLImport linkage is not supported by this target!\n";
- abort();
+ llvm_unreachable("DLLImport linkage is not supported by this target!");
case GlobalValue::DLLExportLinkage:
- cerr << "DLLExport linkage is not supported by this target!\n";
- abort();
+ llvm_unreachable("DLLExport linkage is not supported by this target!");
default:
- assert(0 && "Unknown linkage type!");
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
- if (TAI->hasDotTypeDotSizeDirective() && printSizeAndType) {
+ if (MAI->hasDotTypeDotSizeDirective() && printSizeAndType) {
O << "\t.type " << name << ",@object\n";
O << "\t.size " << name << ',' << Size << '\n';
}
@@ -567,26 +518,9 @@ printModuleLevelGV(const GlobalVariable* GVar) {
EmitGlobalConstant(C);
}
-bool MipsAsmPrinter::
-doFinalization(Module &M)
-{
- // Print out module-level global variables here.
- for (Module::const_global_iterator I = M.global_begin(),
- E = M.global_end(); I != E; ++I)
- printModuleLevelGV(I);
-
- O << '\n';
-
- return AsmPrinter::doFinalization(M);
-}
-
-namespace {
- static struct Register {
- Register() {
- MipsTargetMachine::registerAsmPrinter(createMipsCodePrinterPass);
- }
- } Registrator;
-}
// Force static initialization.
-extern "C" void LLVMInitializeMipsAsmPrinter() { }
+extern "C" void LLVMInitializeMipsAsmPrinter() {
+ RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
+ RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
+}
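
Two independent cleanups land in this file. The printer is registered per-Target through RegisterAsmPrinter, and printOperand stops reverse-engineering the relocation kind from the opcode and the neighboring operands: instruction selection now attaches a MipsII::MO_* target flag to the operand when it is created, and the printer simply switches on it. A condensed sketch of the flag-driven half (the helper name is hypothetical; the flags are those in the hunk above):

    // Hypothetical helper: prints the "%hi(" / "%got(" / ... prefix for an
    // operand, driven purely by the flag isel attached when building it.
    static void printRelocPrefix(const MachineInstr *MI,
                                 const MachineOperand &MO, raw_ostream &O) {
      switch (MO.getTargetFlags()) {
      case MipsII::MO_GPREL:    O << "%gp_rel("; break;
      case MipsII::MO_GOT_CALL: O << "%call16("; break;
      case MipsII::MO_GOT:
        O << (MI->getOpcode() == Mips::LW ? "%got(" : "%lo(");
        break;
      case MipsII::MO_ABS_HILO:
        O << (MI->getOpcode() == Mips::LUi ? "%hi(" : "%lo(");
        break;
      }
    }
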
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index d27e6f1..0e3bf5a 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -15,10 +15,11 @@ add_llvm_target(MipsCodeGen
MipsInstrInfo.cpp
MipsISelDAGToDAG.cpp
MipsISelLowering.cpp
+ MipsMCAsmInfo.cpp
MipsRegisterInfo.cpp
MipsSubtarget.cpp
- MipsTargetAsmInfo.cpp
MipsTargetMachine.cpp
+ MipsTargetObjectFile.cpp
)
target_link_libraries (LLVMMipsCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
index 48ab5f9..0780345 100644
--- a/lib/Target/Mips/Makefile
+++ b/lib/Target/Mips/Makefile
@@ -17,7 +17,7 @@ BUILT_SOURCES = MipsGenRegisterInfo.h.inc MipsGenRegisterNames.inc \
MipsGenDAGISel.inc MipsGenCallingConv.inc \
MipsGenSubtarget.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 9b22a91..a9ab050 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -21,13 +21,14 @@ namespace llvm {
class MipsTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
- class raw_ostream;
+ class formatted_raw_ostream;
FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
- FunctionPass *createMipsCodePrinterPass(raw_ostream &OS,
- MipsTargetMachine &TM,
- bool Verbose);
+
+ extern Target TheMipsTarget;
+ extern Target TheMipselTarget;
+
} // end namespace llvm;
// Defines symbolic names for Mips registers. This defines a mapping from
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 53de1bb..cc20dd7 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -32,6 +32,8 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -106,22 +108,16 @@ private:
/// InstructionSelect - This callback is invoked by
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
-void MipsDAGToDAGISel::
-InstructionSelect()
-{
+void MipsDAGToDAGISel::InstructionSelect() {
DEBUG(BB->dump());
// Codegen the basic block.
- #ifndef NDEBUG
- DOUT << "===== Instruction selection begins:\n";
- Indent = 0;
- #endif
+ DEBUG(errs() << "===== Instruction selection begins:\n");
+ DEBUG(Indent = 0);
// Select target instructions for the DAG.
SelectRoot(*CurDAG);
- #ifndef NDEBUG
- DOUT << "===== Instruction selection ends:\n";
- #endif
+ DEBUG(errs() << "===== Instruction selection ends:\n");
CurDAG->RemoveDeadNodes();
}
@@ -129,7 +125,6 @@ InstructionSelect()
/// getGlobalBaseReg - Output the instructions required to put the
/// GOT address into a register.
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
- MachineFunction *MF = BB->getParent();
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
}
@@ -186,29 +181,23 @@ SelectAddr(SDValue Op, SDValue Addr, SDValue &Offset, SDValue &Base)
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
-SDNode* MipsDAGToDAGISel::
-Select(SDValue N)
-{
+SDNode* MipsDAGToDAGISel::Select(SDValue N) {
SDNode *Node = N.getNode();
unsigned Opcode = Node->getOpcode();
DebugLoc dl = Node->getDebugLoc();
// Dump information about the Node being selected
- #ifndef NDEBUG
- DOUT << std::string(Indent, ' ') << "Selecting: ";
- DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
- Indent += 2;
- #endif
+ DEBUG(errs().indent(Indent) << "Selecting: ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ DEBUG(Indent += 2);
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
- #ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "== ";
- DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
- Indent -= 2;
- #endif
+ DEBUG(errs().indent(Indent-2) << "== ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ DEBUG(Indent -= 2);
return NULL;
}
@@ -242,10 +231,10 @@ Select(SDValue N)
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
- MVT VT = LHS.getValueType();
- SDNode *Carry = CurDAG->getTargetNode(Mips::SLTu, dl, VT, Ops, 2);
- SDNode *AddCarry = CurDAG->getTargetNode(Mips::ADDu, dl, VT,
- SDValue(Carry,0), RHS);
+ EVT VT = LHS.getValueType();
+ SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, dl, VT, Ops, 2);
+ SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT,
+ SDValue(Carry,0), RHS);
return CurDAG->SelectNodeTo(N.getNode(), MOp, VT, MVT::Flag,
LHS, SDValue(AddCarry,0));
@@ -265,13 +254,13 @@ Select(SDValue N)
else
Op = (Opcode == ISD::UDIVREM ? Mips::DIVu : Mips::DIV);
- SDNode *Node = CurDAG->getTargetNode(Op, dl, MVT::Flag, Op1, Op2);
+ SDNode *Node = CurDAG->getMachineNode(Op, dl, MVT::Flag, Op1, Op2);
SDValue InFlag = SDValue(Node, 0);
- SDNode *Lo = CurDAG->getTargetNode(Mips::MFLO, dl, MVT::i32,
- MVT::Flag, InFlag);
+ SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32,
+ MVT::Flag, InFlag);
InFlag = SDValue(Lo,1);
- SDNode *Hi = CurDAG->getTargetNode(Mips::MFHI, dl, MVT::i32, InFlag);
+ SDNode *Hi = CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
if (!N.getValue(0).use_empty())
ReplaceUses(N.getValue(0), SDValue(Lo,0));
@@ -290,15 +279,15 @@ Select(SDValue N)
SDValue MulOp2 = Node->getOperand(1);
unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
- SDNode *MulNode = CurDAG->getTargetNode(MulOp, dl,
- MVT::Flag, MulOp1, MulOp2);
+ SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl,
+ MVT::Flag, MulOp1, MulOp2);
SDValue InFlag = SDValue(MulNode, 0);
if (MulOp == ISD::MUL)
- return CurDAG->getTargetNode(Mips::MFLO, dl, MVT::i32, InFlag);
+ return CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, InFlag);
else
- return CurDAG->getTargetNode(Mips::MFHI, dl, MVT::i32, InFlag);
+ return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
}
/// Div/Rem operations
@@ -317,10 +306,10 @@ Select(SDValue N)
Op = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu);
MOp = Mips::MFHI;
}
- SDNode *Node = CurDAG->getTargetNode(Op, dl, MVT::Flag, Op1, Op2);
+ SDNode *Node = CurDAG->getMachineNode(Op, dl, MVT::Flag, Op1, Op2);
SDValue InFlag = SDValue(Node, 0);
- return CurDAG->getTargetNode(MOp, dl, MVT::i32, InFlag);
+ return CurDAG->getMachineNode(MOp, dl, MVT::i32, InFlag);
}
// Get target GOT address.
@@ -333,7 +322,6 @@ Select(SDValue N)
/// be loaded with 3 instructions.
case MipsISD::JmpLink: {
if (TM.getRelocationModel() == Reloc::PIC_) {
- //bool isCodeLarge = (TM.getCodeModel() == CodeModel::Large);
SDValue Chain = Node->getOperand(0);
SDValue Callee = Node->getOperand(1);
SDValue T9Reg = CurDAG->getRegister(Mips::T9, MVT::i32);
@@ -347,7 +335,7 @@ Select(SDValue N)
// Use load to get GOT target
SDValue Ops[] = { Callee, GPReg, Chain };
- SDValue Load = SDValue(CurDAG->getTargetNode(Mips::LW, dl, MVT::i32,
+ SDValue Load = SDValue(CurDAG->getMachineNode(Mips::LW, dl, MVT::i32,
MVT::Other, Ops, 3), 0);
Chain = Load.getValue(1);
@@ -358,7 +346,7 @@ Select(SDValue N)
Chain = CurDAG->getCopyToReg(Chain, dl, T9Reg, Callee, InFlag);
// Emit Jump and Link Register
- SDNode *ResNode = CurDAG->getTargetNode(Mips::JALR, dl, MVT::Other,
+ SDNode *ResNode = CurDAG->getMachineNode(Mips::JALR, dl, MVT::Other,
MVT::Flag, T9Reg, Chain);
Chain = SDValue(ResNode, 0);
InFlag = SDValue(ResNode, 1);
@@ -372,15 +360,13 @@ Select(SDValue N)
// Select the default instruction
SDNode *ResNode = SelectCode(N);
- #ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(errs().indent(Indent-2) << "=> ");
if (ResNode == NULL || ResNode == N.getNode())
DEBUG(N.getNode()->dump(CurDAG));
else
DEBUG(ResNode->dump(CurDAG));
- DOUT << "\n";
- Indent -= 2;
- #endif
+ DEBUG(errs() << "\n");
+ DEBUG(Indent -= 2);
return ResNode;
}
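
Beyond the getTargetNode to getMachineNode rename and the MVT to EVT split, the tracing here moves from DOUT wrapped in #ifndef NDEBUG blocks to the DEBUG macro over errs(): the macro compiles away in release builds and is gated at runtime by -debug / -debug-only. A minimal sketch of the idiom (the DEBUG_TYPE value is assumed; by convention it is defined before the includes):

    #define DEBUG_TYPE "mips-isel" // assumed; names the -debug-only channel
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void traceSelection(unsigned Indent, const char *Phase) {
      // The whole statement vanishes under NDEBUG; otherwise it prints only
      // when -debug (or -debug-only=mips-isel) is passed.
      DEBUG(errs().indent(Indent) << Phase << "\n");
    }
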
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 3d2e2b7..ab8790a 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -13,10 +13,10 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mips-lower"
-
#include "MipsISelLowering.h"
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
#include "MipsSubtarget.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
@@ -31,13 +31,11 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-const char *MipsTargetLowering::
-getTargetNodeName(unsigned Opcode) const
-{
- switch (Opcode)
- {
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
case MipsISD::JmpLink : return "MipsISD::JmpLink";
case MipsISD::Hi : return "MipsISD::Hi";
case MipsISD::Lo : return "MipsISD::Lo";
@@ -54,8 +52,8 @@ getTargetNodeName(unsigned Opcode) const
}
MipsTargetLowering::
-MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
-{
+MipsTargetLowering(MipsTargetMachine &TM)
+ : TargetLowering(TM, new MipsTargetObjectFile()) {
Subtarget = &TM.getSubtarget<MipsSubtarget>();
// Mips does not have i1 type, so use i32 for
@@ -82,6 +80,10 @@ MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ // MIPS doesn't have extending float->double load/store
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
// Used by legalize types to correctly generate the setcc result.
  // Without this, every float setcc comes with an AND/OR with the result;
  // we don't want this, since the fpcmp result goes to a flag register,
@@ -91,7 +93,6 @@ MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
// Mips Custom Operations
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
- setOperationAction(ISD::RET, MVT::Other, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
@@ -119,11 +120,20 @@ MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+ setOperationAction(ISD::FEXP, MVT::f32, Expand);
// We don't have line number support yet.
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
@@ -154,7 +164,7 @@ MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
computeRegisterProperties();
}
-MVT MipsTargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i32;
}
@@ -170,16 +180,13 @@ LowerOperation(SDValue Op, SelectionDAG &DAG)
{
case ISD::AND: return LowerANDOR(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::OR: return LowerANDOR(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
}
@@ -202,37 +209,6 @@ AddLiveIn(MachineFunction &MF, unsigned PReg, TargetRegisterClass *RC)
return VReg;
}
-// A address must be loaded from a small section if its size is less than the
-// small section size threshold. Data in this section must be addressed using
-// gp_rel operator.
-bool MipsTargetLowering::IsInSmallSection(unsigned Size) {
- return (Size > 0 && (Size <= Subtarget->getSSectionThreshold()));
-}
-
-// Discover if this global address can be placed into small data/bss section.
-bool MipsTargetLowering::IsGlobalInSmallSection(GlobalValue *GV)
-{
- const TargetData *TD = getTargetData();
- const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
-
- if (!GVA)
- return false;
-
- const Type *Ty = GV->getType()->getElementType();
- unsigned Size = TD->getTypeAllocSize(Ty);
-
- // if this is a internal constant string, there is a special
- // section for it, but not in small data/bss.
- if (GVA->hasInitializer() && GV->hasLocalLinkage()) {
- Constant *C = GVA->getInitializer();
- const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
- if (CVA && CVA->isCString())
- return false;
- }
-
- return IsInSmallSection(Size);
-}
-
// Get fp branch code (not opcode) from condition code.
static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
@@ -247,7 +223,7 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) {
switch(BC) {
default:
- assert(0 && "Unknown branch code");
+ llvm_unreachable("Unknown branch code");
case Mips::BRANCH_T : return Mips::BC1T;
case Mips::BRANCH_F : return Mips::BC1F;
case Mips::BRANCH_TL : return Mips::BC1TL;
@@ -257,7 +233,7 @@ static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) {
static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown fp condition code!");
+ default: llvm_unreachable("Unknown fp condition code!");
case ISD::SETEQ:
case ISD::SETOEQ: return Mips::FCOND_EQ;
case ISD::SETUNE: return Mips::FCOND_OGL;
@@ -283,7 +259,8 @@ static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
MachineBasicBlock *
MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
bool isFPCmp = false;
DebugLoc dl = MI->getDebugLoc();
@@ -331,9 +308,12 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, sinkMBB);
// Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
+ // Also inform sdisel of the edge changes.
for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
- e = BB->succ_end(); i != e; ++i)
+ e = BB->succ_end(); i != e; ++i) {
+ EM->insert(std::make_pair(*i, sinkMBB));
sinkMBB->addSuccessor(*i);
+ }
// Next, remove all successors of the current block, and add the true
// and fallthrough blocks as its successors.
while(!BB->succ_empty())
@@ -508,29 +488,34 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG)
Cond, True, False, CCNode);
}
-SDValue MipsTargetLowering::
-LowerGlobalAddress(SDValue Op, SelectionDAG &DAG)
-{
+SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
// FIXME there isn't actually debug info here
DebugLoc dl = Op.getDebugLoc();
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
- if (!Subtarget->hasABICall()) {
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
SDVTList VTs = DAG.getVTList(MVT::i32);
- SDValue Ops[] = { GA };
+
+ MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
+
// %gp_rel relocation
- if (!isa<Function>(GV) && IsGlobalInSmallSection(GV)) {
- SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, Ops, 1);
+ if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+ MipsII::MO_GPREL);
+ SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1);
SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
}
// %hi/%lo relocation
- SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, Ops, 1);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+ MipsII::MO_ABS_HILO);
+ SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GA, 1);
SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA);
return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
- } else { // Abicall relocations, TODO: make this cleaner.
+ } else {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+ MipsII::MO_GOT);
SDValue ResNode = DAG.getLoad(MVT::i32, dl,
DAG.getEntryNode(), GA, NULL, 0);
// On functions and global targets not internal linked only
@@ -541,14 +526,14 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG)
return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
}
- assert(0 && "Dont know how to handle GlobalAddress");
+ llvm_unreachable("Dont know how to handle GlobalAddress");
return SDValue(0,0);
}
SDValue MipsTargetLowering::
LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
{
- assert(0 && "TLS not implemented for MIPS.");
+ llvm_unreachable("TLS not implemented for MIPS.");
return SDValue(); // Not reached
}
@@ -559,15 +544,17 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG)
SDValue HiPart;
// FIXME there isn't actually debug info here
DebugLoc dl = Op.getDebugLoc();
+ bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HILO;
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
- SDVTList VTs = DAG.getVTList(MVT::i32);
+ SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+
+ if (IsPIC) {
SDValue Ops[] = { JTI };
- HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, Ops, 1);
+ HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1);
} else // Emit Load from Global Pointer
HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI, NULL, 0);
@@ -583,7 +570,8 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG)
SDValue ResNode;
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
Constant *C = N->getConstVal();
- SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
+ SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+ MipsII::MO_ABS_HILO);
// FIXME there isn't actually debug info here
DebugLoc dl = Op.getDebugLoc();
@@ -592,8 +580,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG)
  // but the asm printer currently doesn't support this feature without
// hacking it. This feature should come soon so we can uncomment the
// stuff below.
- //if (!Subtarget->hasABICall() &&
- // IsInSmallSection(getTargetData()->getTypeAllocSize(C->getType()))) {
+ //if (IsInSmallSection(C->getType())) {
// SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
// SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
// ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
@@ -608,13 +595,6 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG)
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
-//
-// The lower operations present on calling convention works on this order:
-// LowerCALL (virt regs --> phys regs, virt regs --> stack)
-// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
-// LowerRET (virt regs --> phys regs)
-// LowerCALL (phys regs --> virt regs)
-//
//===----------------------------------------------------------------------===//
#include "MipsGenCallingConv.inc"
@@ -632,8 +612,8 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG)
// go to stack.
//===----------------------------------------------------------------------===//
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
+static bool CC_MipsO32(unsigned ValNo, EVT ValVT,
+ EVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
static const unsigned IntRegsSize=4, FloatRegsSize=2;
@@ -699,38 +679,38 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
}
//===----------------------------------------------------------------------===//
-// CALL Calling Convention Implementation
+// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
-/// LowerCALL - functions arguments are copied from virtual regs to
+/// LowerCall - function arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
/// TODO: isVarArg, isTailCall.
-SDValue MipsTargetLowering::
-LowerCALL(SDValue Op, SelectionDAG &DAG)
-{
- MachineFunction &MF = DAG.getMachineFunction();
-
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- bool isVarArg = TheCall->isVarArg();
- unsigned CC = TheCall->getCallingConv();
- DebugLoc dl = TheCall->getDebugLoc();
+SDValue
+MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
// To meet O32 ABI, Mips must always allocate 16 bytes on
// the stack (even if less than 4 are used as arguments)
if (Subtarget->isABI_O32()) {
- int VTsize = MVT(MVT::i32).getSizeInBits()/8;
+ int VTsize = EVT(MVT::i32).getSizeInBits()/8;
MFI->CreateFixedObject(VTsize, (VTsize*3));
- CCInfo.AnalyzeCallOperands(TheCall, CC_MipsO32);
+ CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
} else
- CCInfo.AnalyzeCallOperands(TheCall, CC_Mips);
+ CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -747,12 +727,12 @@ LowerCALL(SDValue Op, SelectionDAG &DAG)
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- SDValue Arg = TheCall->getArg(i);
+ SDValue Arg = Outs[i].Val;
CCValAssign &VA = ArgLocs[i];
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
if (Subtarget->isABI_O32() && VA.isRegLoc()) {
if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
@@ -825,10 +805,13 @@ LowerCALL(SDValue Op, SelectionDAG &DAG)
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
+ unsigned char OpFlag = IsPIC ? MipsII::MO_GOT_CALL : MipsII::MO_NO_FLAG;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(),
+ getPointerTy(), 0, OpFlag);
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(),
+ getPointerTy(), OpFlag);
// MipsJmpLink = #chain, #target_address, #opt_in_flags...
// = Chain, Callee, Reg#1, Reg#2, ...
@@ -859,7 +842,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG)
// Create a stack location to hold GP when PIC is used. This stack
  // location is used in the function prologue to save GP and also after all
  // emitted CALLs to restore GP.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ if (IsPIC) {
// Function can have an arbitrary number of calls, so
// hold the LastArgStackLoc with the biggest offset.
int FI;
@@ -887,75 +870,69 @@ LowerCALL(SDValue Op, SelectionDAG &DAG)
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG), Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
}
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. Returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode *MipsTargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG) {
-
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = TheCall->getDebugLoc();
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- CCInfo.AnalyzeCallResult(TheCall, RetCC_Mips);
- SmallVector<SDValue, 8> ResultVals;
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- RVLocs[i].getValVT(), InFlag).getValue(1);
+ RVLocs[i].getValVT(), InFlag).getValue(1);
InFlag = Chain.getValue(2);
- ResultVals.push_back(Chain.getValue(0));
+ InVals.push_back(Chain.getValue(0));
}
-
- ResultVals.push_back(Chain);
- // Merge everything together with a MERGE_VALUES node.
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
//===----------------------------------------------------------------------===//
-// FORMAL_ARGUMENTS Calling Convention Implementation
+// Formal Arguments Calling Convention Implementation
//===----------------------------------------------------------------------===//
-/// LowerFORMAL_ARGUMENTS - transform physical registers into
+/// LowerFormalArguments - transform physical registers into
/// virtual registers and generate load operations for
/// arguments placed on the stack.
/// TODO: isVarArg
-SDValue MipsTargetLowering::
-LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
-{
- SDValue Root = Op.getOperand(0);
+SDValue
+MipsTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
- DebugLoc dl = Op.getDebugLoc();
-
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
if (Subtarget->isABI_O32())
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_MipsO32);
+ CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
else
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_Mips);
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
- SmallVector<SDValue, 16> ArgValues;
SDValue StackPtr;
unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
@@ -965,7 +942,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
// Arguments stored on registers
if (VA.isRegLoc()) {
- MVT RegVT = VA.getLocVT();
+ EVT RegVT = VA.getLocVT();
TargetRegisterClass *RC = 0;
if (RegVT == MVT::i32)
@@ -976,12 +953,12 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
if (!Subtarget->isSingleFloat())
RC = Mips::AFGR64RegisterClass;
} else
- assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
+ llvm_unreachable("RegVT not supported by LowerFormalArguments Lowering");
// Transform the arguments stored on
// physical registers into virtual ones
unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this is an 8 or 16-bit value, it has been passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
@@ -1005,14 +982,14 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
if (RegVT == MVT::i32 && VA.getValVT() == MVT::f64) {
unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
VA.getLocReg()+1, RC);
- SDValue ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg2, RegVT);
+ SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
SDValue Hi = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue);
SDValue Lo = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue2);
ArgValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::f64, Lo, Hi);
}
}
- ArgValues.push_back(ArgValue);
+ InVals.push_back(ArgValue);
// To meet ABI, when VARARGS are passed on registers, the registers
// must have their values written to the caller stack frame.
@@ -1034,7 +1011,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
  // emit an ISD::STORE which stores the
  // parameter value to a stack location
- ArgValues.push_back(DAG.getStore(Root, dl, ArgValue, PtrOff, NULL, 0));
+ InVals.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, NULL, 0));
}
} else { // VA.isRegLoc()
@@ -1057,7 +1034,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
// Create load nodes to retrieve arguments from the stack
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
}
}
@@ -1070,36 +1047,33 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
MipsFI->setSRetReturnReg(Reg);
}
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-SDValue MipsTargetLowering::
-LowerRET(SDValue Op, SelectionDAG &DAG)
-{
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- // Analize return values of ISD::RET
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_Mips);
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
// If this is the first return lowered for this function, add
// the regs to the liveout set for the function.
@@ -1109,8 +1083,6 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- // The chain is always operand #0
- SDValue Chain = Op.getOperand(0);
SDValue Flag;
// Copy the result values into the output registers.
@@ -1118,10 +1090,8 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
// guarantee that all emitted copies are
// stuck together, avoiding something bad
@@ -1138,7 +1108,7 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
unsigned Reg = MipsFI->getSRetReturnReg();
if (!Reg)
- assert(0 && "sret virtual register not created in the entry block");
+ llvm_unreachable("sret virtual register not created in the entry block");
SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag);
@@ -1188,7 +1158,7 @@ getConstraintType(const std::string &Constraint) const
/// return a list of registers that can be used to satisfy the constraint.
/// This should only be used for C_RegisterClass constraints.
std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
{
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -1210,7 +1180,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
/// pointer.
std::vector<unsigned> MipsTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const
+ EVT VT) const
{
if (Constraint.size() != 1)
return std::vector<unsigned>();
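
For reference, the LowerReturn rewrite above follows the standard copy/glue pattern; a minimal sketch (illustrative only, not part of the patch — MipsISD::Ret is the target's return node):

    // Copy each analyzed return value into its assigned physical register,
    // threading the glue (Flag) value through so the copies stay adjacent.
    SDValue Flag;
    for (unsigned i = 0; i != RVLocs.size(); ++i) {
      CCValAssign &VA = RVLocs[i];
      assert(VA.isRegLoc() && "Can only return in registers!");
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Outs[i].Val, Flag);
      Flag = Chain.getValue(1);  // glue result of the CopyToReg node
    }
    // The return node consumes the chain, plus the glue when copies exist.
    if (Flag.getNode())
      return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain, Flag);
    return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain);
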
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 9ad4895..dddba42 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -66,8 +66,8 @@ namespace llvm {
//===--------------------------------------------------------------------===//
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
- class MipsTargetLowering : public TargetLowering
- {
+
+ class MipsTargetLowering : public TargetLowering {
public:
explicit MipsTargetLowering(MipsTargetMachine &TM);
@@ -80,7 +80,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- MVT getSetCCResultType(MVT VT) const;
+ MVT::SimpleValueType getSetCCResultType(EVT VT) const;
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
@@ -88,40 +88,62 @@ namespace llvm {
// Subtarget Info
const MipsSubtarget *Subtarget;
+
// Lower Operand helpers
- SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
- bool IsGlobalInSmallSection(GlobalValue *GV);
- bool IsInSmallSection(unsigned Size);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
// Lower Operand specifics
SDValue LowerANDOR(SDValue Op, SelectionDAG &DAG);
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
// Inline asm support
ConstraintType getConstraintType(const std::string &Constraint) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
};
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index e16fd8e..9159904 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "MipsGenInstrInfo.inc"
using namespace llvm;
@@ -208,29 +209,6 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(0).addFrameIndex(FI);
}
-void MipsInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill, SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const
-{
- unsigned Opc;
- if (RC == Mips::CPURegsRegisterClass)
- Opc = Mips::SW;
- else if (RC == Mips::FGR32RegisterClass)
- Opc = Mips::SWC1;
- else {
- assert(RC == Mips::AFGR64RegisterClass);
- Opc = Mips::SDC1;
- }
-
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
void MipsInstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
@@ -251,28 +229,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0).addFrameIndex(FI);
}
-void MipsInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Opc;
- if (RC == Mips::CPURegsRegisterClass)
- Opc = Mips::LW;
- else if (RC == Mips::FGR32RegisterClass)
- Opc = Mips::LWC1;
- else {
- assert(RC == Mips::AFGR64RegisterClass);
- Opc = Mips::LDC1;
- }
-
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
MachineInstr *MipsInstrInfo::
foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
@@ -372,7 +328,7 @@ static Mips::CondCode GetCondFromBranchOpc(unsigned BrOpc)
unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC)
{
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case Mips::COND_E : return Mips::BEQ;
case Mips::COND_NE : return Mips::BNE;
case Mips::COND_GZ : return Mips::BGTZ;
@@ -421,7 +377,7 @@ unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC)
Mips::CondCode Mips::GetOppositeBranchCondition(Mips::CondCode CC)
{
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case Mips::COND_E : return Mips::COND_NE;
case Mips::COND_NE : return Mips::COND_E;
case Mips::COND_GZ : return Mips::COND_LEZ;
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 6655c67..249d3de 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -15,6 +15,7 @@
#define MIPSINSTRUCTIONINFO_H
#include "Mips.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "MipsRegisterInfo.h"
@@ -92,7 +93,7 @@ namespace Mips {
inline static const char *MipsFCCToString(Mips::CondCode CC)
{
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case FCOND_F:
case FCOND_T: return "f";
case FCOND_UN:
@@ -129,6 +130,38 @@ namespace Mips {
}
}
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // Mips Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT - Represents the offset into the global offset table at which
+ /// the address of the relocation entry symbol resides during execution.
+ MO_GOT,
+
+ /// MO_GOT_CALL - Represents the offset into the global offset table at
+ /// which the address of a call site relocation entry symbol resides
+ /// during execution. This is different from the above since this flag
+ /// can only be present in call instructions.
+ MO_GOT_CALL,
+
+ /// MO_GPREL - Represents the offset from the current gp value to be used
+ /// for the relocatable object file being produced.
+ MO_GPREL,
+
+ /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
+ /// address.
+ MO_ABS_HILO
+
+ };
+}
+
class MipsInstrInfo : public TargetInstrInfoImpl {
MipsTargetMachine &TM;
const MipsRegisterInfo RI;
@@ -182,21 +215,11 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
const SmallVectorImpl<unsigned> &Ops,
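
The MipsII::TOF values above travel as target flags on MachineOperands. A hedged sketch of a use site (the instruction and operand order here are hypothetical; addGlobalAddress taking a target-flags argument is the generic MachineInstrBuilder API):

    // Tag a global-address operand so later stages emit a GOT-relative
    // reference for it, e.g. "lw $dst, %got(sym)($gp)".
    BuildMI(MBB, I, dl, TII->get(Mips::LW), DestReg)
        .addGlobalAddress(GV, /*Offset=*/0, MipsII::MO_GOT)
        .addReg(Mips::GP);
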
diff --git a/lib/Target/Mips/MipsMCAsmInfo.cpp b/lib/Target/Mips/MipsMCAsmInfo.cpp
new file mode 100644
index 0000000..60ef1c9
--- /dev/null
+++ b/lib/Target/Mips/MipsMCAsmInfo.cpp
@@ -0,0 +1,27 @@
+//===-- MipsMCAsmInfo.cpp - Mips asm properties ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+using namespace llvm;
+
+MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, const StringRef &TT) {
+ AlignmentIsInBytes = false;
+ COMMDirectiveTakesAlignment = true;
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = 0;
+ PrivateGlobalPrefix = "$";
+ CommentString = "#";
+ ZeroDirective = "\t.space\t";
+ PICJumpTableDirective = "\t.gpword\t";
+}
diff --git a/lib/Target/Mips/MipsMCAsmInfo.h b/lib/Target/Mips/MipsMCAsmInfo.h
new file mode 100644
index 0000000..33a4b5e
--- /dev/null
+++ b/lib/Target/Mips/MipsMCAsmInfo.h
@@ -0,0 +1,30 @@
+//=====-- MipsMCAsmInfo.h - Mips asm properties ---------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ class MipsMCAsmInfo : public MCAsmInfo {
+ public:
+ explicit MipsMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index ac3cdfd..949c78a 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -57,7 +57,7 @@ private:
/// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
MipsFIHolder GPHolder;
- /// On LowerFORMAL_ARGUMENTS the stack size is unknown, so the Stack
+ /// On LowerFormalArguments the stack size is unknown, so the Stack
/// Pointer Offset calculation of "not in register arguments" must be
/// postponed to emitPrologue.
SmallVector<MipsFIHolder, 16> FnLoadArgs;
@@ -65,7 +65,7 @@ private:
// When VarArgs, we must write registers back to caller stack, preserving
// on register arguments. Since the stack size is unknown on
- // LowerFORMAL_ARGUMENTS, the Stack Pointer Offset calculation must be
+ // LowerFormalArguments, the Stack Pointer Offset calculation must be
// postponed to emitPrologue.
SmallVector<MipsFIHolder, 4> FnStoreVarArgs;
bool HasStoreVarArgs;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 579d4db..d2289e9 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -31,6 +31,8 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
@@ -79,12 +81,12 @@ getRegisterNumbering(unsigned RegEnum)
case Mips::SP : case Mips::F29: return 29;
case Mips::FP : case Mips::F30: case Mips::D15: return 30;
case Mips::RA : case Mips::F31: return 31;
- default: assert(0 && "Unknown register number!");
+ default: llvm_unreachable("Unknown register number!");
}
return 0; // Not reached
}
-unsigned MipsRegisterInfo::getPICCallReg(void) { return Mips::T9; }
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
//===----------------------------------------------------------------------===//
// Callee Saved Registers methods
@@ -210,7 +212,7 @@ getReservedRegs(const MachineFunction &MF) const
// The emitted instruction will be something like:
// lw REGX, 16+StackSize(SP)
//
-// Since the total stack size is unknown on LowerFORMAL_ARGUMENTS, all
+// Since the total stack size is unknown on LowerFormalArguments, all
// stack references (ObjectOffset) created to reference the function
// arguments, are negative numbers. This way, on eliminateFrameIndex it's
// possible to detect those references and the offsets are adjusted to
@@ -232,7 +234,7 @@ void MipsRegisterInfo::adjustMipsStackFrame(MachineFunction &MF) const
int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1;
// Replace the dummy '0' SPOffset by the negative offsets, as explained on
- // LowerFORMAL_ARGUMENTS. Leaving '0' for while is necessary to avoid
+ // LowerFormalArguments. Leaving '0' for a while is necessary to avoid
// the approach done by calculateFrameObjectOffsets to the stack frame.
MipsFI->adjustLoadArgsFI(MFI);
MipsFI->adjustStoreVarArgsFI(MFI);
@@ -346,9 +348,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// FrameIndex represents objects inside an abstract stack.
// We must replace FrameIndex with a stack/frame pointer
// direct reference.
-void MipsRegisterInfo::
-eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
- RegScavenger *RS) const
+unsigned MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ int *Value, RegScavenger *RS) const
{
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
@@ -360,34 +362,27 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
"Instr doesn't have FrameIndex operand!");
}
- #ifndef NDEBUG
- DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n";
- DOUT << "<--------->\n";
- MI.print(DOUT);
- #endif
+ DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ errs() << "<--------->\n" << MI);
int FrameIndex = MI.getOperand(i).getIndex();
int stackSize = MF.getFrameInfo()->getStackSize();
int spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
- #ifndef NDEBUG
- DOUT << "FrameIndex : " << FrameIndex << "\n";
- DOUT << "spOffset : " << spOffset << "\n";
- DOUT << "stackSize : " << stackSize << "\n";
- #endif
+ DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+ << "spOffset : " << spOffset << "\n"
+ << "stackSize : " << stackSize << "\n");
- // as explained on LowerFORMAL_ARGUMENTS, detect negative offsets
+ // As explained in LowerFormalArguments, detect negative offsets
// and adjust SPOffsets considering the final stack size.
int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset));
Offset += MI.getOperand(i-1).getImm();
- #ifndef NDEBUG
- DOUT << "Offset : " << Offset << "\n";
- DOUT << "<--------->\n";
- #endif
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
MI.getOperand(i-1).ChangeToImmediate(Offset);
MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false);
+ return 0;
}
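
The DOUT-to-DEBUG conversion above keeps the same gating semantics; as a reminder (generic LLVM usage, not part of the patch):

    // DEBUG(...) compiles away entirely in NDEBUG builds; in debug builds it
    // is further gated at runtime by -debug (and by -debug-only=<type> when
    // the file defines a DEBUG_TYPE).
    DEBUG(errs() << "Offset : " << Offset << "\n");
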
void MipsRegisterInfo::
@@ -515,19 +510,19 @@ getFrameRegister(MachineFunction &MF) const {
unsigned MipsRegisterInfo::
getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
+ llvm_unreachable("What is the exception register");
return 0;
}
unsigned MipsRegisterInfo::
getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
+ llvm_unreachable("What is the exception handler register");
return 0;
}
int MipsRegisterInfo::
getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "What is the dwarf register number");
+ llvm_unreachable("What is the dwarf register number");
return -1;
}
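
The offset arithmetic in eliminateFrameIndex is easier to follow with concrete numbers; a worked example with invented values:

    // Suppose an incoming argument was recorded with the dummy negative
    // offset -8 while the stack size was still unknown, and the finalized
    // stack size turns out to be 24:
    int spOffset  = -8;
    int stackSize = 24;
    int Offset = (spOffset < 0) ? (stackSize + (-(spOffset + 4))) : spOffset;
    // Offset == 24 + 4 == 28, so the reference becomes "lw REGX, 28(SP)".
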
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 808e995..122f786 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -34,7 +34,7 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
static unsigned getRegisterNumbering(unsigned RegEnum);
/// Get PIC indirect call register
- static unsigned getPICCallReg(void);
+ static unsigned getPICCallReg();
/// Adjust the Mips stack frame.
void adjustMipsStackFrame(MachineFunction &MF) const;
@@ -54,8 +54,9 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
MachineBasicBlock::iterator I) const;
/// Stack Frame Processing Methods
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 4245f27..db114da 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -14,37 +14,20 @@
#include "MipsSubtarget.h"
#include "Mips.h"
#include "MipsGenSubtarget.inc"
-#include "llvm/Module.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
-static cl::opt<bool>
-NotABICall("disable-mips-abicall", cl::Hidden,
- cl::desc("Disable code for SVR4-style dynamic objects"));
-static cl::opt<bool>
-AbsoluteCall("enable-mips-absolute-call", cl::Hidden,
- cl::desc("Enable absolute call within abicall"));
-static cl::opt<unsigned>
-SSThreshold("mips-ssection-threshold", cl::Hidden,
- cl::desc("Small data and bss section threshold size (default=8)"),
- cl::init(8));
-
-MipsSubtarget::MipsSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS, bool little) :
+MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS,
+ bool little) :
MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false),
- IsFP64bit(false), IsGP64bit(false), HasVFPU(false), HasABICall(true),
- HasAbsoluteCall(false), IsLinux(true), HasSEInReg(false), HasCondMov(false),
- HasMulDivAdd(false), HasMinMax(false), HasSwap(false), HasBitCount(false)
+ IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true),
+ HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false),
+ HasSwap(false), HasBitCount(false)
{
std::string CPU = "mips1";
MipsArchVersion = Mips1;
// Parse features string.
ParseSubtargetFeatures(FS, CPU);
- const std::string& TT = M.getTargetTriple();
-
- // Small section size threshold
- SSectionThreshold = SSThreshold;
// Is the target system Linux ?
if (TT.find("linux") == std::string::npos)
@@ -65,13 +48,4 @@ MipsSubtarget::MipsSubtarget(const TargetMachine &TM, const Module &M,
HasSwap = true;
HasCondMov = true;
}
-
- // Abicall is the default for O32 ABI, but is disabled within EABI and in
- // static code.
- if (NotABICall || isABI_EABI() || (TM.getRelocationModel() == Reloc::Static))
- HasABICall = false;
-
- // TODO: disable when handling 64 bit symbols in the future.
- if (HasABICall && AbsoluteCall)
- HasAbsoluteCall = true;
}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 61c37c1..1d6f87d 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -20,7 +20,6 @@
#include <string>
namespace llvm {
-class Module;
class MipsSubtarget : public TargetSubtarget {
@@ -58,20 +57,9 @@ protected:
// HasVFPU - Processor has a vector floating point unit.
bool HasVFPU;
- // IsABICall - Enable SRV4 code for SVR4-style dynamic objects
- bool HasABICall;
-
- // HasAbsoluteCall - Enable code that is not fully position-independent.
- // Only works with HasABICall enabled.
- bool HasAbsoluteCall;
-
// IsLinux - Target system is Linux. If false, we consider ELFOS for now.
bool IsLinux;
- // Put global and static items less than or equal to SSectionThreshold
- // bytes into the small data or bss section. The default is 8.
- unsigned SSectionThreshold;
-
/// Features related to the presence of specific instructions.
// HasSEInReg - SEB and SEH (signext in register) instructions.
@@ -103,9 +91,8 @@ public:
unsigned getTargetABI() const { return MipsABI; }
/// This constructor initializes the data members to match that
- /// of the specified module.
- MipsSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS, bool little);
+ /// of the specified triple.
+ MipsSubtarget(const std::string &TT, const std::string &FS, bool little);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -121,10 +108,7 @@ public:
bool isSingleFloat() const { return IsSingleFloat; };
bool isNotSingleFloat() const { return !IsSingleFloat; };
bool hasVFPU() const { return HasVFPU; };
- bool hasABICall() const { return HasABICall; };
- bool hasAbsoluteCall() const { return HasAbsoluteCall; };
bool isLinux() const { return IsLinux; };
- unsigned getSSectionThreshold() const { return SSectionThreshold; }
/// Features related to the presence of specific instructions.
bool hasSEInReg() const { return HasSEInReg; };
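
The subtarget now derives its configuration from the triple string instead of a Module; a minimal sketch of the OS check the new constructor performs:

    // Any triple containing "linux" marks the target as Linux; everything
    // else is treated as plain ELF for now.
    static bool isLinuxTriple(const std::string &TT) {
      return TT.find("linux") != std::string::npos;
    }
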
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 4675536..4fa5450 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -12,35 +12,18 @@
//===----------------------------------------------------------------------===//
#include "Mips.h"
-#include "MipsTargetAsmInfo.h"
+#include "MipsMCAsmInfo.h"
#include "MipsTargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-/// MipsTargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int MipsTargetMachineModule;
-int MipsTargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<MipsTargetMachine> X("mips", "Mips");
-static RegisterTarget<MipselTargetMachine> Y("mipsel", "Mipsel");
-
-MipsTargetMachine::AsmPrinterCtorFn MipsTargetMachine::AsmPrinterCtor = 0;
-
-
-// Force static initialization.
-extern "C" void LLVMInitializeMipsTarget() { }
-
-const TargetAsmInfo *MipsTargetMachine::
-createTargetAsmInfo() const
-{
- return new MipsTargetAsmInfo(*this);
+extern "C" void LLVMInitializeMipsTarget() {
+ // Register the target.
+ RegisterTargetMachine<MipsTargetMachine> X(TheMipsTarget);
+ RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+ RegisterAsmInfo<MipsMCAsmInfo> A(TheMipsTarget);
+ RegisterAsmInfo<MipsMCAsmInfo> B(TheMipselTarget);
}
// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -51,17 +34,22 @@ createTargetAsmInfo() const
// an easier handling.
// Using CodeModel::Large enables different CALL behavior.
MipsTargetMachine::
-MipsTargetMachine(const Module &M, const std::string &FS, bool isLittle=false):
- Subtarget(*this, M, FS, isLittle),
+MipsTargetMachine(const Target &T, const std::string &TT, const std::string &FS,
+ bool isLittle=false):
+ LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, isLittle),
DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32") :
std::string("E-p:32:32:32-i8:8:32-i16:16:32")),
InstrInfo(*this),
FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0),
- TLInfo(*this)
-{
+ TLInfo(*this) {
// Abicall enables PIC by default
- if (Subtarget.hasABICall())
- setRelocationModel(Reloc::PIC_);
+ if (getRelocationModel() == Reloc::Default) {
+ if (Subtarget.isABI_O32())
+ setRelocationModel(Reloc::PIC_);
+ else
+ setRelocationModel(Reloc::Static);
+ }
// TODO: create an option to enable long calls, like -mlong-calls,
// that would be our CodeModel::Large. It must not work with Abicall.
@@ -70,43 +58,9 @@ MipsTargetMachine(const Module &M, const std::string &FS, bool isLittle=false):
}
MipselTargetMachine::
-MipselTargetMachine(const Module &M, const std::string &FS) :
- MipsTargetMachine(M, FS, true) {}
-
-// return 0 and must specify -march to gen MIPS code.
-unsigned MipsTargetMachine::
-getModuleMatchQuality(const Module &M)
-{
- // We strongly match "mips*-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 5 && std::string(TT.begin(), TT.begin()+5) == "mips-")
- return 20;
-
- if (TT.size() >= 13 && std::string(TT.begin(),
- TT.begin()+13) == "mipsallegrex-")
- return 20;
-
- return 0;
-}
-
-// return 0 and must specify -march to gen MIPSEL code.
-unsigned MipselTargetMachine::
-getModuleMatchQuality(const Module &M)
-{
- // We strongly match "mips*el-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 7 && std::string(TT.begin(), TT.begin()+7) == "mipsel-")
- return 20;
-
- if (TT.size() >= 15 && std::string(TT.begin(),
- TT.begin()+15) == "mipsallegrexel-")
- return 20;
-
- if (TT.size() == 3 && std::string(TT.begin(), TT.begin()+3) == "psp")
- return 20;
-
- return 0;
-}
+MipselTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS) :
+ MipsTargetMachine(T, TT, FS, true) {}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
@@ -126,14 +80,3 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
PM.add(createMipsDelaySlotFillerPass(*this));
return true;
}
-
-// Implements the AssemblyEmitter for the target. Must return
-// true if AssemblyEmitter is supported
-bool MipsTargetMachine::
-addAssemblyEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out) {
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
- return false;
-}
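
With registration moved into LLVMInitializeMipsTarget, clients select the backend through the TargetRegistry rather than module match quality; a hedged sketch of the lookup side (the triple is illustrative):

    // Force the static registrations above, then resolve a target by triple.
    LLVMInitializeMipsTargetInfo();
    LLVMInitializeMipsTarget();
    std::string Error;
    const Target *T =
        TargetRegistry::lookupTarget("mipsel-unknown-linux", Error);
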
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 95e5be4..c3428be 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -22,7 +22,7 @@
#include "llvm/Target/TargetFrameInfo.h"
namespace llvm {
- class raw_ostream;
+ class formatted_raw_ostream;
class MipsTargetMachine : public LLVMTargetMachine {
MipsSubtarget Subtarget;
@@ -30,24 +30,9 @@ namespace llvm {
MipsInstrInfo InstrInfo;
TargetFrameInfo FrameInfo;
MipsTargetLowering TLInfo;
-
- protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
- protected:
- // To avoid having target depend on the asmprinter stuff libraries,
- // asmprinter set this functions to ctor pointer at startup time if they are
- // linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- MipsTargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- MipsTargetMachine(const Module &M, const std::string &FS, bool isLittle);
-
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
+ MipsTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool isLittle);
virtual const MipsInstrInfo *getInstrInfo() const
{ return &InstrInfo; }
@@ -66,25 +51,19 @@ namespace llvm {
return const_cast<MipsTargetLowering*>(&TLInfo);
}
- static unsigned getModuleMatchQuality(const Module &M);
-
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
};
/// MipselTargetMachine - Mipsel target machine.
///
class MipselTargetMachine : public MipsTargetMachine {
public:
- MipselTargetMachine(const Module &M, const std::string &FS);
-
- static unsigned getModuleMatchQuality(const Module &M);
+ MipselTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
};
} // End llvm namespace
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 0000000..85e9d65
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,93 @@
+//===-- MipsTargetObjectFile.cpp - Mips object files ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+ cl::desc("Small data and bss section threshold size (default=8)"),
+ cl::init(8));
+
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+ SmallDataSection =
+ getELFSection(".sdata", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+
+ SmallBSSSection =
+ getELFSection(".sbss", MCSectionELF::SHT_NOBITS,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getBSS());
+
+}
+
+// An address must be loaded from a small section if its size is less than or
+// equal to the small section size threshold. Data in this section must be
+// addressed using the gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+ return Size > 0 && Size <= SSThreshold;
+}
+
+bool MipsTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+ return false;
+
+ return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global address should be
+/// placed into small data/bss section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+ SectionKind Kind) const {
+ // Only global variables, not functions.
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+ if (!GVA)
+ return false;
+
+ // We can only do this for datarel or BSS objects for now.
+ if (!Kind.isBSS() && !Kind.isDataRel())
+ return false;
+
+ // If this is an internal constant string, there is a special
+ // section for it, but not in small data/bss.
+ if (Kind.isMergeable1ByteCString())
+ return false;
+
+ const Type *Ty = GV->getType()->getElementType();
+ return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
+}
+
+
+
+const MCSection *MipsTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ // TODO: Could also support "weak" symbols with ".gnu.linkonce.s.*"
+ // sections?
+
+ // Handle Small Section classification here.
+ if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+ return SmallBSSSection;
+ if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
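
Given the IsInSmallSection predicate above and the default -mips-ssection-threshold of 8, a quick illustration of which sizes qualify:

    assert(IsInSmallSection(4));    // a lone i32 global qualifies
    assert(IsInSmallSection(8));    // boundary case: size == threshold
    assert(!IsInSmallSection(16));  // e.g. [4 x i32] stays in .data/.bss
    assert(!IsInSmallSection(0));   // zero-sized objects never qualify
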
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 0000000..32e0436
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,41 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+#define LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+ class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+ const MCSection *SmallDataSection;
+ const MCSection *SmallBSSSection;
+ public:
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+
+ /// IsGlobalInSmallSection - Return true if this global address should be
+ /// placed into small data/bss section.
+ bool IsGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM, SectionKind Kind)const;
+ bool IsGlobalInSmallSection(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
+ const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind,
+ Mangler *Mang,
+ const TargetMachine &TM) const;
+
+ // TODO: Classify globals as mips wishes.
+ };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..6e5d56b
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMipsInfo
+ MipsTargetInfo.cpp
+ )
+
+add_dependencies(LLVMMipsInfo MipsCodeGenTable_gen)
diff --git a/lib/Target/Mips/TargetInfo/Makefile b/lib/Target/Mips/TargetInfo/Makefile
new file mode 100644
index 0000000..32f4e16
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Mips/TargetInfo/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMipsInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 0000000..cc3d61e
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+
+extern "C" void LLVMInitializeMipsTargetInfo() {
+ RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips");
+
+ RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel");
+}
diff --git a/lib/Target/PIC16/AsmPrinter/CMakeLists.txt b/lib/Target/PIC16/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..2e1b809
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(
+ ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/..
+ )
+
+add_llvm_library(LLVMPIC16AsmPrinter
+ PIC16AsmPrinter.cpp
+ )
+add_dependencies(LLVMPIC16AsmPrinter PIC16CodeGenTable_gen)
diff --git a/lib/Target/PIC16/AsmPrinter/Makefile b/lib/Target/PIC16/AsmPrinter/Makefile
new file mode 100644
index 0000000..f4db57e
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PIC16/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPIC16AsmPrinter
+
+# Hack: we need to include 'main' pic16 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
new file mode 100644
index 0000000..3f415af
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
@@ -0,0 +1,484 @@
+//===-- PIC16AsmPrinter.cpp - PIC16 LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16AsmPrinter.h"
+#include "MCSectionPIC16.h"
+#include "PIC16MCAsmInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include <cstring>
+using namespace llvm;
+
+#include "PIC16GenAsmWriter.inc"
+
+PIC16AsmPrinter::PIC16AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+: AsmPrinter(O, TM, T, V), DbgInfo(O, T) {
+ PTLI = static_cast<PIC16TargetLowering*>(TM.getTargetLowering());
+ PMAI = static_cast<const PIC16MCAsmInfo*>(T);
+ PTOF = (PIC16TargetObjectFile*)&PTLI->getObjFileLowering();
+}
+
+bool PIC16AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ processDebugLoc(MI, true);
+ printInstruction(MI);
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+ processDebugLoc(MI, false);
+ return true;
+}
+
+/// runOnMachineFunction - This emits the frame section, autos section and
+/// assembly for each instruction. It also takes care of the function-begin
+/// and file-begin debug directives (if required) for the function.
+///
+bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+
+ // This calls the base class function required to be called at beginning
+ // of runOnMachineFunction.
+ SetupMachineFunction(MF);
+
+ // Get the mangled name.
+ const Function *F = MF.getFunction();
+ CurrentFnName = Mang->getMangledName(F);
+
+ // Emit the function frame (args and temps).
+ EmitFunctionFrame(MF);
+
+ DbgInfo.BeginFunction(MF);
+
+ // Emit the autos section of function.
+ EmitAutos(CurrentFnName);
+
+ // Now emit the instructions of function in its code section.
+ const MCSection *fCodeSection =
+ getObjFileLowering().getSectionForFunction(CurrentFnName);
+ // Start the Code Section.
+ O << "\n";
+ OutStreamer.SwitchSection(fCodeSection);
+
+ // Emit the frame address of the function at the beginning of code.
+ O << "\tretlw low(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
+ O << "\tretlw high(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
+
+ // Emit function start label.
+ O << CurrentFnName << ":\n";
+
+ DebugLoc CurDL;
+ O << "\n";
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ EmitBasicBlockStart(I);
+ }
+
+ // Print a basic block.
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+
+ // Emit the line directive if source line changed.
+ const DebugLoc DL = II->getDebugLoc();
+ if (!DL.isUnknown() && DL != CurDL) {
+ DbgInfo.ChangeDebugLoc(MF, DL);
+ CurDL = DL;
+ }
+
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // Emit function end debug directives.
+ DbgInfo.EndFunction(MF);
+
+ return false; // we didn't modify anything.
+}
+
+
+// printOperand - print operand of insn.
+void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << getRegisterName(MO.getReg());
+ return;
+
+ case MachineOperand::MO_Immediate:
+ O << (int)MO.getImm();
+ return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ std::string Sname = Mang->getMangledName(MO.getGlobal());
+ // FIXME: Currently a memcpy definition never comes in through the module,
+ // since we do not link those in as a .bc lib. So these calls are always
+ // external and it is safe to emit an extern declaration.
+ if (PAN::isMemIntrinsic(Sname)) {
+ LibcallDecls.push_back(createESName(Sname));
+ }
+
+ O << Sname;
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const char *Sname = MO.getSymbolName();
+
+ // If it's a libcall name, record it in the decls section.
+ if (PAN::getSymbolTag(Sname) == PAN::LIBCALL) {
+ LibcallDecls.push_back(Sname);
+ }
+
+ // Record a call to an intrinsic to print the extern declaration for it.
+ std::string Sym = Sname;
+ if (PAN::isMemIntrinsic(Sym)) {
+ Sym = PAN::addPrefix(Sym);
+ LibcallDecls.push_back(createESName(Sym));
+ }
+
+ O << Sym;
+ break;
+ }
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+
+ default:
+ llvm_unreachable(" Operand type not supported.");
+ }
+}
+
+/// printCCOperand - Print the cond code operand.
+///
+void PIC16AsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+ int CC = (int)MI->getOperand(opNum).getImm();
+ O << PIC16CondCodeToString((PIC16CC::CondCodes)CC);
+}
+
+// This function is used to sort the decls list.
+// should return true if s1 should come before s2.
+static bool is_before(const char *s1, const char *s2) {
+ return strcmp(s1, s2) <= 0;
+}
+
+// This is used by list::unique below.
+// unique will filter out duplicates if it knows them.
+static bool is_duplicate(const char *s1, const char *s2) {
+ return !strcmp(s1, s2);
+}
+
+/// printLibcallDecls - print the extern declarations for compiler
+/// intrinsics.
+///
+void PIC16AsmPrinter::printLibcallDecls() {
+ // If no libcalls used, return.
+ if (LibcallDecls.empty()) return;
+
+ O << MAI->getCommentString() << "External decls for libcalls - BEGIN." <<"\n";
+ // Remove duplicate entries.
+ LibcallDecls.sort(is_before);
+ LibcallDecls.unique(is_duplicate);
+
+ for (std::list<const char*>::const_iterator I = LibcallDecls.begin();
+ I != LibcallDecls.end(); I++) {
+ O << MAI->getExternDirective() << *I << "\n";
+ O << MAI->getExternDirective() << PAN::getArgsLabel(*I) << "\n";
+ O << MAI->getExternDirective() << PAN::getRetvalLabel(*I) << "\n";
+ }
+ O << MAI->getCommentString() << "External decls for libcalls - END." <<"\n";
+}
+
+/// doInitialization - Perform Module level initializations here.
+/// One task that we do here is to sectionize all global variables.
+/// The MemSelOptimizer pass depends on the sectionizing.
+///
+bool PIC16AsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ // FIXME: This is a temporary solution to generate the include file.
+ // The processor should be passed to llc as an input and the header file
+ // should be generated accordingly.
+ O << "\n\t#include P16F1937.INC\n";
+
+ // Set the section names for all globals.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+ const MCSection *S = getObjFileLowering().SectionForGlobal(I, Mang, TM);
+
+ I->setSection(((const MCSectionPIC16*)S)->getName());
+ }
+
+ DbgInfo.BeginModule(M);
+ EmitFunctionDecls(M);
+ EmitUndefinedVars(M);
+ EmitDefinedVars(M);
+ EmitIData(M);
+ EmitUData(M);
+ EmitRomData(M);
+ return Result;
+}
+
+/// Emit extern decls for functions imported from other modules, and emit
+/// global declarations for functions defined in this module which are
+/// available to other modules.
+///
+void PIC16AsmPrinter::EmitFunctionDecls(Module &M) {
+ // Emit declarations for external functions.
+ O <<"\n"<<MAI->getCommentString() << "Function Declarations - BEGIN." <<"\n";
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; I++) {
+ if (I->isIntrinsic())
+ continue;
+
+ std::string Name = Mang->getMangledName(I);
+ if (Name.compare("@abort") == 0)
+ continue;
+
+ if (!I->isDeclaration() && !I->hasExternalLinkage())
+ continue;
+
+ // Do not emit memcpy, memset, and memmove here.
+ // Calls to these routines can be generated in two ways:
+ // 1. User calling the standard lib function
+ // 2. Codegen generating these calls for llvm intrinsics.
+ // In the first case a prototype is already available, while in the
+ // second case the call is via an ExternalSymbol and the prototype is
+ // missing. So declarations for these are currently always printed by
+ // tracking both kinds of references in printInstruction.
+ if (I->isDeclaration() && PAN::isMemIntrinsic(Name)) continue;
+
+ const char *directive = I->isDeclaration() ? MAI->getExternDirective() :
+ MAI->getGlobalDirective();
+
+ O << directive << Name << "\n";
+ O << directive << PAN::getRetvalLabel(Name) << "\n";
+ O << directive << PAN::getArgsLabel(Name) << "\n";
+ }
+
+ O << MAI->getCommentString() << "Function Declarations - END." <<"\n";
+}
+
+// Emit variables imported from other Modules.
+void PIC16AsmPrinter::EmitUndefinedVars(Module &M) {
+ std::vector<const GlobalVariable*> Items = PTOF->ExternalVarDecls->Items;
+ if (!Items.size()) return;
+
+ O << "\n" << MAI->getCommentString() << "Imported Variables - BEGIN" << "\n";
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << MAI->getExternDirective() << Mang->getMangledName(Items[j]) << "\n";
+ }
+ O << MAI->getCommentString() << "Imported Variables - END" << "\n";
+}
+
+// Emit variables defined in this module and are available to other modules.
+void PIC16AsmPrinter::EmitDefinedVars(Module &M) {
+ std::vector<const GlobalVariable*> Items = PTOF->ExternalVarDefs->Items;
+ if (!Items.size()) return;
+
+ O << "\n" << MAI->getCommentString() << "Exported Variables - BEGIN" << "\n";
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << MAI->getGlobalDirective() << Mang->getMangledName(Items[j]) << "\n";
+ }
+ O << MAI->getCommentString() << "Exported Variables - END" << "\n";
+}
+
+// Emit initialized data placed in ROM.
+void PIC16AsmPrinter::EmitRomData(Module &M) {
+ // Print ROM Data section.
+ const std::vector<PIC16Section*> &ROSections = PTOF->ROSections;
+ for (unsigned i = 0; i < ROSections.size(); i++) {
+ const std::vector<const GlobalVariable*> &Items = ROSections[i]->Items;
+ if (!Items.size()) continue;
+ O << "\n";
+ OutStreamer.SwitchSection(PTOF->ROSections[i]->S_);
+ for (unsigned j = 0; j < Items.size(); j++) {
+ O << Mang->getMangledName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ int AddrSpace = Items[j]->getType()->getAddressSpace();
+ EmitGlobalConstant(C, AddrSpace);
+ }
+ }
+}
+
+bool PIC16AsmPrinter::doFinalization(Module &M) {
+ printLibcallDecls();
+ EmitRemainingAutos();
+ DbgInfo.EndModule(M);
+ O << "\n\t" << "END\n";
+ return AsmPrinter::doFinalization(M);
+}
+
+void PIC16AsmPrinter::EmitFunctionFrame(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ std::string FuncName = Mang->getMangledName(F);
+ const TargetData *TD = TM.getTargetData();
+ // Emit the data section name.
+ O << "\n";
+
+ const MCSection *fPDataSection =
+ getObjFileLowering().getSectionForFunctionFrame(CurrentFnName);
+ OutStreamer.SwitchSection(fPDataSection);
+
+ // Emit function frame label
+ O << PAN::getFrameLabel(CurrentFnName) << ":\n";
+
+ const Type *RetType = F->getReturnType();
+ unsigned RetSize = 0;
+ if (RetType->getTypeID() != Type::VoidTyID)
+ RetSize = TD->getTypeAllocSize(RetType);
+
+ // Emit function return value space.
+ // FIXME: Do not emit RetvalLabel when RetSize is zero. To do this
+ // we will need to avoid printing a global directive for the Retval label
+ // in emitExternandGloblas.
+ if (RetSize > 0)
+ O << PAN::getRetvalLabel(CurrentFnName) << " RES " << RetSize << "\n";
+ else
+ O << PAN::getRetvalLabel(CurrentFnName) << ": \n";
+
+ // Emit variable to hold the space for function arguments
+ unsigned ArgSize = 0;
+ for (Function::const_arg_iterator argi = F->arg_begin(),
+ arge = F->arg_end(); argi != arge ; ++argi) {
+ const Type *Ty = argi->getType();
+ ArgSize += TD->getTypeAllocSize(Ty);
+ }
+
+ O << PAN::getArgsLabel(CurrentFnName) << " RES " << ArgSize << "\n";
+
+ // Emit temporary space
+ int TempSize = PTLI->GetTmpSize();
+ if (TempSize > 0)
+ O << PAN::getTempdataLabel(CurrentFnName) << " RES " << TempSize << '\n';
+}
+
+void PIC16AsmPrinter::EmitIData(Module &M) {
+
+ // Print all IDATA sections.
+ const std::vector<PIC16Section*> &IDATASections = PTOF->IDATASections;
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ O << "\n";
+ if (IDATASections[i]->S_->getName().find("llvm.") != std::string::npos)
+ continue;
+ OutStreamer.SwitchSection(IDATASections[i]->S_);
+ std::vector<const GlobalVariable*> Items = IDATASections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string Name = Mang->getMangledName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ int AddrSpace = Items[j]->getType()->getAddressSpace();
+ O << Name;
+ EmitGlobalConstant(C, AddrSpace);
+ }
+ }
+}
+
+void PIC16AsmPrinter::EmitUData(Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print all BSS sections.
+ const std::vector<PIC16Section*> &BSSSections = PTOF->BSSSections;
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ O << "\n";
+ OutStreamer.SwitchSection(BSSSections[i]->S_);
+ std::vector<const GlobalVariable*> Items = BSSSections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string Name = Mang->getMangledName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ const Type *Ty = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+
+ O << Name << " RES " << Size << "\n";
+ }
+ }
+}
+
+void PIC16AsmPrinter::EmitAutos(std::string FunctName) {
+ // Section names for all globals are already set.
+ const TargetData *TD = TM.getTargetData();
+
+ // Now print Autos section for this function.
+ std::string SectionName = PAN::getAutosSectionName(FunctName);
+ const std::vector<PIC16Section*> &AutosSections = PTOF->AutosSections;
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+ O << "\n";
+ if (AutosSections[i]->S_->getName() == SectionName) {
+ // Set the printing status to true
+ AutosSections[i]->setPrintedStatus(true);
+ OutStreamer.SwitchSection(AutosSections[i]->S_);
+ const std::vector<const GlobalVariable*> &Items = AutosSections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string VarName = Mang->getMangledName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ const Type *Ty = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ // Emit memory reserve directive.
+ O << VarName << " RES " << Size << "\n";
+ }
+ break;
+ }
+ }
+}
+
+// Print autos that were not printed during the code printing of functions,
+// as the functions themselves might have been deleted by the optimizer.
+void PIC16AsmPrinter::EmitRemainingAutos() {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print the autos sections that were not emitted along with their functions.
+ const std::vector<PIC16Section*> &AutosSections = PTOF->AutosSections;
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+
+ // If the section has already been printed, don't print it again.
+ if (AutosSections[i]->isPrinted())
+ continue;
+
+ // Set status as printed
+ AutosSections[i]->setPrintedStatus(true);
+
+ O << "\n";
+ OutStreamer.SwitchSection(AutosSections[i]->S_);
+ const std::vector<const GlobalVariable*> &Items = AutosSections[i]->Items;
+ for (unsigned j = 0; j < Items.size(); j++) {
+ std::string VarName = Mang->getMangledName(Items[j]);
+ Constant *C = Items[j]->getInitializer();
+ const Type *Ty = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Ty);
+ // Emit memory reserve directive.
+ O << VarName << " RES " << Size << "\n";
+ }
+ }
+}
+
+
+extern "C" void LLVMInitializePIC16AsmPrinter() {
+ RegisterAsmPrinter<PIC16AsmPrinter> X(ThePIC16Target);
+}
+
+
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
new file mode 100644
index 0000000..2dd4600
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
@@ -0,0 +1,80 @@
+//===-- PIC16AsmPrinter.h - PIC16 LLVM assembly writer ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ASMPRINTER_H
+#define PIC16ASMPRINTER_H
+
+#include "PIC16.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16DebugInfo.h"
+#include "PIC16MCAsmInfo.h"
+#include "PIC16TargetObjectFile.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include <list>
+#include <string>
+
+namespace llvm {
+ class VISIBILITY_HIDDEN PIC16AsmPrinter : public AsmPrinter {
+ public:
+ explicit PIC16AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V);
+ private:
+ virtual const char *getPassName() const {
+ return "PIC16 Assembly Printer";
+ }
+
+ PIC16TargetObjectFile &getObjFileLowering() const {
+ return (PIC16TargetObjectFile &)AsmPrinter::getObjFileLowering();
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+ void printOperand(const MachineInstr *MI, int opNum);
+ void printCCOperand(const MachineInstr *MI, int opNum);
+ void printInstruction(const MachineInstr *MI); // definition autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
+ bool printMachineInstruction(const MachineInstr *MI);
+ void EmitFunctionDecls (Module &M);
+ void EmitUndefinedVars (Module &M);
+ void EmitDefinedVars (Module &M);
+ void EmitIData (Module &M);
+ void EmitUData (Module &M);
+ void EmitAutos (std::string FunctName);
+ void EmitRemainingAutos ();
+ void EmitRomData (Module &M);
+ void EmitFunctionFrame(MachineFunction &MF);
+ void printLibcallDecls();
+ protected:
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ /// PrintGlobalVariable - Emit the specified global variable and its
+ /// initializer to the output stream.
+ virtual void PrintGlobalVariable(const GlobalVariable *GV) {
+ // PIC16 doesn't use normal hooks for this.
+ }
+
+ private:
+ PIC16TargetObjectFile *PTOF;
+ PIC16TargetLowering *PTLI;
+ PIC16DbgInfo DbgInfo;
+ const PIC16MCAsmInfo *PMAI;
+ std::list<const char *> LibcallDecls; // List of extern decls.
+ };
+} // end of namespace
+
+#endif
diff --git a/lib/Target/PIC16/CMakeLists.txt b/lib/Target/PIC16/CMakeLists.txt
index 00d737a..0ee88f9 100644
--- a/lib/Target/PIC16/CMakeLists.txt
+++ b/lib/Target/PIC16/CMakeLists.txt
@@ -11,14 +11,14 @@ tablegen(PIC16GenCallingConv.inc -gen-callingconv)
tablegen(PIC16GenSubtarget.inc -gen-subtarget)
add_llvm_target(PIC16
- PIC16AsmPrinter.cpp
PIC16DebugInfo.cpp
PIC16InstrInfo.cpp
PIC16ISelDAGToDAG.cpp
PIC16ISelLowering.cpp
PIC16MemSelOpt.cpp
+ PIC16MCAsmInfo.cpp
PIC16RegisterInfo.cpp
PIC16Subtarget.cpp
- PIC16TargetAsmInfo.cpp
PIC16TargetMachine.cpp
+ PIC16TargetObjectFile.cpp
)
diff --git a/lib/Target/PIC16/MCSectionPIC16.h b/lib/Target/PIC16/MCSectionPIC16.h
new file mode 100644
index 0000000..352be99
--- /dev/null
+++ b/lib/Target/PIC16/MCSectionPIC16.h
@@ -0,0 +1,88 @@
+//===- MCSectionPIC16.h - PIC16-specific section representation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MCSectionPIC16 class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PIC16SECTION_H
+#define LLVM_PIC16SECTION_H
+
+#include "llvm/MC/MCSection.h"
+
+namespace llvm {
+
+ /// MCSectionPIC16 - Represents a physical section in PIC16 COFF.
+ /// Contains data objects.
+ ///
+ class MCSectionPIC16 : public MCSection {
+ /// Name of the section to uniquely identify it.
+ std::string Name;
+
+ /// User can specify an address at which a section should be placed.
+ /// Negative value here means user hasn't specified any.
+ int Address;
+
+ /// Overlay information - Sections with same color can be overlaid on
+ /// one another.
+ int Color;
+
+ /// Contained data objects.
+ std::vector<const GlobalVariable *> Items;
+
+ /// Total size of all data objects contained here.
+ unsigned Size;
+
+ MCSectionPIC16(const StringRef &name, SectionKind K, int addr, int color)
+ : MCSection(K), Name(name), Address(addr), Color(color), Size(0) {
+ }
+
+ public:
+ /// Return the name of the section.
+ const std::string &getName() const { return Name; }
+
+ /// Return the Address of the section.
+ int getAddress() const { return Address; }
+
+ /// Return the Color of the section.
+ int getColor() const { return Color; }
+
+ /// PIC16 terminology for section kinds is as below:
+ /// UDATA - BSS
+ /// IDATA - initialized data (equivalent to Metadata)
+ /// ROMDATA - ReadOnly.
+ /// UDATA_OVR - Sections that can be overlaid. A section of this type is
+ /// used to contain function autos and frame. We can think of
+ /// it as equivalent to llvm ThreadBSS.
+ /// So, let's have some convenience functions to map PIC16 section types
+ /// to SectionKind just for the sake of better readability.
+ static SectionKind UDATA_Kind() { return SectionKind::getBSS(); }
+ static SectionKind IDATA_Kind() { return SectionKind::getMetadata(); }
+ static SectionKind ROMDATA_Kind() { return SectionKind::getReadOnly(); }
+ static SectionKind UDATA_OVR_Kind() { return SectionKind::getThreadBSS(); }
+
+ // FIXME: Could we just do getKind() == UDATA_Kind() here?
+ bool isUDATA_Kind() { return getKind().isBSS(); }
+ bool isIDATA_Kind() { return getKind().isMetadata(); }
+ bool isROMDATA_Kind() { return getKind().isReadOnly(); }
+ bool isUDATA_OVR_Kind() { return getKind().isThreadBSS(); }
+
+ /// This would be the only way to create a section.
+ static MCSectionPIC16 *Create(const StringRef &Name, SectionKind K,
+ int Address, int Color, MCContext &Ctx);
+
+ /// Override this as PIC16 has its own way of printing switching
+ /// to a section.
+ virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
+ raw_ostream &OS) const;
+ };
+
+} // end namespace llvm
+
+#endif
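
As an illustration of the interface above, a hypothetical caller holding an MCContext could create and classify one of these sections as follows. This is a minimal sketch, not code from this patch; the section name, address and color values are made up.

    // Hypothetical usage of MCSectionPIC16; assumes an MCContext &Ctx is in scope.
    MCSectionPIC16 *S =
        MCSectionPIC16::Create("@foo.autos_section.#",
                               MCSectionPIC16::UDATA_OVR_Kind(),
                               /*Address=*/-1, /*Color=*/0, Ctx);
    // UDATA_OVR_Kind() is ThreadBSS, so the kind predicate holds.
    assert(S->isUDATA_OVR_Kind() && "autos sections map onto ThreadBSS");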
diff --git a/lib/Target/PIC16/Makefile b/lib/Target/PIC16/Makefile
index c429324..f913675 100644
--- a/lib/Target/PIC16/Makefile
+++ b/lib/Target/PIC16/Makefile
@@ -7,7 +7,7 @@
#
##===----------------------------------------------------------------------===##
LEVEL = ../../..
-LIBRARYNAME = LLVMPIC16
+LIBRARYNAME = LLVMPIC16CodeGen
TARGET = PIC16
# Make sure that tblgen is run, first thing.
@@ -17,5 +17,7 @@ BUILT_SOURCES = PIC16GenRegisterInfo.h.inc PIC16GenRegisterNames.inc \
PIC16GenDAGISel.inc PIC16GenCallingConv.inc \
PIC16GenSubtarget.inc
+DIRS = AsmPrinter TargetInfo
+
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PIC16/PIC16.h b/lib/Target/PIC16/PIC16.h
index 7940648..8a3704d 100644
--- a/lib/Target/PIC16/PIC16.h
+++ b/lib/Target/PIC16/PIC16.h
@@ -15,8 +15,8 @@
#ifndef LLVM_TARGET_PIC16_H
#define LLVM_TARGET_PIC16_H
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
-#include <iosfwd>
#include <cassert>
#include <sstream>
#include <cstring>
@@ -26,7 +26,7 @@ namespace llvm {
class PIC16TargetMachine;
class FunctionPass;
class MachineCodeEmitter;
- class raw_ostream;
+ class formatted_raw_ostream;
namespace PIC16CC {
enum CondCodes {
@@ -83,7 +83,7 @@ namespace PIC16CC {
// initialized globals - @idata.<num>.#
// Function frame - @<func>.frame_section.
// Function autos - @<func>.autos_section.
- // Declarations - @section.0
+ // Declarations - Enclosed in comments. No section for them.
//----------------------------------------------------------
// Tags used to mangle different names.
@@ -221,17 +221,29 @@ namespace PIC16CC {
return Func1 + tag + "# CODE";
}
- // udata and idata section names are generated by a given number.
+ // udata, romdata and idata section names are generated by a given number.
// @udata.<num>.#
- static std::string getUdataSectionName(unsigned num) {
+ static std::string getUdataSectionName(unsigned num,
+ std::string prefix = "") {
std::ostringstream o;
- o << getTagName(PREFIX_SYMBOL) << "udata." << num << ".# UDATA";
+ o << getTagName(PREFIX_SYMBOL) << prefix << "udata." << num
+ << ".# UDATA";
return o.str();
}
- static std::string getIdataSectionName(unsigned num) {
+ static std::string getRomdataSectionName(unsigned num,
+ std::string prefix = "") {
std::ostringstream o;
- o << getTagName(PREFIX_SYMBOL) << "idata." << num << ".# IDATA";
+ o << getTagName(PREFIX_SYMBOL) << prefix << "romdata." << num
+ << ".# ROMDATA";
+ return o.str();
+ }
+
+ static std::string getIdataSectionName(unsigned num,
+ std::string prefix = "") {
+ std::ostringstream o;
+ o << getTagName(PREFIX_SYMBOL) << prefix << "idata." << num
+ << ".# IDATA";
return o.str();
}
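
For illustration, given the '@' PREFIX_SYMBOL tag documented in the naming scheme above, these generators produce strings of the following shape (sketched values, assuming the helpers sit on the PAN mangling class used elsewhere in this file):

    PAN::getUdataSectionName(2);        // "@udata.2.# UDATA"
    PAN::getRomdataSectionName(0);      // "@romdata.0.# ROMDATA"
    PAN::getIdataSectionName(1, "fn."); // "@fn.idata.1.# IDATA"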
@@ -242,6 +254,15 @@ namespace PIC16CC {
return false;
}
+ inline static bool isMemIntrinsic (const std::string &Name) {
+ if (Name.compare("@memcpy") == 0 || Name.compare("@memset") == 0 ||
+ Name.compare("@memmove") == 0) {
+ return true;
+ }
+
+ return false;
+ }
+
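A hypothetical call site for the helper above, again assuming it lives on the PAN class like its neighbors; the callee name is taken from the literals in the function itself:

    // e.g. deciding whether a mangled callee is one of the mem intrinsics:
    if (PAN::isMemIntrinsic("@memcpy")) {
      // treat the call as a memory intrinsic
    }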
inline static bool isLocalToFunc (std::string &Func, std::string &Var) {
if (! isLocalName(Var)) return false;
@@ -295,7 +316,7 @@ namespace PIC16CC {
inline static const char *PIC16CondCodeToString(PIC16CC::CondCodes CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case PIC16CC::NE: return "ne";
case PIC16CC::EQ: return "eq";
case PIC16CC::LT: return "lt";
@@ -311,7 +332,7 @@ namespace PIC16CC {
inline static bool isSignedComparison(PIC16CC::CondCodes CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case PIC16CC::NE:
case PIC16CC::EQ:
case PIC16CC::LT:
@@ -330,11 +351,12 @@ namespace PIC16CC {
FunctionPass *createPIC16ISelDag(PIC16TargetMachine &TM);
- FunctionPass *createPIC16CodePrinterPass(raw_ostream &OS,
- PIC16TargetMachine &TM,
- bool Verbose);
- // Banksel optimzer pass.
+ // Banksel optimizer pass.
FunctionPass *createPIC16MemSelOptimizerPass();
+
+ extern Target ThePIC16Target;
+ extern Target TheCooperTarget;
+
} // end namespace llvm;
// Defines symbolic names for PIC16 registers. This defines a mapping from
diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp
index 4300588..961caed 100644
--- a/lib/Target/PIC16/PIC16DebugInfo.cpp
+++ b/lib/Target/PIC16/PIC16DebugInfo.cpp
@@ -1,3 +1,4 @@
+
//===-- PIC16DebugInfo.cpp - Implementation for PIC16 Debug Information ======//
//
// The LLVM Compiler Infrastructure
@@ -15,8 +16,10 @@
#include "PIC16DebugInfo.h"
#include "llvm/GlobalVariable.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/DebugLoc.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
using namespace llvm;
@@ -25,11 +28,11 @@ using namespace llvm;
void PIC16DbgInfo::PopulateDebugInfo (DIType Ty, unsigned short &TypeNo,
bool &HasAux, int Aux[],
std::string &TagName) {
- if (Ty.isBasicType(Ty.getTag()))
+ if (Ty.isBasicType())
PopulateBasicTypeInfo (Ty, TypeNo);
- else if (Ty.isDerivedType(Ty.getTag()))
+ else if (Ty.isDerivedType())
PopulateDerivedTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
- else if (Ty.isCompositeType(Ty.getTag()))
+ else if (Ty.isCompositeType())
PopulateCompositeTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
else {
TypeNo = PIC16Dbg::T_NULL;
@@ -41,8 +44,7 @@ void PIC16DbgInfo::PopulateDebugInfo (DIType Ty, unsigned short &TypeNo,
/// PopulateBasicTypeInfo- Populate TypeNo for basic type from Ty.
///
void PIC16DbgInfo::PopulateBasicTypeInfo (DIType Ty, unsigned short &TypeNo) {
- std::string Name = "";
- Ty.getName(Name);
+ std::string Name = Ty.getName();
unsigned short BaseTy = GetTypeDebugNumber(Name);
TypeNo = TypeNo << PIC16Dbg::S_BASIC;
TypeNo = TypeNo | (0xffff & BaseTy);
@@ -67,7 +69,7 @@ void PIC16DbgInfo::PopulateDerivedTypeInfo (DIType Ty, unsigned short &TypeNo,
// We also need to encode the information about the base type of
// pointer in TypeNo.
- DIType BaseType = DIDerivedType(Ty.getGV()).getTypeDerivedFrom();
+ DIType BaseType = DIDerivedType(Ty.getNode()).getTypeDerivedFrom();
PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TagName);
}
@@ -76,7 +78,7 @@ void PIC16DbgInfo::PopulateArrayTypeInfo (DIType Ty, unsigned short &TypeNo,
bool &HasAux, int Aux[],
std::string &TagName) {
- DICompositeType CTy = DICompositeType(Ty.getGV());
+ DICompositeType CTy = DICompositeType(Ty.getNode());
DIArray Elements = CTy.getTypeArray();
unsigned short size = 1;
unsigned short Dimension[4]={0,0,0,0};
@@ -85,7 +87,7 @@ void PIC16DbgInfo::PopulateArrayTypeInfo (DIType Ty, unsigned short &TypeNo,
if (Element.getTag() == dwarf::DW_TAG_subrange_type) {
TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
TypeNo = TypeNo | PIC16Dbg::DT_ARY;
- DISubrange SubRange = DISubrange(Element.getGV());
+ DISubrange SubRange = DISubrange(Element.getNode());
Dimension[i] = SubRange.getHi() - SubRange.getLo() + 1;
// Each dimension is represented by 2 bytes starting at byte 9.
Aux[8+i*2+0] = Dimension[i];
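
As a worked example of the encoding above, for a hypothetical array int a[10][3] (not taken from this patch):

    // i = 0: SubRange [0,9] -> Dimension[0] = 10 -> Aux[8]  = 10
    // i = 1: SubRange [0,2] -> Dimension[1] = 3  -> Aux[10] = 3
    // The odd bytes (Aux[9], Aux[11]) would carry each dimension's high
    // byte, since every dimension occupies 2 bytes starting at byte 9.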
@@ -108,16 +110,20 @@ void PIC16DbgInfo::PopulateStructOrUnionTypeInfo (DIType Ty,
unsigned short &TypeNo,
bool &HasAux, int Aux[],
std::string &TagName) {
- DICompositeType CTy = DICompositeType(Ty.getGV());
+ DICompositeType CTy = DICompositeType(Ty.getNode());
TypeNo = TypeNo << PIC16Dbg::S_BASIC;
if (Ty.getTag() == dwarf::DW_TAG_structure_type)
TypeNo = TypeNo | PIC16Dbg::T_STRUCT;
else
TypeNo = TypeNo | PIC16Dbg::T_UNION;
- CTy.getName(TagName);
+ TagName = CTy.getName();
// UniqueSuffix is .number where number is obtained from
// llvm.dbg.composite<number>.
- std::string UniqueSuffix = "." + Ty.getGV()->getName().substr(18);
+ // FIXME: This will break when composite type is not represented by
+ // llvm.dbg.composite* global variable. Since we need to revisit
+ // PIC16DebugInfo implementation anyways after the MDNodes based
+ // framework is done, let us continue with the way it is.
+ std::string UniqueSuffix = "." + Ty.getNode()->getNameStr().substr(18);
TagName += UniqueSuffix;
unsigned short size = CTy.getSizeInBits()/8;
// 7th and 8th byte represent size.
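
As a small worked example of that size encoding: a 260-byte (0x104) structure or union would be recorded little-endian, with Aux[6] = 0x04 and Aux[7] = 0x01 holding the 7th and 8th bytes.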
@@ -200,12 +206,14 @@ short PIC16DbgInfo::getStorageClass(DIGlobalVariable DIGV) {
/// required initializations.
void PIC16DbgInfo::BeginModule(Module &M) {
// Emit file directive for module.
- GlobalVariable *CU = M.getNamedGlobal("llvm.dbg.compile_unit");
- if (CU) {
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(M);
+ if (DbgFinder.compile_unit_count() != 0) {
+ // FIXME: What if more than one CU is present in a module?
+ MDNode *CU = *DbgFinder.compile_unit_begin();
EmitDebugDirectives = true;
SwitchToCU(CU);
}
-
// Emit debug info for decls of composite types.
EmitCompositeTypeDecls(M);
}
@@ -233,10 +241,11 @@ void PIC16DbgInfo::BeginFunction(const MachineFunction &MF) {
// Retreive the first valid debug Loc and process it.
const DebugLoc &DL = GetDebugLocForFunction(MF);
- ChangeDebugLoc(MF, DL, true);
-
- EmitFunctBeginDI(MF.getFunction());
-
+ // Emit debug info only if valid debug info is available.
+ if (!DL.isUnknown()) {
+ ChangeDebugLoc(MF, DL, true);
+ EmitFunctBeginDI(MF.getFunction());
+ }
// Set current line to 0 so that the .line directive is generated after .bf.
CurLine = 0;
}
@@ -249,7 +258,7 @@ void PIC16DbgInfo::ChangeDebugLoc(const MachineFunction &MF,
if (! EmitDebugDirectives) return;
assert (! DL.isUnknown() && "can't change to invalid debug loc");
- GlobalVariable *CU = MF.getDebugLocTuple(DL).CompileUnit;
+ MDNode *CU = MF.getDebugLocTuple(DL).Scope;
unsigned line = MF.getDebugLocTuple(DL).Line;
SwitchToCU(CU);
@@ -268,7 +277,10 @@ void PIC16DbgInfo::SwitchToLine(unsigned Line, bool IsInBeginFunction) {
///
void PIC16DbgInfo::EndFunction(const MachineFunction &MF) {
if (! EmitDebugDirectives) return;
- EmitFunctEndDI(MF.getFunction(), CurLine);
+ const DebugLoc &DL = GetDebugLocForFunction(MF);
+ // Emit debug info only if valid debug info is available.
+ if (!DL.isUnknown())
+ EmitFunctEndDI(MF.getFunction(), CurLine);
}
/// EndModule - Emit .eof for end of module.
@@ -283,7 +295,7 @@ void PIC16DbgInfo::EndModule(Module &M) {
/// composite type.
///
void PIC16DbgInfo::EmitCompositeTypeElements (DICompositeType CTy,
- std::string UniqueSuffix) {
+ std::string SuffixNo) {
unsigned long Value = 0;
DIArray Elements = CTy.getTypeArray();
for (unsigned i = 0, N = Elements.getNumElements(); i < N; i++) {
@@ -292,24 +304,22 @@ void PIC16DbgInfo::EmitCompositeTypeElements (DICompositeType CTy,
bool HasAux = false;
int ElementAux[PIC16Dbg::AuxSize] = { 0 };
std::string TagName = "";
- std::string ElementName;
- GlobalVariable *GV = Element.getGV();
- DIDerivedType DITy(GV);
- DITy.getName(ElementName);
+ DIDerivedType DITy(Element.getNode());
+ const char *ElementName = DITy.getName();
unsigned short ElementSize = DITy.getSizeInBits()/8;
// Get mangled name for this structure/union element.
- std::string MangMemName = ElementName + UniqueSuffix;
+ std::string MangMemName = ElementName + SuffixNo;
PopulateDebugInfo(DITy, TypeNo, HasAux, ElementAux, TagName);
short Class = 0;
if( CTy.getTag() == dwarf::DW_TAG_union_type)
Class = PIC16Dbg::C_MOU;
else if (CTy.getTag() == dwarf::DW_TAG_structure_type)
Class = PIC16Dbg::C_MOS;
- EmitSymbol(MangMemName, Class, TypeNo, Value);
+ EmitSymbol(MangMemName.c_str(), Class, TypeNo, Value);
if (CTy.getTag() == dwarf::DW_TAG_structure_type)
Value += ElementSize;
if (HasAux)
- EmitAuxEntry(MangMemName, ElementAux, PIC16Dbg::AuxSize, TagName);
+ EmitAuxEntry(MangMemName.c_str(), ElementAux, PIC16Dbg::AuxSize, TagName);
}
}
@@ -317,48 +327,48 @@ void PIC16DbgInfo::EmitCompositeTypeElements (DICompositeType CTy,
/// and union declarations.
///
void PIC16DbgInfo::EmitCompositeTypeDecls(Module &M) {
- for(iplist<GlobalVariable>::iterator I = M.getGlobalList().begin(),
- E = M.getGlobalList().end(); I != E; I++) {
- // Structures and union declaration's debug info has llvm.dbg.composite
- // in its name.
- if(I->getName().find("llvm.dbg.composite") != std::string::npos) {
- GlobalVariable *GV = cast<GlobalVariable >(I);
- DICompositeType CTy(GV);
- if (CTy.getTag() == dwarf::DW_TAG_union_type ||
- CTy.getTag() == dwarf::DW_TAG_structure_type ) {
- std::string name;
- CTy.getName(name);
- std::string DIVar = I->getName();
- // Get the number after llvm.dbg.composite and make UniqueSuffix from
- // it.
- std::string UniqueSuffix = "." + DIVar.substr(18);
- std::string MangledCTyName = name + UniqueSuffix;
- unsigned short size = CTy.getSizeInBits()/8;
- int Aux[PIC16Dbg::AuxSize] = {0};
- // 7th and 8th byte represent size of structure/union.
- Aux[6] = size & 0xff;
- Aux[7] = size >> 8;
- // Emit .def for structure/union tag.
- if( CTy.getTag() == dwarf::DW_TAG_union_type)
- EmitSymbol(MangledCTyName, PIC16Dbg::C_UNTAG);
- else if (CTy.getTag() == dwarf::DW_TAG_structure_type)
- EmitSymbol(MangledCTyName, PIC16Dbg::C_STRTAG);
-
- // Emit auxiliary debug information for structure/union tag.
- EmitAuxEntry(MangledCTyName, Aux, PIC16Dbg::AuxSize);
-
- // Emit members.
- EmitCompositeTypeElements (CTy, UniqueSuffix);
-
- // Emit mangled Symbol for end of structure/union.
- std::string EOSSymbol = ".eos" + UniqueSuffix;
- EmitSymbol(EOSSymbol, PIC16Dbg::C_EOS);
- EmitAuxEntry(EOSSymbol, Aux, PIC16Dbg::AuxSize, MangledCTyName);
- }
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(M);
+ for (DebugInfoFinder::iterator I = DbgFinder.type_begin(),
+ E = DbgFinder.type_end(); I != E; ++I) {
+ DICompositeType CTy(*I);
+ if (CTy.isNull())
+ continue;
+ if (CTy.getTag() == dwarf::DW_TAG_union_type ||
+ CTy.getTag() == dwarf::DW_TAG_structure_type ) {
+ const char *Name = CTy.getName();
+ // Get the number after llvm.dbg.composite and make UniqueSuffix from
+ // it.
+ std::string DIVar = CTy.getNode()->getNameStr();
+ std::string UniqueSuffix = "." + DIVar.substr(18);
+ std::string MangledCTyName = Name + UniqueSuffix;
+ unsigned short size = CTy.getSizeInBits()/8;
+ int Aux[PIC16Dbg::AuxSize] = {0};
+ // 7th and 8th byte represent size of structure/union.
+ Aux[6] = size & 0xff;
+ Aux[7] = size >> 8;
+ // Emit .def for structure/union tag.
+ if( CTy.getTag() == dwarf::DW_TAG_union_type)
+ EmitSymbol(MangledCTyName.c_str(), PIC16Dbg::C_UNTAG);
+ else if (CTy.getTag() == dwarf::DW_TAG_structure_type)
+ EmitSymbol(MangledCTyName.c_str(), PIC16Dbg::C_STRTAG);
+
+ // Emit auxiliary debug information for structure/union tag.
+ EmitAuxEntry(MangledCTyName.c_str(), Aux, PIC16Dbg::AuxSize);
+
+ // Emit members.
+ EmitCompositeTypeElements (CTy, UniqueSuffix);
+
+ // Emit mangled Symbol for end of structure/union.
+ std::string EOSSymbol = ".eos" + UniqueSuffix;
+ EmitSymbol(EOSSymbol.c_str(), PIC16Dbg::C_EOS);
+ EmitAuxEntry(EOSSymbol.c_str(), Aux, PIC16Dbg::AuxSize,
+ MangledCTyName.c_str());
}
}
}
+
/// EmitFunctBeginDI - Emit .bf for function.
///
void PIC16DbgInfo::EmitFunctBeginDI(const Function *F) {
@@ -425,31 +435,26 @@ void PIC16DbgInfo::EmitSymbol(std::string Name, short Class, unsigned short
/// EmitVarDebugInfo - Emit debug information for all variables.
///
void PIC16DbgInfo::EmitVarDebugInfo(Module &M) {
- GlobalVariable *Root = M.getGlobalVariable("llvm.dbg.global_variables");
- if (!Root)
- return;
-
- Constant *RootC = cast<Constant>(*Root->use_begin());
- for (Value::use_iterator UI = RootC->use_begin(), UE = Root->use_end();
- UI != UE; ++UI) {
- for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end();
- UUI != UUE; ++UUI) {
- DIGlobalVariable DIGV(cast<GlobalVariable>(*UUI));
- DIType Ty = DIGV.getType();
- unsigned short TypeNo = 0;
- bool HasAux = false;
- int Aux[PIC16Dbg::AuxSize] = { 0 };
- std::string TagName = "";
- std::string VarName = TAI->getGlobalPrefix()+DIGV.getGlobal()->getName();
- PopulateDebugInfo(Ty, TypeNo, HasAux, Aux, TagName);
- // Emit debug info only if type information is availaible.
- if (TypeNo != PIC16Dbg::T_NULL) {
- O << "\n\t.type " << VarName << ", " << TypeNo;
- short ClassNo = getStorageClass(DIGV);
- O << "\n\t.class " << VarName << ", " << ClassNo;
- if (HasAux)
- EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize, TagName);
- }
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(M);
+
+ for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+ E = DbgFinder.global_variable_end(); I != E; ++I) {
+ DIGlobalVariable DIGV(*I);
+ DIType Ty = DIGV.getType();
+ unsigned short TypeNo = 0;
+ bool HasAux = false;
+ int Aux[PIC16Dbg::AuxSize] = { 0 };
+ std::string TagName = "";
+ std::string VarName = MAI->getGlobalPrefix()+DIGV.getGlobal()->getNameStr();
+ PopulateDebugInfo(Ty, TypeNo, HasAux, Aux, TagName);
+ // Emit debug info only if type information is available.
+ if (TypeNo != PIC16Dbg::T_NULL) {
+ O << "\n\t.type " << VarName << ", " << TypeNo;
+ short ClassNo = getStorageClass(DIGV);
+ O << "\n\t.class " << VarName << ", " << ClassNo;
+ if (HasAux)
+ EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize, TagName);
}
}
O << "\n";
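
For reference, each variable that survives the TypeNo check above yields a directive pair of the form ".type <mangled name>, <TypeNo>" and ".class <mangled name>, <ClassNo>", plus an auxiliary entry via EmitAuxEntry when HasAux is set; the actual numeric operands depend on the debug metadata for that variable.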
@@ -457,12 +462,12 @@ void PIC16DbgInfo::EmitVarDebugInfo(Module &M) {
/// SwitchToCU - Switch to a new compilation unit.
///
-void PIC16DbgInfo::SwitchToCU(GlobalVariable *CU) {
+void PIC16DbgInfo::SwitchToCU(MDNode *CU) {
// Get the file path from CU.
DICompileUnit cu(CU);
- std::string DirName, FileName;
- std::string FilePath = cu.getDirectory(DirName) + "/" +
- cu.getFilename(FileName);
+ std::string DirName = cu.getDirectory();
+ std::string FileName = cu.getFilename();
+ std::string FilePath = DirName + "/" + FileName;
// Nothing to do if source file is still same.
if ( FilePath == CurFile ) return;
diff --git a/lib/Target/PIC16/PIC16DebugInfo.h b/lib/Target/PIC16/PIC16DebugInfo.h
index d126d85..54e27c7 100644
--- a/lib/Target/PIC16/PIC16DebugInfo.h
+++ b/lib/Target/PIC16/PIC16DebugInfo.h
@@ -16,8 +16,6 @@
#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include <map>
namespace llvm {
class MachineFunction;
@@ -90,11 +88,11 @@ namespace llvm {
};
}
- class raw_ostream;
+ class formatted_raw_ostream;
class PIC16DbgInfo {
- raw_ostream &O;
- const TargetAsmInfo *TAI;
+ formatted_raw_ostream &O;
+ const MCAsmInfo *MAI;
std::string CurFile;
unsigned CurLine;
@@ -103,7 +101,8 @@ namespace llvm {
bool EmitDebugDirectives;
public:
- PIC16DbgInfo(raw_ostream &o, const TargetAsmInfo *T) : O(o), TAI(T) {
+ PIC16DbgInfo(formatted_raw_ostream &o, const MCAsmInfo *T)
+ : O(o), MAI(T) {
CurFile = "";
CurLine = 0;
EmitDebugDirectives = false;
@@ -118,7 +117,7 @@ namespace llvm {
private:
- void SwitchToCU (GlobalVariable *CU);
+ void SwitchToCU (MDNode *CU);
void SwitchToLine (unsigned Line, bool IsInBeginFunction = false);
void PopulateDebugInfo (DIType Ty, unsigned short &TypeNo, bool &HasAux,
@@ -144,8 +143,7 @@ namespace llvm {
short getStorageClass(DIGlobalVariable DIGV);
void EmitFunctBeginDI(const Function *F);
void EmitCompositeTypeDecls(Module &M);
- void EmitCompositeTypeElements (DICompositeType CTy,
- std::string UniqueSuffix);
+ void EmitCompositeTypeElements (DICompositeType CTy, std::string Suffix);
void EmitFunctEndDI(const Function *F, unsigned Line);
void EmitAuxEntry(const std::string VarName, int Aux[],
int num = PIC16Dbg::AuxSize, std::string TagName = "");
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
index 6c2b8ec..cc57d12 100644
--- a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
@@ -13,6 +13,8 @@
#define DEBUG_TYPE "pic16-isel"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "PIC16ISelDAGToDAG.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.h b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
index 83abed3..3a2f6b4 100644
--- a/lib/Target/PIC16/PIC16ISelDAGToDAG.h
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
@@ -31,7 +31,7 @@ class VISIBILITY_HIDDEN PIC16DAGToDAGISel : public SelectionDAGISel {
/// PIC16Lowering - This object fully describes how to lower LLVM code to an
/// PIC16-specific SelectionDAG.
- PIC16TargetLowering PIC16Lowering;
+ PIC16TargetLowering &PIC16Lowering;
public:
explicit PIC16DAGToDAGISel(PIC16TargetMachine &tm) :
diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp
index 0d24f61..bf986b1 100644
--- a/lib/Target/PIC16/PIC16ISelLowering.cpp
+++ b/lib/Target/PIC16/PIC16ISelLowering.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "pic16-lower"
-
#include "PIC16ISelLowering.h"
+#include "PIC16TargetObjectFile.h"
#include "PIC16TargetMachine.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalValue.h"
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -30,7 +31,7 @@ using namespace llvm;
static const char *getIntrinsicName(unsigned opcode) {
std::string Basename;
switch(opcode) {
- default: assert (0 && "do not know intrinsic name");
+ default: llvm_unreachable("do not know intrinsic name");
// Arithmetic Right shift for integer types.
case PIC16ISD::SRA_I8: Basename = "sra.i8"; break;
case RTLIB::SRA_I16: Basename = "sra.i16"; break;
@@ -114,22 +115,48 @@ static const char *getIntrinsicName(unsigned opcode) {
std::string Fullname = prefix + tagname + Basename;
// The name has to live through program life.
- char *tmp = new char[Fullname.size() + 1];
- strcpy (tmp, Fullname.c_str());
-
- return tmp;
+ return createESName(Fullname);
+}
+
+// getStdLibCallName - Get the name for the standard library function.
+static const char *getStdLibCallName(unsigned opcode) {
+ std::string BaseName;
+ switch(opcode) {
+ case RTLIB::COS_F32: BaseName = "cos";
+ break;
+ case RTLIB::SIN_F32: BaseName = "sin";
+ break;
+ case RTLIB::MEMCPY: BaseName = "memcpy";
+ break;
+ case RTLIB::MEMSET: BaseName = "memset";
+ break;
+ case RTLIB::MEMMOVE: BaseName = "memmove";
+ break;
+ default: llvm_unreachable("do not know std lib call name");
+ }
+ std::string prefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+ std::string LibCallName = prefix + BaseName;
+
+ // The name has to live through program life.
+ return createESName(LibCallName);
}
// PIC16TargetLowering Constructor.
PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM)
- : TargetLowering(TM), TmpSize(0) {
+ : TargetLowering(TM, new PIC16TargetObjectFile()), TmpSize(0) {
Subtarget = &TM.getSubtarget<PIC16Subtarget>();
addRegisterClass(MVT::i8, PIC16::GPRRegisterClass);
setShiftAmountType(MVT::i8);
- setShiftAmountFlavor(Extend);
+
+ // Std lib call names
+ setLibcallName(RTLIB::COS_F32, getStdLibCallName(RTLIB::COS_F32));
+ setLibcallName(RTLIB::SIN_F32, getStdLibCallName(RTLIB::SIN_F32));
+ setLibcallName(RTLIB::MEMCPY, getStdLibCallName(RTLIB::MEMCPY));
+ setLibcallName(RTLIB::MEMSET, getStdLibCallName(RTLIB::MEMSET));
+ setLibcallName(RTLIB::MEMMOVE, getStdLibCallName(RTLIB::MEMMOVE));
// SRA library call names
setPIC16LibcallName(PIC16ISD::SRA_I8, getIntrinsicName(PIC16ISD::SRA_I8));
@@ -226,6 +253,7 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM)
setOperationAction(ISD::STORE, MVT::i8, Legal);
setOperationAction(ISD::STORE, MVT::i16, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i8, Custom);
setOperationAction(ISD::ADDC, MVT::i8, Custom);
@@ -240,46 +268,27 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM)
setOperationAction(ISD::XOR, MVT::i8, Custom);
setOperationAction(ISD::FrameIndex, MVT::i16, Custom);
- setOperationAction(ISD::CALL, MVT::i16, Custom);
- setOperationAction(ISD::RET, MVT::Other, Custom);
- setOperationAction(ISD::MUL, MVT::i8, Custom);
- setOperationAction(ISD::MUL, MVT::i16, Expand);
- setOperationAction(ISD::MUL, MVT::i32, Expand);
+ setOperationAction(ISD::MUL, MVT::i8, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::MULHU, MVT::i8, Expand);
- setOperationAction(ISD::MULHU, MVT::i16, Expand);
- setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i8, Expand);
- setOperationAction(ISD::MULHS, MVT::i16, Expand);
- setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::SRA, MVT::i8, Custom);
- setOperationAction(ISD::SRA, MVT::i16, Expand);
- setOperationAction(ISD::SRA, MVT::i32, Expand);
setOperationAction(ISD::SHL, MVT::i8, Custom);
- setOperationAction(ISD::SHL, MVT::i16, Expand);
- setOperationAction(ISD::SHL, MVT::i32, Expand);
setOperationAction(ISD::SRL, MVT::i8, Custom);
- setOperationAction(ISD::SRL, MVT::i16, Expand);
- setOperationAction(ISD::SRL, MVT::i32, Expand);
+
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// PIC16 does not support shift parts
- setOperationAction(ISD::SRA_PARTS, MVT::i8, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i8, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i8, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i8, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
// PIC16 does not have a SETCC, expand it to SELECT_CC.
@@ -356,7 +365,8 @@ static void PopulateResults(SDValue N, SmallVectorImpl<SDValue>&Results) {
Results.push_back(N);
}
-MVT PIC16TargetLowering::getSetCCResultType(MVT ValType) const {
+MVT::SimpleValueType
+PIC16TargetLowering::getSetCCResultType(EVT ValType) const {
return MVT::i8;
}
@@ -379,7 +389,7 @@ PIC16TargetLowering::getPIC16LibcallName(PIC16ISD::PIC16Libcall Call) {
SDValue
PIC16TargetLowering::MakePIC16Libcall(PIC16ISD::PIC16Libcall Call,
- MVT RetVT, const SDValue *Ops,
+ EVT RetVT, const SDValue *Ops,
unsigned NumOps, bool isSigned,
SelectionDAG &DAG, DebugLoc dl) {
@@ -389,17 +399,20 @@ PIC16TargetLowering::MakePIC16Libcall(PIC16ISD::PIC16Libcall Call,
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0; i != NumOps; ++i) {
Entry.Node = Ops[i];
- Entry.Ty = Entry.Node.getValueType().getTypeForMVT();
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
Args.push_back(Entry);
}
- SDValue Callee = DAG.getExternalSymbol(getPIC16LibcallName(Call), MVT::i8);
- const Type *RetTy = RetVT.getTypeForMVT();
+ SDValue Callee = DAG.getExternalSymbol(getPIC16LibcallName(Call), MVT::i16);
+
+ const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
std::pair<SDValue,SDValue> CallInfo =
LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
- false, 0, CallingConv::C, false, Callee, Args, DAG, dl);
+ false, 0, CallingConv::C, false,
+ /*isReturnValueUsed=*/true,
+ Callee, Args, DAG, dl);
return CallInfo.first;
}
@@ -429,6 +442,7 @@ const char *PIC16TargetLowering::getTargetNodeName(unsigned Opcode) const {
case PIC16ISD::SUBCC: return "PIC16ISD::SUBCC";
case PIC16ISD::SELECT_ICC: return "PIC16ISD::SELECT_ICC";
case PIC16ISD::BRCOND: return "PIC16ISD::BRCOND";
+ case PIC16ISD::RET: return "PIC16ISD::RET";
case PIC16ISD::Dummy: return "PIC16ISD::Dummy";
}
}
@@ -502,7 +516,7 @@ SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) {
SDValue Chain = St->getChain();
SDValue Src = St->getValue();
SDValue Ptr = St->getBasePtr();
- MVT ValueType = Src.getValueType();
+ EVT ValueType = Src.getValueType();
unsigned StoreOffset = 0;
DebugLoc dl = N->getDebugLoc();
@@ -519,6 +533,10 @@ SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) {
SDValue SrcLo, SrcHi;
GetExpandedParts(Src, DAG, SrcLo, SrcHi);
SDValue ChainLo = Chain, ChainHi = Chain;
+ // FIXME: This makes unsafe assumptions. The Chain may be a TokenFactor
+ // created for an unrelated purpose, in which case it may not have
+ // exactly two operands. Also, even if it does have two operands, they
+ // may not be the low and high parts of an aligned load that was split.
if (Chain.getOpcode() == ISD::TokenFactor) {
ChainLo = Chain.getOperand(0);
ChainHi = Chain.getOperand(1);
@@ -546,16 +564,19 @@ SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) {
GetExpandedParts(SrcHi, DAG, SrcHi1, SrcHi2);
SDValue ChainLo = Chain, ChainHi = Chain;
+ // FIXME: This makes unsafe assumptions; see the FIXME above.
if (Chain.getOpcode() == ISD::TokenFactor) {
ChainLo = Chain.getOperand(0);
ChainHi = Chain.getOperand(1);
}
SDValue ChainLo1 = ChainLo, ChainLo2 = ChainLo, ChainHi1 = ChainHi,
ChainHi2 = ChainHi;
+ // FIXME: This makes unsafe assumptions; see the FIXME above.
if (ChainLo.getOpcode() == ISD::TokenFactor) {
ChainLo1 = ChainLo.getOperand(0);
ChainLo2 = ChainLo.getOperand(1);
}
+ // FIXME: This makes unsafe assumptions; see the FIXME above.
if (ChainHi.getOpcode() == ISD::TokenFactor) {
ChainHi1 = ChainHi.getOperand(0);
ChainHi2 = ChainHi.getOperand(1);
@@ -583,8 +604,26 @@ SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) {
getChain(Store3), getChain(Store4));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, RetLo, RetHi);
- }
- else {
+ } else if (ValueType == MVT::i64) {
+ SDValue SrcLo, SrcHi;
+ GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+ SDValue ChainLo = Chain, ChainHi = Chain;
+ // FIXME: This makes unsafe assumptions; see the FIXME above.
+ if (Chain.getOpcode() == ISD::TokenFactor) {
+ ChainLo = Chain.getOperand(0);
+ ChainHi = Chain.getOperand(1);
+ }
+ SDValue Store1 = DAG.getStore(ChainLo, dl, SrcLo, Ptr, NULL,
+ 0 + StoreOffset);
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(4, Ptr.getValueType()));
+ SDValue Store2 = DAG.getStore(ChainHi, dl, SrcHi, Ptr, NULL,
+ 1 + StoreOffset);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1,
+ Store2);
+ } else {
assert (0 && "value type not supported");
return SDValue();
}
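
One way the FIXMEs in this function could be tightened, sketched here as an untested illustration rather than as this patch's actual fix, is to peel the chain apart only when the TokenFactor has the expected two-operand shape and otherwise reuse the unsplit chain:

    // Hypothetical guard (not part of this patch):
    SDValue ChainLo = Chain, ChainHi = Chain;
    if (Chain.getOpcode() == ISD::TokenFactor && Chain.getNumOperands() == 2) {
      ChainLo = Chain.getOperand(0);
      ChainHi = Chain.getOperand(1);
    }

This still cannot tell whether the two operands really are the low and high halves of a split load, so the underlying FIXME would remain.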
@@ -660,7 +699,7 @@ void PIC16TargetLowering::GetExpandedParts(SDValue Op, SelectionDAG &DAG,
SDValue &Lo, SDValue &Hi) {
SDNode *N = Op.getNode();
DebugLoc dl = N->getDebugLoc();
- MVT NewVT = getTypeToTransformTo(N->getValueType(0));
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
// Extract the lo component.
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NewVT, Op,
@@ -808,7 +847,7 @@ SDValue PIC16TargetLowering::ExpandLoad(SDNode *N, SelectionDAG &DAG) {
SDValue Load, Offset;
SDVTList Tys;
- MVT VT, NewVT;
+ EVT VT, NewVT;
SDValue PtrLo, PtrHi;
unsigned LoadOffset;
@@ -821,7 +860,7 @@ SDValue PIC16TargetLowering::ExpandLoad(SDNode *N, SelectionDAG &DAG) {
unsigned NumLoads = VT.getSizeInBits() / 8;
std::vector<SDValue> PICLoads;
unsigned iter;
- MVT MemVT = LD->getMemoryVT();
+ EVT MemVT = LD->getMemoryVT();
if(ISD::isNON_EXTLoad(N)) {
for (iter=0; iter<NumLoads ; ++iter) {
// Add the pointer offset if any
@@ -839,7 +878,7 @@ SDValue PIC16TargetLowering::ExpandLoad(SDNode *N, SelectionDAG &DAG) {
// For extended loads this is the memory value type
// i.e. without any extension
- MVT MemVT = LD->getMemoryVT();
+ EVT MemVT = LD->getMemoryVT();
unsigned MemBytes = MemVT.getSizeInBits() / 8;
// if MVT::i1 is extended to MVT::i8 then MemBytes will be zero,
// so set it to one.
@@ -945,6 +984,19 @@ SDValue PIC16TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
return Call;
}
+SDValue PIC16TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
+ // We should have handled larger operands in type legalizer itself.
+ assert (Op.getValueType() == MVT::i8 && "illegal multiply to lower");
+
+ SDNode *N = Op.getNode();
+ SmallVector<SDValue, 2> Ops(2);
+ Ops[0] = N->getOperand(0);
+ Ops[1] = N->getOperand(1);
+ SDValue Call = MakePIC16Libcall(PIC16ISD::MUL_I8, N->getValueType(0),
+ &Ops[0], 2, true, DAG, N->getDebugLoc());
+ return Call;
+}
+
void
PIC16TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue>&Results,
@@ -953,12 +1005,8 @@ PIC16TargetLowering::LowerOperationWrapper(SDNode *N,
SDValue Res;
unsigned i;
switch (Op.getOpcode()) {
- case ISD::FORMAL_ARGUMENTS:
- Res = LowerFORMAL_ARGUMENTS(Op, DAG); break;
case ISD::LOAD:
Res = ExpandLoad(Op.getNode(), DAG); break;
- case ISD::CALL:
- Res = LowerCALL(Op, DAG); break;
default: {
// All other operations are handled in LowerOperation.
Res = LowerOperation(Op, DAG);
@@ -978,8 +1026,6 @@ PIC16TargetLowering::LowerOperationWrapper(SDNode *N,
SDValue PIC16TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- case ISD::FORMAL_ARGUMENTS:
- return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE:
@@ -992,6 +1038,8 @@ SDValue PIC16TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
return ExpandLoad(Op.getNode(), DAG);
case ISD::STORE:
return ExpandStore(Op.getNode(), DAG);
+ case ISD::MUL:
+ return LowerMUL(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -1000,10 +1048,6 @@ SDValue PIC16TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::AND:
case ISD::XOR:
return LowerBinOp(Op, DAG);
- case ISD::CALL:
- return LowerCALL(Op, DAG);
- case ISD::RET:
- return LowerRET(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
case ISD::SELECT_CC:
@@ -1048,12 +1092,12 @@ SDValue PIC16TargetLowering::ConvertToMemOperand(SDValue Op,
}
SDValue PIC16TargetLowering::
-LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
+LowerIndirectCallArguments(SDValue Chain, SDValue InFlag,
SDValue DataAddr_Lo, SDValue DataAddr_Hi,
- SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- unsigned NumOps = TheCall->getNumArgs();
- DebugLoc dl = TheCall->getDebugLoc();
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG) {
+ unsigned NumOps = Outs.size();
// If call has no arguments then do nothing and return.
if (NumOps == 0)
@@ -1064,10 +1108,10 @@ LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
SDValue Arg, StoreRet;
// For PIC16 ABI the arguments come after the return value.
- unsigned RetVals = TheCall->getNumRetVals();
+ unsigned RetVals = Ins.size();
for (unsigned i = 0, ArgOffset = RetVals; i < NumOps; i++) {
// Get the arguments
- Arg = TheCall->getArg(i);
+ Arg = Outs[i].Val;
Ops.clear();
Ops.push_back(Chain);
@@ -1087,16 +1131,14 @@ LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
}
SDValue PIC16TargetLowering::
-LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue ArgLabel,
- SDValue InFlag, SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- unsigned NumOps = TheCall->getNumArgs();
- DebugLoc dl = TheCall->getDebugLoc();
+LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+ unsigned NumOps = Outs.size();
std::string Name;
SDValue Arg, StoreAt;
- MVT ArgVT;
+ EVT ArgVT;
unsigned Size=0;
- unsigned ArgCount=0;
// If call has no arguments then do nothing and return.
if (NumOps == 0)
@@ -1114,9 +1156,9 @@ LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue ArgLabel,
std::vector<SDValue> Ops;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
- for (unsigned i=ArgCount, Offset = 0; i<NumOps; i++) {
+ for (unsigned i=0, Offset = 0; i<NumOps; i++) {
// Get the argument
- Arg = TheCall->getArg(i);
+ Arg = Outs[i].Val;
StoreOffset = (Offset + AddressOffset);
// Store the argument on frame
@@ -1144,12 +1186,12 @@ LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue ArgLabel,
}
SDValue PIC16TargetLowering::
-LowerIndirectCallReturn (SDValue Op, SDValue Chain, SDValue InFlag,
- SDValue DataAddr_Lo, SDValue DataAddr_Hi,
- SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- DebugLoc dl = TheCall->getDebugLoc();
- unsigned RetVals = TheCall->getNumRetVals();
+LowerIndirectCallReturn(SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ unsigned RetVals = Ins.size();
// If call does not have anything to return
// then do nothing and go back.
@@ -1157,7 +1199,6 @@ LowerIndirectCallReturn (SDValue Op, SDValue Chain, SDValue InFlag,
return Chain;
// Call has something to return
- std::vector<SDValue> ResultVals;
SDValue LoadRet;
SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
@@ -1167,23 +1208,20 @@ LowerIndirectCallReturn (SDValue Op, SDValue Chain, SDValue InFlag,
InFlag);
InFlag = getOutFlag(LoadRet);
Chain = getChain(LoadRet);
- ResultVals.push_back(LoadRet);
+ InVals.push_back(LoadRet);
}
- ResultVals.push_back(Chain);
- SDValue Res = DAG.getMergeValues(&ResultVals[0], ResultVals.size(), dl);
- return Res;
+ return Chain;
}
SDValue PIC16TargetLowering::
-LowerDirectCallReturn(SDValue Op, SDValue Chain, SDValue RetLabel,
- SDValue InFlag, SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- DebugLoc dl = TheCall->getDebugLoc();
+LowerDirectCallReturn(SDValue RetLabel, SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
// Currently handling primitive types only. They will come in
// i8 parts
- unsigned RetVals = TheCall->getNumRetVals();
-
- std::vector<SDValue> ResultVals;
+ unsigned RetVals = Ins.size();
// Return immediately if the return type is void
if (RetVals == 0)
@@ -1209,30 +1247,20 @@ LowerDirectCallReturn(SDValue Op, SDValue Chain, SDValue RetLabel,
Chain = getChain(LoadRet);
Offset++;
- ResultVals.push_back(LoadRet);
+ InVals.push_back(LoadRet);
}
- // To return use MERGE_VALUES
- ResultVals.push_back(Chain);
- SDValue Res = DAG.getMergeValues(&ResultVals[0], ResultVals.size(), dl);
- return Res;
+ return Chain;
}
-SDValue PIC16TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
- SDValue Chain = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
-
- if (Op.getNumOperands() == 1) // return void
- return Op;
+SDValue
+PIC16TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
- // return should have odd number of operands
- if ((Op.getNumOperands() % 2) == 0 ) {
- assert(0 && "Do not know how to return this many arguments!");
- abort();
- }
-
// Number of values to return
- unsigned NumRet = (Op.getNumOperands() / 2);
+ unsigned NumRet = Outs.size();
// Function returns value always on stack with the offset starting
// from 0
@@ -1246,68 +1274,13 @@ SDValue PIC16TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
SDValue BS = DAG.getConstant(1, MVT::i8);
SDValue RetVal;
for(unsigned i=0;i<NumRet; ++i) {
- RetVal = Op.getNode()->getOperand(2*i + 1);
+ RetVal = Outs[i].Val;
Chain = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain, RetVal,
ES, BS,
DAG.getConstant (i, MVT::i8));
}
- return DAG.getNode(ISD::RET, dl, MVT::Other, Chain);
-}
-
-// CALL node may have some operands non-legal to PIC16. Generate new CALL
-// node with all the operands legal.
-// Currently only Callee operand of the CALL node is non-legal. This function
-// legalizes the Callee operand and uses all other operands as are to generate
-// new CALL node.
-
-SDValue PIC16TargetLowering::LegalizeCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- DebugLoc dl = TheCall->getDebugLoc();
- unsigned i =0;
-
- assert(Callee.getValueType() == MVT::i16 &&
- "Don't know how to legalize this call node!!!");
- assert(Callee.getOpcode() == ISD::BUILD_PAIR &&
- "Don't know how to legalize this call node!!!");
-
- if (isDirectAddress(Callee)) {
- // Come here for direct calls
- Callee = Callee.getOperand(0).getOperand(0);
- } else {
- // Come here for indirect calls
- SDValue Lo, Hi;
- // Indirect addresses. Get the hi and lo parts of ptr.
- GetExpandedParts(Callee, DAG, Lo, Hi);
- // Connect Lo and Hi parts of the callee with the PIC16Connect
- Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Lo, Hi);
- }
- std::vector<SDValue> Ops;
- Ops.push_back(Chain);
- Ops.push_back(Callee);
-
- // Add the call arguments and their flags
- unsigned NumArgs = TheCall->getNumArgs();
- for(i=0;i<NumArgs;i++) {
- Ops.push_back(TheCall->getArg(i));
- Ops.push_back(TheCall->getArgFlagsVal(i));
- }
- std::vector<MVT> NodeTys;
- unsigned NumRets = TheCall->getNumRetVals();
- for(i=0;i<NumRets;i++)
- NodeTys.push_back(TheCall->getRetValType(i));
-
- // Return a Chain as well
- NodeTys.push_back(MVT::Other);
-
- SDVTList VTs = DAG.getVTList(&NodeTys[0], NodeTys.size());
- // Generate new call with all the operands legal
- return DAG.getCall(TheCall->getCallingConv(), dl,
- TheCall->isVarArg(), TheCall->isTailCall(),
- TheCall->isInreg(), VTs, &Ops[0], Ops.size(),
- TheCall->getNumFixedArgs());
+ return DAG.getNode(PIC16ISD::RET, dl, MVT::Other, Chain);
}
void PIC16TargetLowering::
@@ -1372,36 +1345,40 @@ GetDataAddress(DebugLoc dl, SDValue Callee, SDValue &Chain,
DataAddr_Hi = DAG.getNode(PIC16ISD::MTHI, dl, MVT::i8, Call, OperFlag);
}
+SDValue
+PIC16TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
-SDValue PIC16TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = dyn_cast<CallSDNode>(Op);
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- DebugLoc dl = TheCall->getDebugLoc();
- if (Callee.getValueType() == MVT::i16 &&
- Callee.getOpcode() == ISD::BUILD_PAIR) {
- // Control should come here only from TypeLegalizer for lowering
-
- // Legalize the non-legal arguments of call and return the
- // new call with legal arguments.
- return LegalizeCALL(Op, DAG);
- }
- // Control should come here from Legalize DAG.
- // Here all the operands of CALL node should be legal.
-
- // If this is an indirect call then to pass the arguments
- // and read the return value back, we need the data address
- // of the function being called.
- // To get the data address two more calls need to be made.
+ assert(Callee.getValueType() == MVT::i16 &&
+ "Don't know how to legalize this call node!!!");
// The flag to track if this is a direct or indirect call.
bool IsDirectCall = true;
- unsigned RetVals = TheCall->getNumRetVals();
- unsigned NumArgs = TheCall->getNumArgs();
+ unsigned RetVals = Ins.size();
+ unsigned NumArgs = Outs.size();
SDValue DataAddr_Lo, DataAddr_Hi;
- if (Callee.getOpcode() == PIC16ISD::PIC16Connect) {
+ if (!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) {
IsDirectCall = false; // This is indirect call
+
+ // If this is an indirect call then to pass the arguments
+ // and read the return value back, we need the data address
+ // of the function being called.
+ // To get the data address two more calls need to be made.
+
+ // Come here for indirect calls
+ SDValue Lo, Hi;
+ // Indirect addresses. Get the hi and lo parts of ptr.
+ GetExpandedParts(Callee, DAG, Lo, Hi);
+ // Connect Lo and Hi parts of the callee with the PIC16Connect
+ Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Lo, Hi);
+
// Read DataAddress only if we have to pass arguments or
// read return value.
if ((RetVals > 0) || (NumArgs > 0))
@@ -1457,12 +1434,13 @@ SDValue PIC16TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Pass the argument to function before making the call.
SDValue CallArgs;
if (IsDirectCall) {
- CallArgs = LowerDirectCallArguments(Op, Chain, ArgLabel, OperFlag, DAG);
+ CallArgs = LowerDirectCallArguments(ArgLabel, Chain, OperFlag,
+ Outs, dl, DAG);
Chain = getChain(CallArgs);
OperFlag = getOutFlag(CallArgs);
} else {
- CallArgs = LowerIndirectCallArguments(Op, Chain, OperFlag, DataAddr_Lo,
- DataAddr_Hi, DAG);
+ CallArgs = LowerIndirectCallArguments(Chain, OperFlag, DataAddr_Lo,
+ DataAddr_Hi, Outs, Ins, dl, DAG);
Chain = getChain(CallArgs);
OperFlag = getOutFlag(CallArgs);
}
@@ -1483,10 +1461,11 @@ SDValue PIC16TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Lower the return value reading after the call.
if (IsDirectCall)
- return LowerDirectCallReturn(Op, Chain, RetLabel, OperFlag, DAG);
+ return LowerDirectCallReturn(RetLabel, Chain, OperFlag,
+ Ins, dl, DAG, InVals);
else
- return LowerIndirectCallReturn(Op, Chain, OperFlag, DataAddr_Lo,
- DataAddr_Hi, DAG);
+ return LowerIndirectCallReturn(Chain, OperFlag, DataAddr_Lo,
+ DataAddr_Hi, Ins, dl, DAG, InVals);
}
bool PIC16TargetLowering::isDirectLoad(const SDValue Op) {
@@ -1591,11 +1570,20 @@ SDValue PIC16TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) {
SDValue NewVal = ConvertToMemOperand (Op.getOperand(0), DAG, dl);
SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Flag);
- if (Op.getOpcode() == ISD::SUBE)
- return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1),
- Op.getOperand(2));
- else
- return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1));
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Opcode unknown.");
+ case ISD::SUBE:
+ return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1),
+ Op.getOperand(2));
+ break;
+ case ISD::SUBC:
+ return DAG.getNode(Op.getOpcode(), dl, Tys, NewVal, Op.getOperand(1));
+ break;
+ case ISD::SUB:
+ return DAG.getNode(Op.getOpcode(), dl, MVT::i8, NewVal, Op.getOperand(1));
+ break;
+ }
}
void PIC16TargetLowering::InitReservedFrameCount(const Function *F) {
@@ -1609,17 +1597,19 @@ void PIC16TargetLowering::InitReservedFrameCount(const Function *F) {
ReservedFrameCount = NumArgs + 1;
}
-// LowerFORMAL_ARGUMENTS - Argument values are loaded from the
+// LowerFormalArguments - Argument values are loaded from the
// <fname>.args + offset. All arguments are already broken into legalized
// types, so the offset just runs from 0 to NumArgVals - 1.
-SDValue PIC16TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
- SelectionDAG &DAG) {
- SmallVector<SDValue, 8> ArgValues;
- unsigned NumArgVals = Op.getNode()->getNumValues() - 1;
- DebugLoc dl = Op.getDebugLoc();
- SDValue Chain = Op.getOperand(0); // Formal arguments' chain
-
+SDValue
+PIC16TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ unsigned NumArgVals = Ins.size();
// Get the callee's name to create the <fname>.args label to pass args.
MachineFunction &MF = DAG.getMachineFunction();
@@ -1643,13 +1633,10 @@ SDValue PIC16TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op,
SDValue PICLoad = DAG.getNode(PIC16ISD::PIC16LdArg, dl, VTs, Chain, ES, BS,
Offset);
Chain = getChain(PICLoad);
- ArgValues.push_back(PICLoad);
+ InVals.push_back(PICLoad);
}
- // Return a MERGE_VALUE node.
- ArgValues.push_back(Op.getOperand(0));
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
// Perform DAGCombine of PIC16Load.
@@ -1697,7 +1684,7 @@ SDValue PIC16TargetLowering::PerformDAGCombine(SDNode *N,
static PIC16CC::CondCodes IntCCToPIC16CC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code!");
+ default: llvm_unreachable("Unknown condition code!");
case ISD::SETNE: return PIC16CC::NE;
case ISD::SETEQ: return PIC16CC::EQ;
case ISD::SETGT: return PIC16CC::GT;
@@ -1826,7 +1813,8 @@ SDValue PIC16TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
MachineBasicBlock *
PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
unsigned CC = (PIC16CC::CondCodes)MI->getOperand(3).getImm();
DebugLoc dl = MI->getDebugLoc();
@@ -1852,9 +1840,18 @@ PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
- // Update machine-CFG edges by transferring all successors of the current
+ // Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- sinkMBB->transferSuccessors(BB);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
+ // Next, remove all successors of the current block, so that the true
+ // and fallthrough blocks can be added back as the only successors.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
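
For orientation, the inserter is building the usual select diamond; copy0MBB's fallthrough edge into sinkMBB is established elsewhere in this function (sketch, not code from the patch):

    //        BB (compare + conditional branch)
    //        /                      \
    //   copy0MBB               (taken edge)
    //        \                      /
    //             sinkMBB (phi)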
diff --git a/lib/Target/PIC16/PIC16ISelLowering.h b/lib/Target/PIC16/PIC16ISelLowering.h
index b40ea12..286ed24 100644
--- a/lib/Target/PIC16/PIC16ISelLowering.h
+++ b/lib/Target/PIC16/PIC16ISelLowering.h
@@ -52,6 +52,7 @@ namespace llvm {
SUBCC, // Compare for equality or inequality.
SELECT_ICC, // Pseudo to be caught in scheduler and expanded to brcond.
BRCOND, // Conditional branch.
+ RET, // Return.
Dummy
};
@@ -81,39 +82,45 @@ namespace llvm {
/// DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual MVT getSetCCResultType(MVT ValType) const;
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ virtual MVT::SimpleValueType getSetCCResultType(EVT ValType) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG);
SDValue LowerADD(SDValue Op, SelectionDAG &DAG);
SDValue LowerSUB(SDValue Op, SelectionDAG &DAG);
SDValue LowerBinOp(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
// Call returns
SDValue
- LowerDirectCallReturn(SDValue Op, SDValue Chain, SDValue FrameAddress,
- SDValue InFlag, SelectionDAG &DAG);
+ LowerDirectCallReturn(SDValue RetLabel, SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
SDValue
- LowerIndirectCallReturn(SDValue Op, SDValue Chain, SDValue InFlag,
- SDValue DataAddr_Lo, SDValue DataAddr_Hi,
- SelectionDAG &DAG);
+ LowerIndirectCallReturn(SDValue Chain, SDValue InFlag,
+ SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
// Call arguments
SDValue
- LowerDirectCallArguments(SDValue Op, SDValue Chain, SDValue FrameAddress,
- SDValue InFlag, SelectionDAG &DAG);
+ LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
SDValue
- LowerIndirectCallArguments(SDValue Op, SDValue Chain, SDValue InFlag,
+ LowerIndirectCallArguments(SDValue Chain, SDValue InFlag,
SDValue DataAddr_Lo, SDValue DataAddr_Hi,
- SelectionDAG &DAG);
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG);
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
SDValue getPIC16Cmp(SDValue LHS, SDValue RHS, unsigned OrigCC, SDValue &CC,
SelectionDAG &DAG, DebugLoc dl);
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
@@ -124,6 +131,28 @@ namespace llvm {
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG);
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
SDValue ExpandStore(SDNode *N, SelectionDAG &DAG);
SDValue ExpandLoad(SDNode *N, SelectionDAG &DAG);
SDValue ExpandGlobalAddress(SDNode *N, SelectionDAG &DAG);
@@ -174,12 +203,6 @@ namespace llvm {
void LegalizeFrameIndex(SDValue Op, SelectionDAG &DAG, SDValue &ES,
int &Offset);
-
- // CALL node should have all legal operands only. Legalize all non-legal
- // operands of CALL node and then return the new call will all operands
- // legal.
- SDValue LegalizeCALL(SDValue Op, SelectionDAG &DAG);
-
// For indirect calls, the data address of the callee frame needs to be
// extracted. This function fills the arguments DataAddr_Lo and
// DataAddr_Hi with the address of the callee frame.
@@ -209,7 +232,7 @@ namespace llvm {
const char *getPIC16LibcallName(PIC16ISD::PIC16Libcall Call);
// Make PIC16 Libcall.
- SDValue MakePIC16Libcall(PIC16ISD::PIC16Libcall Call, MVT RetVT,
+ SDValue MakePIC16Libcall(PIC16ISD::PIC16Libcall Call, EVT RetVT,
const SDValue *Ops, unsigned NumOps, bool isSigned,
SelectionDAG &DAG, DebugLoc dl);
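
Editorially, the hunk above replaces the old LowerRET/LowerCALL custom lowering with the target-independent LowerFormalArguments/LowerCall/LowerReturn hooks. A minimal sketch of how the new LowerReturn hook pairs with the PIC16ISD::RET node declared above — illustrative only, since the real body (in PIC16ISelLowering.cpp, not part of this hunk) first copies the values in Outs into the function's frame section:

    // Sketch under the new hook API; assumes return values have already
    // been stored to the function frame, leaving only the RET node.
    SDValue PIC16TargetLowering::
    LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                DebugLoc dl, SelectionDAG &DAG) {
      return DAG.getNode(PIC16ISD::RET, dl, MVT::Other, Chain);
    }
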
diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp
index 8418423..cb0c41b 100644
--- a/lib/Target/PIC16/PIC16InstrInfo.cpp
+++ b/lib/Target/PIC16/PIC16InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cstdio>
@@ -104,7 +105,7 @@ void PIC16InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(1); // Emit banksel for it.
}
else
- assert(0 && "Can't store this register to stack slot");
+ llvm_unreachable("Can't store this register to stack slot");
}
void PIC16InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -144,7 +145,7 @@ void PIC16InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(1); // Emit banksel for it.
}
else
- assert(0 && "Can't load this register from stack slot");
+ llvm_unreachable("Can't load this register from stack slot");
}
bool PIC16InstrInfo::copyRegToReg (MachineBasicBlock &MBB,
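
The assert(0 && ...) to llvm_unreachable(...) conversions in this file (and throughout the commit) use the then-new llvm/Support/ErrorHandling.h helper: unlike a plain assert, it still reports and aborts in NDEBUG builds and marks the path as dead for the optimizer. A minimal sketch of the idiom:

    #include "llvm/Support/ErrorHandling.h"

    static int lookupKind(int Kind) {
      switch (Kind) {
      case 0: return 1;
      case 1: return 2;
      }
      // Not compiled away in release builds, unlike assert(0 && "...").
      llvm_unreachable("unexpected kind");
    }
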
diff --git a/lib/Target/PIC16/PIC16InstrInfo.td b/lib/Target/PIC16/PIC16InstrInfo.td
index a054bdc..250ca0a 100644
--- a/lib/Target/PIC16/PIC16InstrInfo.td
+++ b/lib/Target/PIC16/PIC16InstrInfo.td
@@ -115,6 +115,8 @@ def PIC16Brcond : SDNode<"PIC16ISD::BRCOND", SDT_PIC16Brcond,
def PIC16Selecticc : SDNode<"PIC16ISD::SELECT_ICC", SDT_PIC16Selecticc,
[SDNPInFlag]>;
+def PIC16ret : SDNode<"PIC16ISD::RET", SDTNone, [SDNPHasChain]>;
+
//===----------------------------------------------------------------------===//
// PIC16 Operand Definitions.
//===----------------------------------------------------------------------===//
@@ -375,8 +377,9 @@ def subfw_2: SUBFW<0, "subwf", subc>;
let Uses = [STATUS] in
def subfwb: SUBFW<0, "subwfb", sube>; // With Borrow.
-def subfw_cc: SUBFW<0, "subwf", PIC16Subcc>;
}
+let Defs = [STATUS], isTerminator = 1 in
+def subfw_cc: SUBFW<0, "subwf", PIC16Subcc>;
// [F] -= W ;
let mayStore = 1 in
@@ -425,8 +428,9 @@ class SUBLW<bits<6> opcode, SDNode OpNode> :
let Defs = [STATUS] in {
def sublw_1 : SUBLW<0, sub>;
def sublw_2 : SUBLW<0, subc>;
-def sublw_cc : SUBLW<0, PIC16Subcc>;
}
+let Defs = [STATUS], isTerminator = 1 in
+def sublw_cc : SUBLW<0, PIC16Subcc>;
// Call instruction.
let isCall = 1,
@@ -489,8 +493,9 @@ def pagesel :
// Return insn.
+let isTerminator = 1, isBarrier = 1, isReturn = 1 in
def Return :
- ControlFormat<0, (outs), (ins), "return", [(ret)]>;
+ ControlFormat<0, (outs), (ins), "return", [(PIC16ret)]>;
//===----------------------------------------------------------------------===//
// PIC16 Replacement Patterns.
diff --git a/lib/Target/PIC16/PIC16MCAsmInfo.cpp b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
new file mode 100644
index 0000000..a17d1a8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
@@ -0,0 +1,58 @@
+//===-- PIC16MCAsmInfo.cpp - PIC16 asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PIC16MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16MCAsmInfo.h"
+
+// FIXME: Layering violation to get enums and static functions; these should
+// be moved to separate headers.
+#include "PIC16.h"
+#include "PIC16ISelLowering.h"
+using namespace llvm;
+
+PIC16MCAsmInfo::PIC16MCAsmInfo(const Target &T, const StringRef &TT) {
+ CommentString = ";";
+ GlobalPrefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+ GlobalDirective = "\tglobal\t";
+ ExternDirective = "\textern\t";
+
+ Data8bitsDirective = " db ";
+ Data16bitsDirective = " dw ";
+ Data32bitsDirective = " dl ";
+ Data64bitsDirective = NULL;
+ ZeroDirective = NULL;
+ AsciiDirective = " dt ";
+ AscizDirective = NULL;
+
+ RomData8bitsDirective = " dw ";
+ RomData16bitsDirective = " rom_di ";
+ RomData32bitsDirective = " rom_dl ";
+
+
+  // Set it to false because we need to generate the c file name and not the
+  // bc file name.
+ HasSingleParameterDotFile = false;
+}
+
+const char *PIC16MCAsmInfo::getDataASDirective(unsigned Size,
+ unsigned AS) const {
+ if (AS != PIC16ISD::ROM_SPACE)
+ return 0;
+
+ switch (Size) {
+ case 8: return RomData8bitsDirective;
+ case 16: return RomData16bitsDirective;
+ case 32: return RomData32bitsDirective;
+ default: return NULL;
+ }
+}
+
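A hedged sketch of how a caller might reach the getDataASDirective hook defined above (emitData16 is a placeholder name, not part of the patch); for ROM_SPACE the PIC16 override yields " rom_di ", while other address spaces fall back to the plain 16-bit directive:

    #include "llvm/MC/MCAsmInfo.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Illustrative only: emit one 16-bit datum, letting the address
    // space select the directive.
    static void emitData16(raw_ostream &O, const MCAsmInfo &MAI,
                           unsigned Value, unsigned AddrSpace) {
      const char *Dir = MAI.getDataASDirective(16, AddrSpace);
      if (!Dir)                              // hook declines this space
        Dir = MAI.getData16bitsDirective();  // default RAM directive
      O << Dir << Value << '\n';
    }
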
diff --git a/lib/Target/PIC16/PIC16MCAsmInfo.h b/lib/Target/PIC16/PIC16MCAsmInfo.h
new file mode 100644
index 0000000..e84db85
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MCAsmInfo.h
@@ -0,0 +1,35 @@
+//=====-- PIC16MCAsmInfo.h - PIC16 asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the PIC16MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16TARGETASMINFO_H
+#define PIC16TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ class PIC16MCAsmInfo : public MCAsmInfo {
+ const char *RomData8bitsDirective;
+ const char *RomData16bitsDirective;
+ const char *RomData32bitsDirective;
+ public:
+ PIC16MCAsmInfo(const Target &T, const StringRef &TT);
+
+ virtual const char *getDataASDirective(unsigned size, unsigned AS) const;
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp
index 43d47ae..c9ebb57 100644
--- a/lib/Target/PIC16/PIC16MemSelOpt.cpp
+++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp
@@ -22,7 +22,7 @@
#define DEBUG_TYPE "pic16-codegen"
#include "PIC16.h"
#include "PIC16InstrInfo.h"
-#include "PIC16TargetAsmInfo.h"
+#include "PIC16MCAsmInfo.h"
#include "PIC16TargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.cpp b/lib/Target/PIC16/PIC16RegisterInfo.cpp
index eb758d8..47087ab 100644
--- a/lib/Target/PIC16/PIC16RegisterInfo.cpp
+++ b/lib/Target/PIC16/PIC16RegisterInfo.cpp
@@ -16,7 +16,7 @@
#include "PIC16.h"
#include "PIC16RegisterInfo.h"
#include "llvm/ADT/BitVector.h"
-
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -51,10 +51,13 @@ bool PIC16RegisterInfo::hasFP(const MachineFunction &MF) const {
return false;
}
-void PIC16RegisterInfo::
+unsigned PIC16RegisterInfo::
eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
- RegScavenger *RS) const
-{ /* NOT YET IMPLEMENTED */ }
+ int *Value, RegScavenger *RS) const
+{
+ /* NOT YET IMPLEMENTED */
+ return 0;
+}
void PIC16RegisterInfo::emitPrologue(MachineFunction &MF) const
{ /* NOT YET IMPLEMENTED */ }
@@ -65,17 +68,17 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
int PIC16RegisterInfo::
getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "Not keeping track of debug information yet!!");
+ llvm_unreachable("Not keeping track of debug information yet!!");
return -1;
}
unsigned PIC16RegisterInfo::getFrameRegister(MachineFunction &MF) const {
- assert(0 && "PIC16 Does not have any frame register");
+ llvm_unreachable("PIC16 Does not have any frame register");
return 0;
}
unsigned PIC16RegisterInfo::getRARegister() const {
- assert(0 && "PIC16 Does not have any return address register");
+ llvm_unreachable("PIC16 Does not have any return address register");
return 0;
}
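
eliminateFrameIndex grows a return value and an int *Value out-parameter in this API revision; as far as this patch shows, they let a target hand back a register it materialized while resolving the frame index (and the constant it holds) for possible reuse. PIC16 resolves nothing yet, so its stub returns 0. The shape of such a trivial implementation, for reference (MyRegisterInfo is a placeholder name):

    // Illustrative stub under the new signature; a real target would
    // rewrite the frame-index operand of *II here.
    unsigned MyRegisterInfo::
    eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                        int *Value, RegScavenger *RS) const {
      // Returning 0 reports no materialized register back to the
      // caller; Value is likewise left untouched.
      return 0;
    }
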
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.h b/lib/Target/PIC16/PIC16RegisterInfo.h
index 83689d0..8aa5a10 100644
--- a/lib/Target/PIC16/PIC16RegisterInfo.h
+++ b/lib/Target/PIC16/PIC16RegisterInfo.h
@@ -48,8 +48,9 @@ class PIC16RegisterInfo : public PIC16GenRegisterInfo {
virtual BitVector getReservedRegs(const MachineFunction &MF) const;
virtual bool hasFP(const MachineFunction &MF) const;
- virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj, RegScavenger *RS=NULL) const;
+ virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS=NULL) const;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/PIC16/PIC16Subtarget.cpp b/lib/Target/PIC16/PIC16Subtarget.cpp
index db8a5d8..33fc3fb 100644
--- a/lib/Target/PIC16/PIC16Subtarget.cpp
+++ b/lib/Target/PIC16/PIC16Subtarget.cpp
@@ -16,7 +16,7 @@
using namespace llvm;
-PIC16Subtarget::PIC16Subtarget(const Module &M, const std::string &FS,
+PIC16Subtarget::PIC16Subtarget(const std::string &TT, const std::string &FS,
bool Cooper)
:IsCooper(Cooper)
{
diff --git a/lib/Target/PIC16/PIC16Subtarget.h b/lib/Target/PIC16/PIC16Subtarget.h
index e5147a0..81e3783 100644
--- a/lib/Target/PIC16/PIC16Subtarget.h
+++ b/lib/Target/PIC16/PIC16Subtarget.h
@@ -19,7 +19,6 @@
#include <string>
namespace llvm {
-class Module;
class PIC16Subtarget : public TargetSubtarget {
@@ -28,9 +27,9 @@ class PIC16Subtarget : public TargetSubtarget {
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- PIC16Subtarget(const Module &M, const std::string &FS, bool Cooper);
+ PIC16Subtarget(const std::string &TT, const std::string &FS, bool Cooper);
/// isCooper - Returns true if the target ISA is Cooper.
bool isCooper() const { return IsCooper; }
diff --git a/lib/Target/PIC16/PIC16TargetMachine.cpp b/lib/Target/PIC16/PIC16TargetMachine.cpp
index 77ad188..08307e7 100644
--- a/lib/Target/PIC16/PIC16TargetMachine.cpp
+++ b/lib/Target/PIC16/PIC16TargetMachine.cpp
@@ -12,51 +12,32 @@
//===----------------------------------------------------------------------===//
#include "PIC16.h"
-#include "PIC16TargetAsmInfo.h"
+#include "PIC16MCAsmInfo.h"
#include "PIC16TargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-/// PIC16TargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int PIC16TargetMachineModule;
-int PIC16TargetMachineModule = 0;
-
-
-// Register the targets
-static RegisterTarget<PIC16TargetMachine>
-X("pic16", "PIC16 14-bit [experimental].");
-static RegisterTarget<CooperTargetMachine>
-Y("cooper", "PIC16 Cooper [experimental].");
+extern "C" void LLVMInitializePIC16Target() {
+  // Register the target. Currently the codegen works for
+ // enhanced pic16 mid-range.
+ RegisterTargetMachine<PIC16TargetMachine> X(ThePIC16Target);
+ RegisterAsmInfo<PIC16MCAsmInfo> A(ThePIC16Target);
+}
-// Force static initialization.
-extern "C" void LLVMInitializePIC16Target() { }
-// PIC16TargetMachine - Traditional PIC16 Machine.
-PIC16TargetMachine::PIC16TargetMachine(const Module &M, const std::string &FS,
- bool Cooper)
-: Subtarget(M, FS, Cooper),
+// PIC16TargetMachine - Enhanced PIC16 mid-range Machine. May also represent
+// a Traditional Machine if 'Trad' is true.
+PIC16TargetMachine::PIC16TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool Trad)
+: LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, Trad),
DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"),
InstrInfo(*this), TLInfo(*this),
FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0) { }
-// CooperTargetMachine - Uses the same PIC16TargetMachine, but makes IsCooper
-// as true.
-CooperTargetMachine::CooperTargetMachine(const Module &M, const std::string &FS)
- : PIC16TargetMachine(M, FS, true) {}
-
-
-const TargetAsmInfo *PIC16TargetMachine::createTargetAsmInfo() const {
- return new PIC16TargetAsmInfo(*this);
-}
bool PIC16TargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
@@ -65,15 +46,7 @@ bool PIC16TargetMachine::addInstSelector(PassManagerBase &PM,
return false;
}
-bool PIC16TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out) {
- // Output assembly language.
- PM.add(createPIC16CodePrinterPass(Out, *this, Verbose));
- return false;
-}
-
-bool PIC16TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+bool PIC16TargetMachine::addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
PM.add(createPIC16MemSelOptimizerPass());
return true; // -print-machineinstr should print after this.
diff --git a/lib/Target/PIC16/PIC16TargetMachine.h b/lib/Target/PIC16/PIC16TargetMachine.h
index 7f62d5c..b11fdd5 100644
--- a/lib/Target/PIC16/PIC16TargetMachine.h
+++ b/lib/Target/PIC16/PIC16TargetMachine.h
@@ -37,12 +37,9 @@ class PIC16TargetMachine : public LLVMTargetMachine {
// any PIC16 specific FrameInfo class.
TargetFrameInfo FrameInfo;
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
public:
- PIC16TargetMachine(const Module &M, const std::string &FS,
- bool Cooper = false);
+ PIC16TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool Cooper = false);
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
virtual const PIC16InstrInfo *getInstrInfo() const { return &InstrInfo; }
@@ -59,18 +56,9 @@ public:
virtual bool addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
- virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
}; // PIC16TargetMachine.
-/// CooperTargetMachine
-class CooperTargetMachine : public PIC16TargetMachine {
-public:
- CooperTargetMachine(const Module &M, const std::string &FS);
-}; // CooperTargetMachine.
-
} // end namespace llvm
#endif
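
With the static RegisterTarget<PIC16TargetMachine> objects and the PIC16TargetMachineModule hack gone, clients now locate targets through the TargetRegistry. A hedged sketch of the lookup side, assuming the contemporary lookupTarget(triple, error) and Target::createTargetMachine(TT, FS) signatures (createPIC16TM is a placeholder name):

    #include "llvm/Target/TargetRegistry.h"
    #include "llvm/Target/TargetMachine.h"
    #include <string>
    using namespace llvm;

    static TargetMachine *createPIC16TM(const std::string &Triple,
                                        const std::string &FS) {
      std::string Error;
      const Target *T = TargetRegistry::lookupTarget(Triple, Error);
      if (!T)
        return 0; // Error now holds the diagnostic.
      return T->createTargetMachine(Triple, FS);
    }
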
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.cpp b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
new file mode 100644
index 0000000..a2a4c09
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
@@ -0,0 +1,440 @@
+//===-- PIC16TargetObjectFile.cpp - PIC16 object files --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16TargetObjectFile.h"
+#include "MCSectionPIC16.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+MCSectionPIC16 *MCSectionPIC16::Create(const StringRef &Name, SectionKind K,
+ int Address, int Color, MCContext &Ctx) {
+ return new (Ctx) MCSectionPIC16(Name, K, Address, Color);
+}
+
+
+void MCSectionPIC16::PrintSwitchToSection(const MCAsmInfo &MAI,
+ raw_ostream &OS) const {
+ OS << getName() << '\n';
+}
+
+
+
+
+PIC16TargetObjectFile::PIC16TargetObjectFile()
+ : ExternalVarDecls(0), ExternalVarDefs(0) {
+}
+
+const MCSectionPIC16 *PIC16TargetObjectFile::
+getPIC16Section(const char *Name, SectionKind Kind,
+ int Address, int Color) const {
+ MCSectionPIC16 *&Entry = SectionsByName[Name];
+ if (Entry)
+ return Entry;
+
+ return Entry = MCSectionPIC16::Create(Name, Kind, Address, Color,
+ getContext());
+}
+
+
+void PIC16TargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &tm){
+ TargetLoweringObjectFile::Initialize(Ctx, tm);
+ TM = &tm;
+
+ BSSSection = getPIC16Section("udata.# UDATA", MCSectionPIC16::UDATA_Kind());
+ ReadOnlySection = getPIC16Section("romdata.# ROMDATA",
+ MCSectionPIC16::ROMDATA_Kind());
+ DataSection = getPIC16Section("idata.# IDATA", MCSectionPIC16::IDATA_Kind());
+
+  // Needed because otherwise a .text symbol is emitted by DwarfWriter
+  // in BeginModule, and gpasm cribs about that .text symbol.
+ TextSection = getPIC16Section("", SectionKind::getText());
+
+ ROSections.push_back(new PIC16Section((MCSectionPIC16*)ReadOnlySection));
+
+ // FIXME: I don't know what the classification of these sections really is.
+ // These aren't really objects belonging to any section. Just emit them
+ // in AsmPrinter and remove this code from here.
+ ExternalVarDecls = new PIC16Section(getPIC16Section("ExternalVarDecls",
+ SectionKind::getMetadata()));
+ ExternalVarDefs = new PIC16Section(getPIC16Section("ExternalVarDefs",
+ SectionKind::getMetadata()));
+}
+
+const MCSection *PIC16TargetObjectFile::
+getSectionForFunction(const std::string &FnName) const {
+ std::string T = PAN::getCodeSectionName(FnName);
+ return getPIC16Section(T.c_str(), SectionKind::getText());
+}
+
+
+const MCSection *PIC16TargetObjectFile::
+getSectionForFunctionFrame(const std::string &FnName) const {
+ std::string T = PAN::getFrameSectionName(FnName);
+ return getPIC16Section(T.c_str(), SectionKind::getDataRel());
+}
+
+const MCSection *
+PIC16TargetObjectFile::getBSSSectionForGlobal(const GlobalVariable *GV) const {
+ assert(GV->hasInitializer() && "This global doesn't need space");
+ Constant *C = GV->getInitializer();
+  assert(C->isNullValue() && "Uninitialized global has non-zero initializer");
+
+ // Find how much space this global needs.
+ const TargetData *TD = TM->getTargetData();
+ const Type *Ty = C->getType();
+ unsigned ValSize = TD->getTypeAllocSize(Ty);
+
+ // Go through all BSS Sections and assign this variable
+ // to the first available section having enough space.
+ PIC16Section *FoundBSS = NULL;
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ if (DataBankSize - BSSSections[i]->Size >= ValSize) {
+ FoundBSS = BSSSections[i];
+ break;
+ }
+ }
+
+  // No BSS section spacious enough was found. Create a new one.
+ if (!FoundBSS) {
+ std::string name = PAN::getUdataSectionName(BSSSections.size());
+ const MCSectionPIC16 *NewSection
+ = getPIC16Section(name.c_str(), MCSectionPIC16::UDATA_Kind());
+
+ FoundBSS = new PIC16Section(NewSection);
+
+ // Add this newly created BSS section to the list of BSSSections.
+ BSSSections.push_back(FoundBSS);
+ }
+
+ // Insert the GV into this BSS.
+ FoundBSS->Items.push_back(GV);
+ FoundBSS->Size += ValSize;
+ return FoundBSS->S_;
+}
+
+const MCSection *
+PIC16TargetObjectFile::getIDATASectionForGlobal(const GlobalVariable *GV) const{
+ assert(GV->hasInitializer() && "This global doesn't need space");
+ Constant *C = GV->getInitializer();
+  assert(!C->isNullValue() && "initialized global has zero initializer");
+ assert(GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
+ "can split initialized RAM data only");
+
+ // Find how much space this global needs.
+ const TargetData *TD = TM->getTargetData();
+ const Type *Ty = C->getType();
+ unsigned ValSize = TD->getTypeAllocSize(Ty);
+
+ // Go through all IDATA Sections and assign this variable
+ // to the first available section having enough space.
+ PIC16Section *FoundIDATA = NULL;
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ if (DataBankSize - IDATASections[i]->Size >= ValSize) {
+ FoundIDATA = IDATASections[i];
+ break;
+ }
+ }
+
+  // No IDATA section spacious enough was found. Create a new one.
+ if (!FoundIDATA) {
+ std::string name = PAN::getIdataSectionName(IDATASections.size());
+ const MCSectionPIC16 *NewSection =
+ getPIC16Section(name.c_str(), MCSectionPIC16::IDATA_Kind());
+
+ FoundIDATA = new PIC16Section(NewSection);
+
+ // Add this newly created IDATA section to the list of IDATASections.
+ IDATASections.push_back(FoundIDATA);
+ }
+
+ // Insert the GV into this IDATA.
+ FoundIDATA->Items.push_back(GV);
+ FoundIDATA->Size += ValSize;
+ return FoundIDATA->S_;
+}
+
+// Get the section for an automatic variable of a function.
+// On PIC16 these are just globals with mangled names.
+const MCSection *
+PIC16TargetObjectFile::getSectionForAuto(const GlobalVariable *GV) const {
+
+ const std::string name = PAN::getSectionNameForSym(GV->getName());
+
+ // Go through all Auto Sections and assign this variable
+ // to the appropriate section.
+ PIC16Section *FoundAutoSec = NULL;
+ for (unsigned i = 0; i < AutosSections.size(); i++) {
+ if (AutosSections[i]->S_->getName() == name) {
+ FoundAutoSec = AutosSections[i];
+ break;
+ }
+ }
+
+  // No Auto section was found. Create a new one.
+ if (!FoundAutoSec) {
+ const MCSectionPIC16 *NewSection =
+ getPIC16Section(name.c_str(), MCSectionPIC16::UDATA_OVR_Kind());
+
+ FoundAutoSec = new PIC16Section(NewSection);
+
+ // Add this newly created autos section to the list of AutosSections.
+ AutosSections.push_back(FoundAutoSec);
+ }
+
+ // Insert the auto into this section.
+ FoundAutoSec->Items.push_back(GV);
+
+ return FoundAutoSec->S_;
+}
+
+
+// Override default implementation to put the true globals into
+// multiple data sections if required.
+const MCSection *
+PIC16TargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV1,
+ SectionKind Kind,
+ Mangler *Mang,
+ const TargetMachine &TM) const {
+ // We select the section based on the initializer here, so it really
+ // has to be a GlobalVariable.
+ const GlobalVariable *GV = dyn_cast<GlobalVariable>(GV1);
+ if (!GV)
+ return TargetLoweringObjectFile::SelectSectionForGlobal(GV1, Kind, Mang,TM);
+
+ // Record External Var Decls.
+ if (GV->isDeclaration()) {
+ ExternalVarDecls->Items.push_back(GV);
+ return ExternalVarDecls->S_;
+ }
+
+ assert(GV->hasInitializer() && "A def without initializer?");
+
+ // First, if this is an automatic variable for a function, get the section
+ // name for it and return.
+ std::string name = GV->getName();
+ if (PAN::isLocalName(name))
+ return getSectionForAuto(GV);
+
+  // Record External Var Defs.
+ if (GV->hasExternalLinkage() || GV->hasCommonLinkage())
+ ExternalVarDefs->Items.push_back(GV);
+
+ // See if this is an uninitialized global.
+ const Constant *C = GV->getInitializer();
+ if (C->isNullValue())
+ return getBSSSectionForGlobal(GV);
+
+  // If this is initialized data in RAM, put it in the correct IDATA section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+ return getIDATASectionForGlobal(GV);
+
+  // This is initialized data in ROM; put it in the readonly section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE)
+ return getROSectionForGlobal(GV);
+
+ // Else let the default implementation take care of it.
+ return TargetLoweringObjectFile::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
+
+PIC16TargetObjectFile::~PIC16TargetObjectFile() {
+ for (unsigned i = 0; i < BSSSections.size(); i++)
+ delete BSSSections[i];
+ for (unsigned i = 0; i < IDATASections.size(); i++)
+ delete IDATASections[i];
+ for (unsigned i = 0; i < AutosSections.size(); i++)
+ delete AutosSections[i];
+ for (unsigned i = 0; i < ROSections.size(); i++)
+ delete ROSections[i];
+ delete ExternalVarDecls;
+ delete ExternalVarDefs;
+}
+
+
+/// getSpecialCasedSectionGlobals - Allow the target to completely override
+/// section assignment of a global.
+const MCSection *PIC16TargetObjectFile::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ assert(GV->hasSection());
+
+ if (const GlobalVariable *GVar = cast<GlobalVariable>(GV)) {
+ std::string SectName = GVar->getSection();
+ // If address for a variable is specified, get the address and create
+ // section.
+ std::string AddrStr = "Address=";
+ if (SectName.compare(0, AddrStr.length(), AddrStr) == 0) {
+ std::string SectAddr = SectName.substr(AddrStr.length());
+ return CreateSectionForGlobal(GVar, Mang, SectAddr);
+ }
+
+ // Create the section specified with section attribute.
+ return CreateSectionForGlobal(GVar, Mang);
+ }
+
+ return getPIC16Section(GV->getSection().c_str(), Kind);
+}
+
+// Create a new section for a global variable. If Addr is given, create the
+// section at that address; otherwise create it by name.
+const MCSection *
+PIC16TargetObjectFile::CreateSectionForGlobal(const GlobalVariable *GV,
+ Mangler *Mang,
+ const std::string &Addr) const {
+ // See if this is an uninitialized global.
+ const Constant *C = GV->getInitializer();
+ if (C->isNullValue())
+ return CreateBSSSectionForGlobal(GV, Addr);
+
+  // If this is initialized data in RAM, put it in the correct IDATA section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+ return CreateIDATASectionForGlobal(GV, Addr);
+
+  // This is initialized data in ROM; put it in the readonly section.
+ if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE)
+ return CreateROSectionForGlobal(GV, Addr);
+
+ // Else let the default implementation take care of it.
+ return TargetLoweringObjectFile::SectionForGlobal(GV, Mang, *TM);
+}
+
+// Create uninitialized section for a variable.
+const MCSection *
+PIC16TargetObjectFile::CreateBSSSectionForGlobal(const GlobalVariable *GV,
+ std::string Addr) const {
+ assert(GV->hasInitializer() && "This global doesn't need space");
+ assert(GV->getInitializer()->isNullValue() &&
+ "Unitialized global has non-zero initializer");
+ std::string Name;
+  // If an address is given, create a section at that address; otherwise
+  // create a section with the name specified in GV.
+ PIC16Section *FoundBSS = NULL;
+ if (Addr.empty()) {
+ Name = GV->getSection() + " UDATA";
+ for (unsigned i = 0; i < BSSSections.size(); i++) {
+ if (BSSSections[i]->S_->getName() == Name) {
+ FoundBSS = BSSSections[i];
+ break;
+ }
+ }
+ } else {
+ std::string Prefix = GV->getNameStr() + "." + Addr + ".";
+ Name = PAN::getUdataSectionName(BSSSections.size(), Prefix) + " " + Addr;
+ }
+
+ PIC16Section *NewBSS = FoundBSS;
+ if (NewBSS == NULL) {
+ const MCSectionPIC16 *NewSection =
+ getPIC16Section(Name.c_str(), MCSectionPIC16::UDATA_Kind());
+ NewBSS = new PIC16Section(NewSection);
+ BSSSections.push_back(NewBSS);
+ }
+
+ // Insert the GV into this BSS.
+ NewBSS->Items.push_back(GV);
+
+  // We do not want to put any GV without an explicit section into this
+  // section, so set its size to DataBankSize.
+ NewBSS->Size = DataBankSize;
+ return NewBSS->S_;
+}
+
+// Get the ROM section for a variable. Currently there can be only one ROM
+// section unless a variable explicitly requests a section.
+const MCSection *
+PIC16TargetObjectFile::getROSectionForGlobal(const GlobalVariable *GV) const {
+ ROSections[0]->Items.push_back(GV);
+ return ROSections[0]->S_;
+}
+
+// Create initialized data section for a variable.
+const MCSection *
+PIC16TargetObjectFile::CreateIDATASectionForGlobal(const GlobalVariable *GV,
+ std::string Addr) const {
+ assert(GV->hasInitializer() && "This global doesn't need space");
+ assert(!GV->getInitializer()->isNullValue() &&
+ "initialized global has zero initializer");
+ assert(GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
+ "can be used for initialized RAM data only");
+
+ std::string Name;
+  // If an address is given, create a section at that address; otherwise
+  // create a section with the name specified in GV.
+ PIC16Section *FoundIDATASec = NULL;
+ if (Addr.empty()) {
+ Name = GV->getSection() + " IDATA";
+ for (unsigned i = 0; i < IDATASections.size(); i++) {
+ if (IDATASections[i]->S_->getName() == Name) {
+ FoundIDATASec = IDATASections[i];
+ break;
+ }
+ }
+ } else {
+ std::string Prefix = GV->getNameStr() + "." + Addr + ".";
+ Name = PAN::getIdataSectionName(IDATASections.size(), Prefix) + " " + Addr;
+ }
+
+ PIC16Section *NewIDATASec = FoundIDATASec;
+ if (NewIDATASec == NULL) {
+ const MCSectionPIC16 *NewSection =
+ getPIC16Section(Name.c_str(), MCSectionPIC16::IDATA_Kind());
+ NewIDATASec = new PIC16Section(NewSection);
+ IDATASections.push_back(NewIDATASec);
+ }
+ // Insert the GV into this IDATA Section.
+ NewIDATASec->Items.push_back(GV);
+  // We do not want to put any GV without an explicit section into this
+  // section, so set its size to DataBankSize.
+ NewIDATASec->Size = DataBankSize;
+ return NewIDATASec->S_;
+}
+
+// Create a section in rom for a variable.
+const MCSection *
+PIC16TargetObjectFile::CreateROSectionForGlobal(const GlobalVariable *GV,
+ std::string Addr) const {
+ assert(GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE &&
+ "can be used for ROM data only");
+
+ std::string Name;
+  // If an address is given, create a section at that address; otherwise
+  // create a section with the name specified in GV.
+ PIC16Section *FoundROSec = NULL;
+ if (Addr.empty()) {
+ Name = GV->getSection() + " ROMDATA";
+ for (unsigned i = 1; i < ROSections.size(); i++) {
+ if (ROSections[i]->S_->getName() == Name) {
+ FoundROSec = ROSections[i];
+ break;
+ }
+ }
+ } else {
+ std::string Prefix = GV->getNameStr() + "." + Addr + ".";
+ Name = PAN::getRomdataSectionName(ROSections.size(), Prefix) + " " + Addr;
+ }
+
+ PIC16Section *NewRomSec = FoundROSec;
+ if (NewRomSec == NULL) {
+ const MCSectionPIC16 *NewSection =
+ getPIC16Section(Name.c_str(), MCSectionPIC16::ROMDATA_Kind());
+ NewRomSec = new PIC16Section(NewSection);
+ ROSections.push_back(NewRomSec);
+ }
+
+ // Insert the GV into this ROM Section.
+ NewRomSec->Items.push_back(GV);
+ return NewRomSec->S_;
+}
+
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.h b/lib/Target/PIC16/PIC16TargetObjectFile.h
new file mode 100644
index 0000000..75f6cce
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.h
@@ -0,0 +1,120 @@
+//===-- PIC16TargetObjectFile.h - PIC16 Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
+#define LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/StringMap.h"
+#include <vector>
+#include <string>
+
+namespace llvm {
+ class GlobalVariable;
+ class Module;
+ class PIC16TargetMachine;
+ class MCSectionPIC16;
+
+ enum { DataBankSize = 80 };
+
+  /// PIC16 splits the global data into multiple udata and idata sections.
+  /// Each udata and idata section needs to contain a list of the globals it
+  /// holds, to avoid scanning over all the global values again and printing
+  /// only those that match the current section.
+  /// Keeping values inside the sections makes printing a section much easier.
+ ///
+ /// FIXME: MOVE ALL THIS STUFF TO MCSectionPIC16.
+ ///
+ struct PIC16Section {
+ const MCSectionPIC16 *S_; // Connection to actual Section.
+ unsigned Size; // Total size of the objects contained.
+ bool SectionPrinted;
+ std::vector<const GlobalVariable*> Items;
+
+ PIC16Section(const MCSectionPIC16 *s) {
+ S_ = s;
+ Size = 0;
+ SectionPrinted = false;
+ }
+ bool isPrinted() const { return SectionPrinted; }
+ void setPrintedStatus(bool status) { SectionPrinted = status; }
+ };
+
+ class PIC16TargetObjectFile : public TargetLoweringObjectFile {
+ /// SectionsByName - Bindings of names to allocated sections.
+ mutable StringMap<MCSectionPIC16*> SectionsByName;
+
+ const TargetMachine *TM;
+
+ const MCSectionPIC16 *getPIC16Section(const char *Name,
+ SectionKind K,
+ int Address = -1,
+ int Color = -1) const;
+ public:
+ mutable std::vector<PIC16Section*> BSSSections;
+ mutable std::vector<PIC16Section*> IDATASections;
+ mutable std::vector<PIC16Section*> AutosSections;
+ mutable std::vector<PIC16Section*> ROSections;
+ mutable PIC16Section *ExternalVarDecls;
+ mutable PIC16Section *ExternalVarDefs;
+
+ PIC16TargetObjectFile();
+ ~PIC16TargetObjectFile();
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+
+ virtual const MCSection *
+ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const;
+
+ virtual const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind,
+ Mangler *Mang,
+ const TargetMachine&) const;
+
+ const MCSection *getSectionForFunction(const std::string &FnName) const;
+ const MCSection *getSectionForFunctionFrame(const std::string &FnName)const;
+
+
+ private:
+ std::string getSectionNameForSym(const std::string &Sym) const;
+
+ const MCSection *getBSSSectionForGlobal(const GlobalVariable *GV) const;
+ const MCSection *getIDATASectionForGlobal(const GlobalVariable *GV) const;
+ const MCSection *getSectionForAuto(const GlobalVariable *GV) const;
+ const MCSection *CreateBSSSectionForGlobal(const GlobalVariable *GV,
+ std::string Addr = "") const;
+ const MCSection *CreateIDATASectionForGlobal(const GlobalVariable *GV,
+ std::string Addr = "") const;
+ const MCSection *getROSectionForGlobal(const GlobalVariable *GV) const;
+ const MCSection *CreateROSectionForGlobal(const GlobalVariable *GV,
+ std::string Addr = "") const;
+ const MCSection *CreateSectionForGlobal(const GlobalVariable *GV,
+ Mangler *Mang,
+ const std::string &Addr = "") const;
+ public:
+ void SetSectionForGVs(Module &M);
+ const std::vector<PIC16Section*> &getBSSSections() const {
+ return BSSSections;
+ }
+ const std::vector<PIC16Section*> &getIDATASections() const {
+ return IDATASections;
+ }
+ const std::vector<PIC16Section*> &getAutosSections() const {
+ return AutosSections;
+ }
+ const std::vector<PIC16Section*> &getROSections() const {
+ return ROSections;
+ }
+
+ };
+} // end namespace llvm
+
+#endif
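
The BSS/IDATA bookkeeping declared here implements first-fit packing of globals into DataBankSize-byte (80-byte) banks, matching the loops in getBSSSectionForGlobal and getIDATASectionForGlobal above. Distilled to a standalone sketch (Bank and place are placeholder names):

    #include <vector>

    enum { DataBankSize = 80 };
    struct Bank { unsigned Size; Bank() : Size(0) {} };

    // First-fit: use the first bank with room, else open a new one.
    static Bank *place(std::vector<Bank*> &Banks, unsigned ValSize) {
      for (unsigned i = 0, e = Banks.size(); i != e; ++i) {
        if (DataBankSize - Banks[i]->Size >= ValSize) {
          Banks[i]->Size += ValSize;
          return Banks[i];
        }
      }
      Bank *B = new Bank();
      B->Size = ValSize;
      Banks.push_back(B);
      return B;
    }
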
diff --git a/lib/Target/PIC16/TargetInfo/CMakeLists.txt b/lib/Target/PIC16/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..bfc6ff4
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPIC16Info
+ PIC16TargetInfo.cpp
+ )
+
+add_dependencies(LLVMPIC16Info PIC16Table_gen)
diff --git a/lib/Target/PIC16/TargetInfo/Makefile b/lib/Target/PIC16/TargetInfo/Makefile
new file mode 100644
index 0000000..76609f6
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PIC16/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPIC16Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp b/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp
new file mode 100644
index 0000000..46cc819
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- PIC16TargetInfo.cpp - PIC16 Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePIC16Target, llvm::TheCooperTarget;
+
+extern "C" void LLVMInitializePIC16TargetInfo() {
+ RegisterTarget<> X(ThePIC16Target, "pic16", "PIC16 14-bit [experimental]");
+
+ RegisterTarget<> Y(TheCooperTarget, "cooper", "PIC16 Cooper [experimental]");
+}
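Target initialization is now split in two: LLVMInitializePIC16TargetInfo (this file) registers the names, while LLVMInitializePIC16Target (in PIC16TargetMachine.cpp above) attaches the TargetMachine and MCAsmInfo factories to the same Target objects. A sketch of the expected call order in a client (initPIC16 is a placeholder name):

    extern "C" void LLVMInitializePIC16TargetInfo();
    extern "C" void LLVMInitializePIC16Target();

    static void initPIC16() {
      LLVMInitializePIC16TargetInfo(); // names: "pic16", "cooper"
      LLVMInitializePIC16Target();     // TargetMachine + MCAsmInfo factories
    }
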
diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
index 7f1673c..a0fba86 100644
--- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
@@ -24,7 +24,6 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
@@ -32,16 +31,22 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Mangler.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
@@ -52,13 +57,40 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed");
namespace {
class VISIBILITY_HIDDEN PPCAsmPrinter : public AsmPrinter {
protected:
- StringSet<> FnStubs, GVStubs, HiddenGVStubs;
+ struct FnStubInfo {
+ std::string Stub, LazyPtr, AnonSymbol;
+
+ FnStubInfo() {}
+
+ void Init(const GlobalValue *GV, Mangler *Mang) {
+ // Already initialized.
+ if (!Stub.empty()) return;
+ Stub = Mang->getMangledName(GV, "$stub", true);
+ LazyPtr = Mang->getMangledName(GV, "$lazy_ptr", true);
+ AnonSymbol = Mang->getMangledName(GV, "$stub$tmp", true);
+ }
+
+ void Init(const std::string &GV, Mangler *Mang) {
+ // Already initialized.
+ if (!Stub.empty()) return;
+ Stub = Mang->makeNameProper(GV + "$stub",
+ Mangler::Private);
+ LazyPtr = Mang->makeNameProper(GV + "$lazy_ptr",
+ Mangler::Private);
+ AnonSymbol = Mang->makeNameProper(GV + "$stub$tmp",
+ Mangler::Private);
+ }
+ };
+
+ StringMap<FnStubInfo> FnStubs;
+ StringMap<std::string> GVStubs, HiddenGVStubs, TOC;
const PPCSubtarget &Subtarget;
+ uint64_t LabelID;
public:
- explicit PPCAsmPrinter(raw_ostream &O, TargetMachine &TM,
- const TargetAsmInfo *T, bool V)
+ explicit PPCAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
: AsmPrinter(O, TM, T, V),
- Subtarget(TM.getSubtarget<PPCSubtarget>()) {}
+ Subtarget(TM.getSubtarget<PPCSubtarget>()), LabelID(0) {}
virtual const char *getPassName() const {
return "PowerPC Assembly Printer";
@@ -70,7 +102,7 @@ namespace {
unsigned enumRegToMachineReg(unsigned enumReg) {
switch (enumReg) {
- default: assert(0 && "Unhandled register!"); break;
+ default: llvm_unreachable("Unhandled register!");
case PPC::CR0: return 0;
case PPC::CR1: return 1;
case PPC::CR2: return 2;
@@ -80,14 +112,16 @@ namespace {
case PPC::CR6: return 6;
case PPC::CR7: return 7;
}
- abort();
+ llvm_unreachable(0);
}
/// printInstruction - This method is automatically generated by tablegen
/// from the instruction set description. This method returns true if the
/// machine instruction was sufficiently described to print it, otherwise it
/// returns false.
- bool printInstruction(const MachineInstr *MI);
+ void printInstruction(const MachineInstr *MI);
+ static const char *getRegisterName(unsigned RegNo);
+
void printMachineInstruction(const MachineInstr *MI);
void printOp(const MachineOperand &MO);
@@ -117,7 +151,7 @@ namespace {
return;
}
- const char *RegName = TM.getRegisterInfo()->get(RegNo).AsmName;
+ const char *RegName = getRegisterName(RegNo);
// Linux assembler (Others?) does not take register mnemonics.
// FIXME - What about special registers used in mfspr/mtspr?
if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
@@ -190,16 +224,16 @@ namespace {
GlobalValue *GV = MO.getGlobal();
if (GV->isDeclaration() || GV->isWeakForLinker()) {
// Dynamically-resolved functions need a stub for the function.
- std::string Name = Mang->getValueName(GV);
- FnStubs.insert(Name);
- printSuffixedName(Name, "$stub");
+ FnStubInfo &FnInfo = FnStubs[Mang->getMangledName(GV)];
+ FnInfo.Init(GV, Mang);
+ O << FnInfo.Stub;
return;
}
}
if (MO.getType() == MachineOperand::MO_ExternalSymbol) {
- std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
- FnStubs.insert(Name);
- printSuffixedName(Name, "$stub");
+ FnStubInfo &FnInfo =FnStubs[Mang->makeNameProper(MO.getSymbolName())];
+ FnInfo.Init(MO.getSymbolName(), Mang);
+ O << FnInfo.Stub;
return;
}
}
@@ -281,20 +315,39 @@ namespace {
printOperand(MI, OpNo+1);
}
+ void printTOCEntryLabel(const MachineInstr *MI, unsigned OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ assert(MO.getType() == MachineOperand::MO_GlobalAddress);
+
+ GlobalValue *GV = MO.getGlobal();
+
+ std::string Name = Mang->getMangledName(GV);
+
+ // Map symbol -> label of TOC entry.
+ if (TOC.count(Name) == 0) {
+ std::string Label;
+ Label += MAI->getPrivateGlobalPrefix();
+ Label += "C";
+ Label += utostr(LabelID++);
+
+ TOC[Name] = Label;
+ }
+
+ O << TOC[Name] << "@toc";
+ }
+
void printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
const char *Modifier);
virtual bool runOnMachineFunction(MachineFunction &F) = 0;
- virtual bool doFinalization(Module &M) = 0;
-
- virtual void EmitExternalGlobal(const GlobalVariable *GV);
};
/// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
class VISIBILITY_HIDDEN PPCLinuxAsmPrinter : public PPCAsmPrinter {
public:
- explicit PPCLinuxAsmPrinter(raw_ostream &O, PPCTargetMachine &TM,
- const TargetAsmInfo *T, bool V)
+ explicit PPCLinuxAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
: PPCAsmPrinter(O, TM, T, V){}
virtual const char *getPassName() const {
@@ -311,16 +364,16 @@ namespace {
PPCAsmPrinter::getAnalysisUsage(AU);
}
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void PrintGlobalVariable(const GlobalVariable *GVar);
};
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
/// OS X
class VISIBILITY_HIDDEN PPCDarwinAsmPrinter : public PPCAsmPrinter {
- raw_ostream &OS;
+ formatted_raw_ostream &OS;
public:
- explicit PPCDarwinAsmPrinter(raw_ostream &O, PPCTargetMachine &TM,
- const TargetAsmInfo *T, bool V)
+ explicit PPCDarwinAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
: PPCAsmPrinter(O, TM, T, V), OS(O) {}
virtual const char *getPassName() const {
@@ -328,8 +381,8 @@ namespace {
}
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
bool doFinalization(Module &M);
+ void EmitStartOfAsmFile(Module &M);
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -338,7 +391,7 @@ namespace {
PPCAsmPrinter::getAnalysisUsage(AU);
}
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void PrintGlobalVariable(const GlobalVariable *GVar);
};
} // end of anonymous namespace
@@ -348,54 +401,52 @@ namespace {
void PPCAsmPrinter::printOp(const MachineOperand &MO) {
switch (MO.getType()) {
case MachineOperand::MO_Immediate:
- cerr << "printOp() does not handle immediate values\n";
- abort();
- return;
+ llvm_unreachable("printOp() does not handle immediate values");
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_JumpTableIndex:
- O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
<< '_' << MO.getIndex();
// FIXME: PIC relocation model
return;
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
return;
- case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_ExternalSymbol: {
// Computing the address of an external symbol, not calling it.
+ std::string Name(MAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+
if (TM.getRelocationModel() != Reloc::Static) {
- std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
- GVStubs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- return;
+ GVStubs[Name] = Name+"$non_lazy_ptr";
+ Name += "$non_lazy_ptr";
}
- O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ O << Name;
return;
+ }
case MachineOperand::MO_GlobalAddress: {
// Computing the address of a global symbol, not calling it.
GlobalValue *GV = MO.getGlobal();
- std::string Name = Mang->getValueName(GV);
+ std::string Name;
// External or weakly linked global variables need non-lazily-resolved stubs
- if (TM.getRelocationModel() != Reloc::Static) {
- if (GV->isDeclaration() || GV->isWeakForLinker()) {
- if (GV->hasHiddenVisibility()) {
- if (GV->isDeclaration() || GV->hasCommonLinkage() ||
- GV->hasAvailableExternallyLinkage()) {
- HiddenGVStubs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- } else {
- O << Name;
- }
- } else {
- GVStubs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- }
- return;
+ if (TM.getRelocationModel() != Reloc::Static &&
+ (GV->isDeclaration() || GV->isWeakForLinker())) {
+ if (!GV->hasHiddenVisibility()) {
+ Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
+ GVStubs[Mang->getMangledName(GV)] = Name;
+ } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
+ GV->hasAvailableExternallyLinkage()) {
+ Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
+ HiddenGVStubs[Mang->getMangledName(GV)] = Name;
+ } else {
+ Name = Mang->getMangledName(GV);
}
+ } else {
+ Name = Mang->getMangledName(GV);
}
O << Name;
@@ -409,22 +460,6 @@ void PPCAsmPrinter::printOp(const MachineOperand &MO) {
}
}
-/// EmitExternalGlobal - In this case we need to use the indirect symbol.
-///
-void PPCAsmPrinter::EmitExternalGlobal(const GlobalVariable *GV) {
- std::string Name;
- getGlobalLinkName(GV, Name);
- if (TM.getRelocationModel() != Reloc::Static) {
- if (GV->hasHiddenVisibility())
- HiddenGVStubs.insert(Name);
- else
- GVStubs.insert(Name);
- printSuffixedName(Name, "$non_lazy_ptr");
- return;
- }
- O << Name;
-}
-
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -461,15 +496,19 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+
bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode) {
if (ExtraCode && ExtraCode[0])
return true; // Unknown modifier.
- if (MI->getOperand(OpNo).isReg())
- printMemRegReg(MI, OpNo);
- else
- printMemRegImm(MI, OpNo);
+ assert (MI->getOperand(OpNo).isReg());
+ O << "0(";
+ printOperand(MI, OpNo);
+ O << ")";
return false;
}
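
Given the comment and assert above, an "m"-constrained PPC inline-asm operand is now always printed as a zero-offset register indirection. An illustrative source-level view (load16 is a placeholder name; the register substituted for %1 is whichever one holds the address):

    // With PrintAsmMemoryOperand above, %1 prints as "0(rN)".
    static int load16(const short *p) {
      int r;
      __asm__("lhz %0, %1" : "=r"(r) : "m"(*p));
      return r;
    }
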
@@ -505,6 +544,8 @@ void PPCAsmPrinter::printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
///
void PPCAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
++EmittedInsts;
+
+ processDebugLoc(MI, true);
// Check for slwi/srwi mnemonics.
if (MI->getOpcode() == PPC::RLWINM) {
@@ -549,12 +590,13 @@ void PPCAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
}
}
- if (printInstruction(MI))
- return; // Printer was automatically generated
+ printInstruction(MI);
+
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
- assert(0 && "Unhandled instruction in asm writer!");
- abort();
- return;
+ processDebugLoc(MI, false);
}
/// runOnMachineFunction - This uses the printMachineInstruction()
@@ -571,10 +613,10 @@ bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Print out labels for the function.
const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::PrivateLinkage:
case Function::InternalLinkage: // Symbols default to internal.
break;
@@ -582,6 +624,7 @@ bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
O << "\t.global\t" << CurrentFnName << '\n'
<< "\t.type\t" << CurrentFnName << ", @function\n";
break;
+ case Function::LinkerPrivateLinkage:
case Function::WeakAnyLinkage:
case Function::WeakODRLinkage:
case Function::LinkOnceAnyLinkage:
@@ -594,7 +637,19 @@ bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
printVisibility(CurrentFnName, F->getVisibility());
EmitAlignment(MF.getAlignment(), F);
- O << CurrentFnName << ":\n";
+
+ if (Subtarget.isPPC64()) {
+ // Emit an official procedure descriptor.
+ // FIXME 64-bit SVR4: Use MCSection here?
+ O << "\t.section\t\".opd\",\"aw\"\n";
+ O << "\t.align 3\n";
+ O << CurrentFnName << ":\n";
+ O << "\t.quad .L." << CurrentFnName << ",.TOC.@tocbase\n";
+ O << "\t.previous\n";
+ O << ".L." << CurrentFnName << ":\n";
+ } else {
+ O << CurrentFnName << ":\n";
+ }
// Emit pre-function debug information.
DW->BeginFunction(&MF);
@@ -604,8 +659,7 @@ bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
@@ -619,27 +673,16 @@ bool PPCLinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Print out jump tables referenced by the function.
EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
// Emit post-function debug information.
DW->EndFunction(&MF);
- O.flush();
-
// We didn't modify anything.
return false;
}
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \\n or \\0.
-static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
- for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
- Name != E; ++Name)
- if (isprint(*Name))
- OS << *Name;
-}
-
-void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void PPCLinuxAsmPrinter::PrintGlobalVariable(const GlobalVariable *GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer())
@@ -649,18 +692,17 @@ void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (EmitSpecialLLVMGlobal(GVar))
return;
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
printVisibility(name, GVar->getVisibility());
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
const Type *Type = C->getType();
unsigned Size = TD->getTypeAllocSize(Type);
unsigned Align = TD->getPreferredAlignmentLog(GVar);
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
if (C->isNullValue() && /* FIXME: Verify correct */
!GVar->hasSection() &&
@@ -674,13 +716,13 @@ void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << name << ":\n";
O << "\t.zero " << Size << '\n';
} else if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << ',' << Size;
+ O << MAI->getLCOMMDirective() << name << ',' << Size;
} else {
O << ".comm " << name << ',' << Size;
}
if (VerboseAsm) {
- O << "\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'";
}
O << '\n';
@@ -693,6 +735,7 @@ void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
O << "\t.global " << name << '\n'
<< "\t.type " << name << ", @object\n"
<< "\t.weak " << name << '\n';
@@ -709,15 +752,14 @@ void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
case GlobalValue::PrivateLinkage:
break;
default:
- cerr << "Unknown linkage type!";
- abort();
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'";
}
O << '\n';
@@ -727,10 +769,20 @@ void PPCLinuxAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
}
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
- // Print out module-level global variables here.
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
+ const TargetData *TD = TM.getTargetData();
+
+ bool isPPC64 = TD->getPointerSizeInBits() == 64;
+
+ if (isPPC64 && !TOC.empty()) {
+ // FIXME 64-bit SVR4: Use MCSection here?
+ O << "\t.section\t\".toc\",\"aw\"\n";
+
+ for (StringMap<std::string>::iterator I = TOC.begin(), E = TOC.end();
+ I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.tc " << I->getKeyData() << "[TC]," << I->getKeyData() << '\n';
+ }
+ }
return AsmPrinter::doFinalization(M);
}
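
The two TOC halves now fit together: printTOCEntryLabel above lazily assigns one private label per referenced global and prints Label@toc, and this doFinalization loop emits the matching .tc entries. For a 64-bit module referencing globals foo and bar through the TOC, the emitted tail has roughly this shape (assuming the ELF ".L" private prefix; label numbers illustrative):

    	.section	".toc","aw"
    .LC0:
    	.tc foo[TC],foo
    .LC1:
    	.tc bar[TC],bar
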
@@ -749,10 +801,10 @@ bool PPCDarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Print out labels for the function.
const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
switch (F->getLinkage()) {
- default: assert(0 && "Unknown linkage type!");
+ default: llvm_unreachable("Unknown linkage type!");
case Function::PrivateLinkage:
case Function::InternalLinkage: // Symbols default to internal.
break;
@@ -763,6 +815,7 @@ bool PPCDarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
case Function::WeakODRLinkage:
case Function::LinkOnceAnyLinkage:
case Function::LinkOnceODRLinkage:
+ case Function::LinkerPrivateLinkage:
O << "\t.globl\t" << CurrentFnName << '\n';
O << "\t.weak_definition\t" << CurrentFnName << '\n';
break;
@@ -789,8 +842,7 @@ bool PPCDarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true, VerboseAsm);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
II != IE; ++II) {
@@ -810,7 +862,7 @@ bool PPCDarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
-bool PPCDarwinAsmPrinter::doInitialization(Module &M) {
+void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
static const char *const CPUDirectives[] = {
"",
"ppc",
@@ -833,26 +885,28 @@ bool PPCDarwinAsmPrinter::doInitialization(Module &M) {
assert(Directive <= PPC::DIR_64 && "Directive out of range.");
O << "\t.machine " << CPUDirectives[Directive] << '\n';
- bool Result = AsmPrinter::doInitialization(M);
- assert(MMI);
-
// Prime text sections so they are adjacent. This reduces the likelihood a
// large data or debug section causes a branch to exceed 16M limit.
- SwitchToTextSection("\t.section __TEXT,__textcoal_nt,coalesced,"
- "pure_instructions");
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
if (TM.getRelocationModel() == Reloc::PIC_) {
- SwitchToTextSection("\t.section __TEXT,__picsymbolstub1,symbol_stubs,"
- "pure_instructions,32");
+ OutStreamer.SwitchSection(
+ TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub1",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 32, SectionKind::getText()));
} else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
- SwitchToTextSection("\t.section __TEXT,__symbol_stub1,symbol_stubs,"
- "pure_instructions,16");
+ OutStreamer.SwitchSection(
+ TLOFMacho.getMachOSection("__TEXT","__symbol_stub1",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 16, SectionKind::getText()));
}
- SwitchToSection(TAI->getTextSection());
-
- return Result;
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
}
-void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void PPCDarwinAsmPrinter::PrintGlobalVariable(const GlobalVariable *GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer())
@@ -869,8 +923,7 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
return;
}
- std::string name = Mang->getValueName(GVar);
-
+ std::string name = Mang->getMangledName(GVar);
printVisibility(name, GVar->getVisibility());
Constant *C = GVar->getInitializer();
@@ -878,13 +931,17 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
unsigned Size = TD->getTypeAllocSize(Type);
unsigned Align = TD->getPreferredAlignmentLog(GVar);
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ const MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GVar, Mang, TM);
+ OutStreamer.SwitchSection(TheSection);
+  // FIXME: Drive this off the section!
if (C->isNullValue() && /* FIXME: Verify correct */
!GVar->hasSection() &&
(GVar->hasLocalLinkage() || GVar->hasExternalLinkage() ||
GVar->isWeakForLinker()) &&
- TAI->SectionKindForGlobal(GVar) != SectionKind::RODataMergeStr) {
+ // Don't put things that should go in the cstring section into "comm".
+ !TheSection->getKind().isMergeableCString()) {
if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
if (GVar->hasExternalLinkage()) {
@@ -892,15 +949,15 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << "\t.zerofill __DATA, __common, " << name << ", "
<< Size << ", " << Align;
} else if (GVar->hasLocalLinkage()) {
- O << TAI->getLCOMMDirective() << name << ',' << Size << ',' << Align;
+ O << MAI->getLCOMMDirective() << name << ',' << Size << ',' << Align;
} else if (!GVar->hasCommonLinkage()) {
O << "\t.globl " << name << '\n'
- << TAI->getWeakDefDirective() << name << '\n';
+ << MAI->getWeakDefDirective() << name << '\n';
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << " ";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << " ";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
}
O << '\n';
EmitGlobalConstant(C);
@@ -912,8 +969,8 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
O << ',' << Align;
}
if (VerboseAsm) {
- O << "\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'";
}
O << '\n';
@@ -926,6 +983,7 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
O << "\t.globl " << name << '\n'
<< "\t.weak_definition " << name << '\n';
break;
@@ -940,15 +998,14 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
case GlobalValue::PrivateLinkage:
break;
default:
- cerr << "Unknown linkage type!";
- abort();
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
O << name << ":";
if (VerboseAsm) {
- O << "\t\t\t\t" << TAI->getCommentString() << " '";
- PrintUnmangledNameSafely(GVar, O);
+ O << "\t\t\t\t" << MAI->getCommentString() << " '";
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
O << "'";
}
O << '\n';
@@ -960,141 +1017,110 @@ void PPCDarwinAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
const TargetData *TD = TM.getTargetData();
- // Print out module-level global variables here.
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
-
bool isPPC64 = TD->getPointerSizeInBits() == 64;
+ // Darwin/PPC always uses mach-o.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+
+
+ const MCSection *LSPSection = 0;
+ if (!FnStubs.empty()) // .lazy_symbol_pointer
+ LSPSection = TLOFMacho.getLazySymbolPointerSection();
+
+
// Output stubs for dynamically-linked functions
- if (TM.getRelocationModel() == Reloc::PIC_) {
- for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
- i != e; ++i) {
- SwitchToTextSection("\t.section __TEXT,__picsymbolstub1,symbol_stubs,"
- "pure_instructions,32");
+ if (TM.getRelocationModel() == Reloc::PIC_ && !FnStubs.empty()) {
+ const MCSection *StubSection =
+ TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub1",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 32, SectionKind::getText());
+ for (StringMap<FnStubInfo>::iterator I = FnStubs.begin(), E = FnStubs.end();
+ I != E; ++I) {
+ OutStreamer.SwitchSection(StubSection);
EmitAlignment(4);
- const char *p = i->getKeyData();
- bool hasQuote = p[0]=='\"';
- printSuffixedName(p, "$stub");
- O << ":\n";
- O << "\t.indirect_symbol " << p << '\n';
+ const FnStubInfo &Info = I->second;
+ O << Info.Stub << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << '\n';
O << "\tmflr r0\n";
- O << "\tbcl 20,31,";
- if (hasQuote)
- O << "\"L0$" << &p[1];
- else
- O << "L0$" << p;
- O << '\n';
- if (hasQuote)
- O << "\"L0$" << &p[1];
- else
- O << "L0$" << p;
- O << ":\n";
+ O << "\tbcl 20,31," << Info.AnonSymbol << '\n';
+ O << Info.AnonSymbol << ":\n";
O << "\tmflr r11\n";
- O << "\taddis r11,r11,ha16(";
- printSuffixedName(p, "$lazy_ptr");
- O << "-";
- if (hasQuote)
- O << "\"L0$" << &p[1];
- else
- O << "L0$" << p;
+ O << "\taddis r11,r11,ha16(" << Info.LazyPtr << "-" << Info.AnonSymbol;
O << ")\n";
O << "\tmtlr r0\n";
- if (isPPC64)
- O << "\tldu r12,lo16(";
- else
- O << "\tlwzu r12,lo16(";
- printSuffixedName(p, "$lazy_ptr");
- O << "-";
- if (hasQuote)
- O << "\"L0$" << &p[1];
- else
- O << "L0$" << p;
- O << ")(r11)\n";
+ O << (isPPC64 ? "\tldu" : "\tlwzu") << " r12,lo16(";
+ O << Info.LazyPtr << "-" << Info.AnonSymbol << ")(r11)\n";
O << "\tmtctr r12\n";
O << "\tbctr\n";
- SwitchToDataSection(".lazy_symbol_pointer");
- printSuffixedName(p, "$lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << '\n';
- if (isPPC64)
- O << "\t.quad dyld_stub_binding_helper\n";
- else
- O << "\t.long dyld_stub_binding_helper\n";
+
+ OutStreamer.SwitchSection(LSPSection);
+ O << Info.LazyPtr << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << '\n';
+ O << (isPPC64 ? "\t.quad" : "\t.long") << " dyld_stub_binding_helper\n";
}
- } else {
- for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
- i != e; ++i) {
- SwitchToTextSection("\t.section __TEXT,__symbol_stub1,symbol_stubs,"
- "pure_instructions,16");
+ } else if (!FnStubs.empty()) {
+ const MCSection *StubSection =
+ TLOFMacho.getMachOSection("__TEXT","__symbol_stub1",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 16, SectionKind::getText());
+
+ for (StringMap<FnStubInfo>::iterator I = FnStubs.begin(), E = FnStubs.end();
+ I != E; ++I) {
+ OutStreamer.SwitchSection(StubSection);
EmitAlignment(4);
- const char *p = i->getKeyData();
- printSuffixedName(p, "$stub");
- O << ":\n";
- O << "\t.indirect_symbol " << p << '\n';
- O << "\tlis r11,ha16(";
- printSuffixedName(p, "$lazy_ptr");
- O << ")\n";
- if (isPPC64)
- O << "\tldu r12,lo16(";
- else
- O << "\tlwzu r12,lo16(";
- printSuffixedName(p, "$lazy_ptr");
- O << ")(r11)\n";
+ const FnStubInfo &Info = I->second;
+ O << Info.Stub << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << '\n';
+ O << "\tlis r11,ha16(" << Info.LazyPtr << ")\n";
+ O << (isPPC64 ? "\tldu" : "\tlwzu") << " r12,lo16(";
+ O << Info.LazyPtr << ")(r11)\n";
O << "\tmtctr r12\n";
O << "\tbctr\n";
- SwitchToDataSection(".lazy_symbol_pointer");
- printSuffixedName(p, "$lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << '\n';
- if (isPPC64)
- O << "\t.quad dyld_stub_binding_helper\n";
- else
- O << "\t.long dyld_stub_binding_helper\n";
+ OutStreamer.SwitchSection(LSPSection);
+ O << Info.LazyPtr << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << '\n';
+ O << (isPPC64 ? "\t.quad" : "\t.long") << " dyld_stub_binding_helper\n";
}
}
O << '\n';
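
To make the two stub loops above concrete, here is a standalone sketch that reproduces the PIC stub and its lazy pointer for one hypothetical symbol, _bar, using the "$stub"/"$lazy_ptr"/"L0$" naming scheme visible in the removed printSuffixedName() calls (the FnStubInfo field values are an assumption based on that scheme):

    #include <cstdio>
    #include <string>

    // Prints the 32-bit PIC stub body and lazy pointer emitted above for one
    // symbol; the non-PIC variant differs only in using lis instead of the
    // mflr/bcl dance to find the lazy pointer.
    static void emitPICStub(const std::string &Sym, bool isPPC64) {
      std::string Stub = Sym + "$stub";      // assumed FnStubInfo::Stub
      std::string Lazy = Sym + "$lazy_ptr";  // assumed FnStubInfo::LazyPtr
      std::string Anon = "L0$" + Sym;        // assumed FnStubInfo::AnonSymbol
      std::printf("%s:\n\t.indirect_symbol %s\n\tmflr r0\n",
                  Stub.c_str(), Sym.c_str());
      std::printf("\tbcl 20,31,%s\n%s:\n\tmflr r11\n",
                  Anon.c_str(), Anon.c_str());
      std::printf("\taddis r11,r11,ha16(%s-%s)\n\tmtlr r0\n",
                  Lazy.c_str(), Anon.c_str());
      std::printf("\t%s r12,lo16(%s-%s)(r11)\n\tmtctr r12\n\tbctr\n",
                  isPPC64 ? "ldu" : "lwzu", Lazy.c_str(), Anon.c_str());
      // .lazy_symbol_pointer entry, bound by dyld on the first call.
      std::printf("%s:\n\t.indirect_symbol %s\n\t%s dyld_stub_binding_helper\n",
                  Lazy.c_str(), Sym.c_str(), isPPC64 ? ".quad" : ".long");
    }

    int main() { emitPICStub("_bar", /*isPPC64=*/false); }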
- if (TAI->doesSupportExceptionHandling() && MMI) {
+ if (MAI->doesSupportExceptionHandling() && MMI) {
// Add the (possibly multiple) personalities to the set of global values.
// Only referenced functions get into the Personalities list.
const std::vector<Function *> &Personalities = MMI->getPersonalities();
for (std::vector<Function *>::const_iterator I = Personalities.begin(),
- E = Personalities.end(); I != E; ++I)
- if (*I) GVStubs.insert("_" + (*I)->getName());
+ E = Personalities.end(); I != E; ++I) {
+ if (*I)
+ GVStubs[Mang->getMangledName(*I)] =
+ Mang->getMangledName(*I, "$non_lazy_ptr", true);
+ }
}
- // Output stubs for external and common global variables.
+ // Output Mach-O stubs for external and common global variables.
if (!GVStubs.empty()) {
- SwitchToDataSection(".non_lazy_symbol_pointer");
- for (StringSet<>::iterator i = GVStubs.begin(), e = GVStubs.end();
- i != e; ++i) {
- std::string p = i->getKeyData();
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- O << "\t.indirect_symbol " << p << '\n';
- if (isPPC64)
- O << "\t.quad\t0\n";
- else
- O << "\t.long\t0\n";
+ // Switch to the ".non_lazy_symbol_pointer" section.
+ OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+ EmitAlignment(isPPC64 ? 3 : 2);
+
+ for (StringMap<std::string>::iterator I = GVStubs.begin(),
+ E = GVStubs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << "\t.indirect_symbol " << I->getKeyData() << '\n';
+ O << (isPPC64 ? "\t.quad\t0\n" : "\t.long\t0\n");
}
}
if (!HiddenGVStubs.empty()) {
- SwitchToSection(TAI->getDataSection());
- for (StringSet<>::iterator i = HiddenGVStubs.begin(), e = HiddenGVStubs.end();
- i != e; ++i) {
- std::string p = i->getKeyData();
- EmitAlignment(isPPC64 ? 3 : 2);
- printSuffixedName(p, "$non_lazy_ptr");
- O << ":\n";
- if (isPPC64)
- O << "\t.quad\t";
- else
- O << "\t.long\t";
- O << p << '\n';
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(isPPC64 ? 3 : 2);
+ for (StringMap<std::string>::iterator I = HiddenGVStubs.begin(),
+ E = HiddenGVStubs.end(); I != E; ++I) {
+ O << I->second << ":\n";
+ O << (isPPC64 ? "\t.quad\t" : "\t.long\t") << I->getKeyData() << '\n';
}
}
@@ -1114,28 +1140,19 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
/// for a MachineFunction to the given output stream, in a format that the
/// Darwin assembler can deal with.
///
-FunctionPass *llvm::createPPCAsmPrinterPass(raw_ostream &o,
- PPCTargetMachine &tm,
- bool verbose) {
+static AsmPrinter *createPPCAsmPrinterPass(formatted_raw_ostream &o,
+ TargetMachine &tm,
+ const MCAsmInfo *tai,
+ bool verbose) {
const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
- if (Subtarget->isDarwin()) {
- return new PPCDarwinAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
- } else {
- return new PPCLinuxAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
- }
-}
-
-namespace {
- static struct Register {
- Register() {
- PPCTargetMachine::registerAsmPrinter(createPPCAsmPrinterPass);
- }
- } Registrator;
+ if (Subtarget->isDarwin())
+ return new PPCDarwinAsmPrinter(o, tm, tai, verbose);
+ return new PPCLinuxAsmPrinter(o, tm, tai, verbose);
}
-extern "C" int PowerPCAsmPrinterForceLink;
-int PowerPCAsmPrinterForceLink = 0;
-
// Force static initialization.
-extern "C" void LLVMInitializePowerPCAsmPrinter() { }
+extern "C" void LLVMInitializePowerPCAsmPrinter() {
+ TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
+}
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index a6479d8..bdd6d36 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -20,10 +20,10 @@ add_llvm_target(PowerPCCodeGen
PPCISelLowering.cpp
PPCJITInfo.cpp
PPCMachOWriterInfo.cpp
+ PPCMCAsmInfo.cpp
PPCPredicates.cpp
PPCRegisterInfo.cpp
PPCSubtarget.cpp
- PPCTargetAsmInfo.cpp
PPCTargetMachine.cpp
)
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
index db68897..4015d4a 100644
--- a/lib/Target/PowerPC/Makefile
+++ b/lib/Target/PowerPC/Makefile
@@ -17,6 +17,6 @@ BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \
PPCGenInstrInfo.inc PPCGenDAGISel.inc \
PPCGenSubtarget.inc PPCGenCallingConv.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index f6c3469..7b98268 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -24,16 +24,21 @@ namespace llvm {
class PPCTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
- class raw_ostream;
+ class ObjectCodeEmitter;
+ class formatted_raw_ostream;
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
-FunctionPass *createPPCAsmPrinterPass(raw_ostream &OS, PPCTargetMachine &TM,
- bool Verbose);
FunctionPass *createPPCCodeEmitterPass(PPCTargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
- JITCodeEmitter &MCE);
+ JITCodeEmitter &MCE);
+FunctionPass *createPPCObjectCodeEmitterPass(PPCTargetMachine &TM,
+ ObjectCodeEmitter &OCE);
+
+extern Target ThePPC32Target;
+extern Target ThePPC64Target;
+
} // end namespace llvm;
// Defines symbolic names for PowerPC registers. This defines a mapping from
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index cd6018d..0675293 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -19,12 +19,15 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -55,8 +58,7 @@ namespace {
template <class CodeEmitter>
class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass,
- public PPCCodeEmitter
- {
+ public PPCCodeEmitter {
TargetMachine &TM;
CodeEmitter &MCE;
@@ -88,9 +90,10 @@ namespace {
template <class CodeEmitter>
char Emitter<CodeEmitter>::ID = 0;
}
-
+
/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
/// to the specified MCE object.
+
FunctionPass *llvm::createPPCCodeEmitterPass(PPCTargetMachine &TM,
MachineCodeEmitter &MCE) {
return new Emitter<MachineCodeEmitter>(TM, MCE);
@@ -101,6 +104,11 @@ FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
return new Emitter<JITCodeEmitter>(TM, JCE);
}
+FunctionPass *llvm::createPPCObjectCodeEmitterPass(PPCTargetMachine &TM,
+ ObjectCodeEmitter &OCE) {
+ return new Emitter<ObjectCodeEmitter>(TM, OCE);
+}
+
template <class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
@@ -121,9 +129,10 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
template <class CodeEmitter>
void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
MCE.StartMachineBasicBlock(&MBB);
-
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
const MachineInstr &MI = *I;
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
switch (MI.getOpcode()) {
default:
MCE.emitWordBE(getBinaryCodeForInstr(MI));
@@ -133,6 +142,7 @@ void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
MCE.emitLabel(MI.getOperand(0).getImm());
break;
case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::KILL:
break; // pseudo opcode, no side effects
case PPC::MovePCtoLR:
case PPC::MovePCtoLR8:
@@ -141,6 +151,7 @@ void Emitter<CodeEmitter>::emitBasicBlock(MachineBasicBlock &MBB) {
MCE.emitWordBE(0x48000005); // bl 1
break;
}
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
}
}
@@ -172,7 +183,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
}
switch (MI.getOpcode()) {
- default: MI.dump(); assert(0 && "Unknown instruction for relocation!");
+ default: MI.dump(); llvm_unreachable("Unknown instruction for relocation!");
case PPC::LIS:
case PPC::LIS8:
case PPC::ADDIS:
@@ -193,7 +204,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
case PPC::LWZ8:
case PPC::LFS:
case PPC::LFD:
-
+
// Stores.
case PPC::STB:
case PPC::STB8:
@@ -214,7 +225,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
break;
}
}
-
+
MachineRelocation R;
if (MO.isGlobal()) {
R = MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
@@ -231,7 +242,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
R = MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
Reloc, MO.getIndex(), 0);
}
-
+
// If in PIC mode, we need to encode the negated address of the
// 'movepctolr' into the unrelocated field. After relocation, we'll have
// &gv-&movepctolr-4 in the imm field. Once &movepctolr is added to the imm
@@ -242,7 +253,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
R.setConstantVal(-(intptr_t)MovePCtoLROffset - 4);
}
MCE.addRelocation(R);
-
+
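
A quick worked check of the fixup constant above, with made-up addresses: pre-loading the field with -&movepctolr-4 and letting the relocator add &gv leaves &gv-&movepctolr-4, and the runtime LR value produced by the "bl 1" trick (&movepctolr+4) restores &gv:

    #include <cassert>
    #include <cstdint>

    // The "bl 1" emitted for MovePCtoLR sets LR to &movepctolr+4; this checks
    // the algebra the comment above describes (addresses are hypothetical).
    int main() {
      intptr_t MovePCtoLR = 0x1000, GV = 0x2040; // hypothetical addresses
      intptr_t Field = -MovePCtoLR - 4;          // value encoded pre-relocation
      Field += GV;                               // relocator adds &gv
      intptr_t LR = MovePCtoLR + 4;              // runtime mflr result
      assert(Field + LR == GV);                  // &gv is what's left
      return 0;
    }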
} else if (MO.isMBB()) {
unsigned Reloc = 0;
unsigned Opcode = MI.getOpcode();
@@ -252,15 +263,17 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
Reloc = PPC::reloc_pcrel_bx;
else // BCC instruction
Reloc = PPC::reloc_pcrel_bcx;
+
MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
Reloc, MO.getMBB()));
} else {
- cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+#endif
+ llvm_unreachable(0);
}
return rv;
}
#include "PPCGenCodeEmitter.inc"
-
diff --git a/lib/Target/PowerPC/PPCFrameInfo.h b/lib/Target/PowerPC/PPCFrameInfo.h
index 770a560..65f113e 100644
--- a/lib/Target/PowerPC/PPCFrameInfo.h
+++ b/lib/Target/PowerPC/PPCFrameInfo.h
@@ -31,33 +31,32 @@ public:
/// getReturnSaveOffset - Return the previous frame offset to save the
/// return address.
- static unsigned getReturnSaveOffset(bool LP64, bool isDarwinABI) {
+ static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) {
if (isDarwinABI)
- return LP64 ? 16 : 8;
+ return isPPC64 ? 16 : 8;
// SVR4 ABI:
- return 4;
+ return isPPC64 ? 16 : 4;
}
/// getFramePointerSaveOffset - Return the previous frame offset to save the
/// frame pointer.
- static unsigned getFramePointerSaveOffset(bool LP64, bool isDarwinABI) {
+ static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
// For the Darwin ABI:
// Use the TOC save slot in the PowerPC linkage area for saving the frame
// pointer (if needed.) LLVM does not generate code that uses the TOC (R2
// is treated as a caller saved register.)
if (isDarwinABI)
- return LP64 ? 40 : 20;
+ return isPPC64 ? 40 : 20;
- // SVR4 ABI:
- // Save it right before the link register
+ // SVR4 ABI: First slot in the general register save area.
return -4U;
}
/// getLinkageSize - Return the size of the PowerPC ABI linkage area.
///
- static unsigned getLinkageSize(bool LP64, bool isDarwinABI) {
- if (isDarwinABI)
- return 6 * (LP64 ? 8 : 4);
+ static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
+ if (isDarwinABI || isPPC64)
+ return 6 * (isPPC64 ? 8 : 4);
// SVR4 ABI:
return 8;
@@ -65,118 +64,222 @@ public:
  /// getMinCallArgumentsSize - Return the size of the minimum PowerPC ABI
/// argument area.
- static unsigned getMinCallArgumentsSize(bool LP64, bool isDarwinABI) {
- // For the Darwin ABI:
+ static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) {
+ // For the Darwin ABI / 64-bit SVR4 ABI:
// The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if it is varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
- if (isDarwinABI)
- return 8 * (LP64 ? 8 : 4);
+ if (isDarwinABI || isPPC64)
+ return 8 * (isPPC64 ? 8 : 4);
- // SVR4 ABI:
+ // 32-bit SVR4 ABI:
  // There is no default stack allocated for the first 8 GPR arguments.
return 0;
}
/// getMinCallFrameSize - Return the minimum size a call frame can be using
/// the PowerPC ABI.
- static unsigned getMinCallFrameSize(bool LP64, bool isDarwinABI) {
+ static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) {
// The call frame needs to be at least big enough for linkage and 8 args.
- return getLinkageSize(LP64, isDarwinABI) +
- getMinCallArgumentsSize(LP64, isDarwinABI);
+ return getLinkageSize(isPPC64, isDarwinABI) +
+ getMinCallArgumentsSize(isPPC64, isDarwinABI);
}
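
As a sanity check on the new isPPC64 handling, here is a standalone copy of the two helpers above together with the minimum call-frame sizes they imply (a sketch; the values follow directly from the code):

    #include <cstdio>

    // Standalone copies of the patched linkage/argument-area helpers.
    static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
      if (isDarwinABI || isPPC64) return 6 * (isPPC64 ? 8 : 4);
      return 8; // 32-bit SVR4
    }
    static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) {
      if (isDarwinABI || isPPC64) return 8 * (isPPC64 ? 8 : 4);
      return 0; // 32-bit SVR4
    }
    int main() {
      std::printf("Darwin32: %u\n", getLinkageSize(false, true) +
                                    getMinCallArgumentsSize(false, true)); // 56
      std::printf("Darwin64: %u\n", getLinkageSize(true, true) +
                                    getMinCallArgumentsSize(true, true)); // 112
      std::printf("SVR4-32:  %u\n", getLinkageSize(false, false) +
                                    getMinCallArgumentsSize(false, false)); // 8
      std::printf("SVR4-64:  %u\n", getLinkageSize(true, false) +
                                    getMinCallArgumentsSize(true, false)); // 112
      return 0;
    }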
// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
- const std::pair<unsigned, int> *
+ const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const {
// Early exit if not using the SVR4 ABI.
if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) {
NumEntries = 0;
return 0;
}
-
- static const std::pair<unsigned, int> Offsets[] = {
+
+ static const SpillSlot Offsets[] = {
// Floating-point register save area offsets.
- std::pair<unsigned, int>(PPC::F31, -8),
- std::pair<unsigned, int>(PPC::F30, -16),
- std::pair<unsigned, int>(PPC::F29, -24),
- std::pair<unsigned, int>(PPC::F28, -32),
- std::pair<unsigned, int>(PPC::F27, -40),
- std::pair<unsigned, int>(PPC::F26, -48),
- std::pair<unsigned, int>(PPC::F25, -56),
- std::pair<unsigned, int>(PPC::F24, -64),
- std::pair<unsigned, int>(PPC::F23, -72),
- std::pair<unsigned, int>(PPC::F22, -80),
- std::pair<unsigned, int>(PPC::F21, -88),
- std::pair<unsigned, int>(PPC::F20, -96),
- std::pair<unsigned, int>(PPC::F19, -104),
- std::pair<unsigned, int>(PPC::F18, -112),
- std::pair<unsigned, int>(PPC::F17, -120),
- std::pair<unsigned, int>(PPC::F16, -128),
- std::pair<unsigned, int>(PPC::F15, -136),
- std::pair<unsigned, int>(PPC::F14, -144),
-
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
// General register save area offsets.
- std::pair<unsigned, int>(PPC::R31, -4),
- std::pair<unsigned, int>(PPC::R30, -8),
- std::pair<unsigned, int>(PPC::R29, -12),
- std::pair<unsigned, int>(PPC::R28, -16),
- std::pair<unsigned, int>(PPC::R27, -20),
- std::pair<unsigned, int>(PPC::R26, -24),
- std::pair<unsigned, int>(PPC::R25, -28),
- std::pair<unsigned, int>(PPC::R24, -32),
- std::pair<unsigned, int>(PPC::R23, -36),
- std::pair<unsigned, int>(PPC::R22, -40),
- std::pair<unsigned, int>(PPC::R21, -44),
- std::pair<unsigned, int>(PPC::R20, -48),
- std::pair<unsigned, int>(PPC::R19, -52),
- std::pair<unsigned, int>(PPC::R18, -56),
- std::pair<unsigned, int>(PPC::R17, -60),
- std::pair<unsigned, int>(PPC::R16, -64),
- std::pair<unsigned, int>(PPC::R15, -68),
- std::pair<unsigned, int>(PPC::R14, -72),
+ {PPC::R31, -4},
+ {PPC::R30, -8},
+ {PPC::R29, -12},
+ {PPC::R28, -16},
+ {PPC::R27, -20},
+ {PPC::R26, -24},
+ {PPC::R25, -28},
+ {PPC::R24, -32},
+ {PPC::R23, -36},
+ {PPC::R22, -40},
+ {PPC::R21, -44},
+ {PPC::R20, -48},
+ {PPC::R19, -52},
+ {PPC::R18, -56},
+ {PPC::R17, -60},
+ {PPC::R16, -64},
+ {PPC::R15, -68},
+ {PPC::R14, -72},
// CR save area offset.
// FIXME SVR4: Disable CR save area for now.
-// std::pair<unsigned, int>(PPC::CR2, -4),
-// std::pair<unsigned, int>(PPC::CR3, -4),
-// std::pair<unsigned, int>(PPC::CR4, -4),
-// std::pair<unsigned, int>(PPC::CR2LT, -4),
-// std::pair<unsigned, int>(PPC::CR2GT, -4),
-// std::pair<unsigned, int>(PPC::CR2EQ, -4),
-// std::pair<unsigned, int>(PPC::CR2UN, -4),
-// std::pair<unsigned, int>(PPC::CR3LT, -4),
-// std::pair<unsigned, int>(PPC::CR3GT, -4),
-// std::pair<unsigned, int>(PPC::CR3EQ, -4),
-// std::pair<unsigned, int>(PPC::CR3UN, -4),
-// std::pair<unsigned, int>(PPC::CR4LT, -4),
-// std::pair<unsigned, int>(PPC::CR4GT, -4),
-// std::pair<unsigned, int>(PPC::CR4EQ, -4),
-// std::pair<unsigned, int>(PPC::CR4UN, -4),
+// {PPC::CR2, -4},
+// {PPC::CR3, -4},
+// {PPC::CR4, -4},
+// {PPC::CR2LT, -4},
+// {PPC::CR2GT, -4},
+// {PPC::CR2EQ, -4},
+// {PPC::CR2UN, -4},
+// {PPC::CR3LT, -4},
+// {PPC::CR3GT, -4},
+// {PPC::CR3EQ, -4},
+// {PPC::CR3UN, -4},
+// {PPC::CR4LT, -4},
+// {PPC::CR4GT, -4},
+// {PPC::CR4EQ, -4},
+// {PPC::CR4UN, -4},
// VRSAVE save area offset.
- std::pair<unsigned, int>(PPC::VRSAVE, -4),
-
+ {PPC::VRSAVE, -4},
+
// Vector register save area
- std::pair<unsigned, int>(PPC::V31, -16),
- std::pair<unsigned, int>(PPC::V30, -32),
- std::pair<unsigned, int>(PPC::V29, -48),
- std::pair<unsigned, int>(PPC::V28, -64),
- std::pair<unsigned, int>(PPC::V27, -80),
- std::pair<unsigned, int>(PPC::V26, -96),
- std::pair<unsigned, int>(PPC::V25, -112),
- std::pair<unsigned, int>(PPC::V24, -128),
- std::pair<unsigned, int>(PPC::V23, -144),
- std::pair<unsigned, int>(PPC::V22, -160),
- std::pair<unsigned, int>(PPC::V21, -176),
- std::pair<unsigned, int>(PPC::V20, -192)
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}
};
-
- NumEntries = array_lengthof(Offsets);
-
- return Offsets;
+
+ static const SpillSlot Offsets64[] = {
+ // Floating-point register save area offsets.
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
+ // General register save area offsets.
+ // FIXME 64-bit SVR4: Are 32-bit registers actually allocated in 64-bit
+ // mode?
+ {PPC::R31, -4},
+ {PPC::R30, -12},
+ {PPC::R29, -20},
+ {PPC::R28, -28},
+ {PPC::R27, -36},
+ {PPC::R26, -44},
+ {PPC::R25, -52},
+ {PPC::R24, -60},
+ {PPC::R23, -68},
+ {PPC::R22, -76},
+ {PPC::R21, -84},
+ {PPC::R20, -92},
+ {PPC::R19, -100},
+ {PPC::R18, -108},
+ {PPC::R17, -116},
+ {PPC::R16, -124},
+ {PPC::R15, -132},
+ {PPC::R14, -140},
+
+ {PPC::X31, -8},
+ {PPC::X30, -16},
+ {PPC::X29, -24},
+ {PPC::X28, -32},
+ {PPC::X27, -40},
+ {PPC::X26, -48},
+ {PPC::X25, -56},
+ {PPC::X24, -64},
+ {PPC::X23, -72},
+ {PPC::X22, -80},
+ {PPC::X21, -88},
+ {PPC::X20, -96},
+ {PPC::X19, -104},
+ {PPC::X18, -112},
+ {PPC::X17, -120},
+ {PPC::X16, -128},
+ {PPC::X15, -136},
+ {PPC::X14, -144},
+
+ // CR save area offset.
+ // FIXME SVR4: Disable CR save area for now.
+// {PPC::CR2, -4},
+// {PPC::CR3, -4},
+// {PPC::CR4, -4},
+// {PPC::CR2LT, -4},
+// {PPC::CR2GT, -4},
+// {PPC::CR2EQ, -4},
+// {PPC::CR2UN, -4},
+// {PPC::CR3LT, -4},
+// {PPC::CR3GT, -4},
+// {PPC::CR3EQ, -4},
+// {PPC::CR3UN, -4},
+// {PPC::CR4LT, -4},
+// {PPC::CR4GT, -4},
+// {PPC::CR4EQ, -4},
+// {PPC::CR4UN, -4},
+
+ // VRSAVE save area offset.
+ {PPC::VRSAVE, -4},
+
+ // Vector register save area
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}
+ };
+
+ if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+ NumEntries = array_lengthof(Offsets64);
+
+ return Offsets64;
+ } else {
+ NumEntries = array_lengthof(Offsets);
+
+ return Offsets;
+ }
}
};
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index ec3e757..6af7e0f 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -17,6 +17,8 @@
#include "PPCInstrInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -51,7 +53,7 @@ PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
}
void PPCHazardRecognizer970::EndDispatchGroup() {
- DOUT << "=== Start of dispatch group\n";
+ DEBUG(errs() << "=== Start of dispatch group\n");
NumIssued = 0;
// Structural hazard info.
@@ -141,7 +143,7 @@ getHazardType(SUnit *SU) {
return Hazard;
switch (InstrType) {
- default: assert(0 && "Unknown instruction type!");
+ default: llvm_unreachable("Unknown instruction type!");
case PPCII::PPC970_FXU:
case PPCII::PPC970_LSU:
case PPCII::PPC970_FPU:
@@ -167,7 +169,7 @@ getHazardType(SUnit *SU) {
if (isLoad && NumStores) {
unsigned LoadSize;
switch (Opcode) {
- default: assert(0 && "Unknown load!");
+ default: llvm_unreachable("Unknown load!");
case PPC::LBZ: case PPC::LBZU:
case PPC::LBZX:
case PPC::LBZ8: case PPC::LBZU8:
@@ -235,7 +237,7 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
if (isStore) {
unsigned ThisStoreSize;
switch (Opcode) {
- default: assert(0 && "Unknown store instruction!");
+ default: llvm_unreachable("Unknown store instruction!");
case PPC::STB: case PPC::STB8:
case PPC::STBU: case PPC::STBU8:
case PPC::STBX: case PPC::STBX8:
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 823e316..8fa6a66 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -20,6 +20,7 @@
#include "PPCHazardRecognizers.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
@@ -31,6 +32,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
@@ -49,17 +52,12 @@ namespace {
PPCLowering(*TM.getTargetLowering()),
PPCSubTarget(*TM.getSubtargetImpl()) {}
- virtual bool runOnFunction(Function &Fn) {
- // Do not codegen any 'available_externally' functions at all, they have
- // definitions outside the translation unit.
- if (Fn.hasAvailableExternallyLinkage())
- return false;
-
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
// Make sure we re-emit a set of the global base reg if necessary
GlobalBaseReg = 0;
- SelectionDAGISel::runOnFunction(Fn);
+ SelectionDAGISel::runOnMachineFunction(MF);
- InsertVRSaveCode(Fn);
+ InsertVRSaveCode(MF);
return true;
}
@@ -145,30 +143,14 @@ namespace {
}
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
- /// inline asm expressions.
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ /// inline asm expressions. It is always correct to compute the value into
+ /// a register. The case of adding a (possibly relocatable) constant to a
+ /// register can be improved, but it is wrong to substitute Reg+Reg for
+ /// Reg in an asm, because the load or store opcode would have to change.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
std::vector<SDValue> &OutOps) {
- SDValue Op0, Op1;
- switch (ConstraintCode) {
- default: return true;
- case 'm': // memory
- if (!SelectAddrIdx(Op, Op, Op0, Op1))
- SelectAddrImm(Op, Op, Op0, Op1);
- break;
- case 'o': // offsetable
- if (!SelectAddrImm(Op, Op, Op0, Op1)) {
- Op0 = Op;
- Op1 = getSmallIPtrImm(0);
- }
- break;
- case 'v': // not offsetable
- SelectAddrIdxOnly(Op, Op, Op0, Op1);
- break;
- }
-
- OutOps.push_back(Op0);
- OutOps.push_back(Op1);
+ OutOps.push_back(Op);
return false;
}
@@ -179,7 +161,7 @@ namespace {
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
virtual void InstructionSelect();
- void InsertVRSaveCode(Function &Fn);
+ void InsertVRSaveCode(MachineFunction &MF);
virtual const char *getPassName() const {
return "PowerPC DAG->DAG Pattern Instruction Selection";
@@ -216,13 +198,12 @@ void PPCDAGToDAGISel::InstructionSelect() {
/// InsertVRSaveCode - Once the entire function has been instruction selected,
/// all virtual registers are created and all machine instructions are built,
/// check to see if we need to save/restore VRSAVE. If so, do it.
-void PPCDAGToDAGISel::InsertVRSaveCode(Function &F) {
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
// Check to see if this function uses vector registers, which means we have to
// save and restore the VRSAVE register and update it with the regs we use.
//
  // In this case, there will be virtual registers of vector type created
// by the scheduler. Detect them now.
- MachineFunction &Fn = MachineFunction::get(&F);
bool HasVectorVReg = false;
for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
e = RegInfo->getLastVirtReg()+1; i != e; ++i)
@@ -285,7 +266,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
if (!GlobalBaseReg) {
const TargetInstrInfo &TII = *TM.getInstrInfo();
// Insert the set of GlobalBaseReg into the first MBB of the function
- MachineBasicBlock &FirstMBB = BB->getParent()->front();
+ MachineBasicBlock &FirstMBB = MF->front();
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
DebugLoc dl = DebugLoc::getUnknownLoc();
@@ -488,7 +469,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
SH &= 31;
SDValue Ops[] = { Tmp3, Op1, getI32Imm(SH), getI32Imm(MB),
getI32Imm(ME) };
- return CurDAG->getTargetNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
}
}
return 0;
@@ -507,12 +488,12 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
if (isInt32Immediate(RHS, Imm)) {
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
if (isUInt16(Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, LHS,
- getI32Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
// If this is a 16-bit signed immediate, fold it.
if (isInt16((int)Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPWI, dl, MVT::i32, LHS,
- getI32Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
// For non-equality comparisons, the default code would materialize the
// constant, then compare against it, like this:
@@ -523,22 +504,22 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
// xoris r0,r3,0x1234
// cmplwi cr0,r0,0x5678
// beq cr0,L6
- SDValue Xor(CurDAG->getTargetNode(PPC::XORIS, dl, MVT::i32, LHS,
- getI32Imm(Imm >> 16)), 0);
- return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, Xor,
- getI32Imm(Imm & 0xFFFF)), 0);
+ SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+ getI32Imm(Imm >> 16)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+ getI32Imm(Imm & 0xFFFF)), 0);
}
Opc = PPC::CMPLW;
} else if (ISD::isUnsignedIntSetCC(CC)) {
if (isInt32Immediate(RHS, Imm) && isUInt16(Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPLWI, dl, MVT::i32, LHS,
- getI32Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
Opc = PPC::CMPLW;
} else {
short SImm;
if (isIntS16Immediate(RHS, SImm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPWI, dl, MVT::i32, LHS,
- getI32Imm((int)SImm & 0xFFFF)),
+ return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+ getI32Imm((int)SImm & 0xFFFF)),
0);
Opc = PPC::CMPW;
}
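
The xoris/cmplwi split in the equality path above works because xoris flips exactly the high halfword; a tiny standalone check with the constant from the comment:

    #include <cassert>
    #include <cstdint>

    // xoris r0,r3,0x1234 zeroes the high halfword iff it matches, so cmplwi
    // against the low halfword (0x5678) decides full 32-bit equality.
    int main() {
      uint32_t Imm = 0x12345678;
      uint32_t Equal = 0x12345678, Differs = 0x12355678;
      assert((Equal   ^ ((Imm >> 16) << 16)) == (Imm & 0xFFFF));
      assert((Differs ^ ((Imm >> 16) << 16)) != (Imm & 0xFFFF));
      return 0;
    }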
@@ -548,12 +529,12 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
if (isInt64Immediate(RHS.getNode(), Imm)) {
// SETEQ/SETNE comparison with 16-bit immediate, fold it.
if (isUInt16(Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, LHS,
- getI32Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
// If this is a 16-bit signed immediate, fold it.
if (isInt16(Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPDI, dl, MVT::i64, LHS,
- getI32Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI32Imm(Imm & 0xFFFF)), 0);
// For non-equality comparisons, the default code would materialize the
// constant, then compare against it, like this:
@@ -565,23 +546,23 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
// cmpldi cr0,r0,0x5678
// beq cr0,L6
if (isUInt32(Imm)) {
- SDValue Xor(CurDAG->getTargetNode(PPC::XORIS8, dl, MVT::i64, LHS,
- getI64Imm(Imm >> 16)), 0);
- return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, Xor,
- getI64Imm(Imm & 0xFFFF)), 0);
+ SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+ getI64Imm(Imm >> 16)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+ getI64Imm(Imm & 0xFFFF)), 0);
}
}
Opc = PPC::CMPLD;
} else if (ISD::isUnsignedIntSetCC(CC)) {
if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPLDI, dl, MVT::i64, LHS,
- getI64Imm(Imm & 0xFFFF)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+ getI64Imm(Imm & 0xFFFF)), 0);
Opc = PPC::CMPLD;
} else {
short SImm;
if (isIntS16Immediate(RHS, SImm))
- return SDValue(CurDAG->getTargetNode(PPC::CMPDI, dl, MVT::i64, LHS,
- getI64Imm(SImm & 0xFFFF)),
+ return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+ getI64Imm(SImm & 0xFFFF)),
0);
Opc = PPC::CMPD;
}
@@ -591,7 +572,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
Opc = PPC::FCMPUD;
}
- return SDValue(CurDAG->getTargetNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+ return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
}
static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
@@ -600,8 +581,8 @@ static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
case ISD::SETONE:
case ISD::SETOLE:
case ISD::SETOGE:
- assert(0 && "Should be lowered by legalize!");
- default: assert(0 && "Unknown condition!"); abort();
+ llvm_unreachable("Should be lowered by legalize!");
+ default: llvm_unreachable("Unknown condition!");
case ISD::SETOEQ:
case ISD::SETEQ: return PPC::PRED_EQ;
case ISD::SETUNE:
@@ -632,7 +613,7 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
Invert = false;
Other = -1;
switch (CC) {
- default: assert(0 && "Unknown condition!"); abort();
+ default: llvm_unreachable("Unknown condition!");
case ISD::SETOLT:
case ISD::SETLT: return 0; // Bit #0 = SETOLT
case ISD::SETOGT:
@@ -651,7 +632,7 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
case ISD::SETOGE:
case ISD::SETOLE:
case ISD::SETONE:
- assert(0 && "Invalid branch code: should be expanded by legalize");
+ llvm_unreachable("Invalid branch code: should be expanded by legalize");
// These are invalid for floating point. Assume integer.
case ISD::SETULT: return 0;
case ISD::SETUGT: return 1;
@@ -673,14 +654,14 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
switch (CC) {
default: break;
case ISD::SETEQ: {
- Op = SDValue(CurDAG->getTargetNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+ Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
}
case ISD::SETNE: {
SDValue AD =
- SDValue(CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
- Op, getI32Imm(~0U)), 0);
+ SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(~0U)), 0);
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
AD.getValue(1));
}
@@ -690,8 +671,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
}
case ISD::SETGT: {
SDValue T =
- SDValue(CurDAG->getTargetNode(PPC::NEG, dl, MVT::i32, Op), 0);
- T = SDValue(CurDAG->getTargetNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+ SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+ T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
}
@@ -701,31 +682,31 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
switch (CC) {
default: break;
case ISD::SETEQ:
- Op = SDValue(CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
- Op, getI32Imm(1)), 0);
+ Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(1)), 0);
return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
- SDValue(CurDAG->getTargetNode(PPC::LI, dl,
- MVT::i32,
- getI32Imm(0)), 0),
+ SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+ MVT::i32,
+ getI32Imm(0)), 0),
Op.getValue(1));
case ISD::SETNE: {
- Op = SDValue(CurDAG->getTargetNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
- SDNode *AD = CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
- Op, getI32Imm(~0U));
+ Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+ SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ Op, getI32Imm(~0U));
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
Op, SDValue(AD, 1));
}
case ISD::SETLT: {
- SDValue AD = SDValue(CurDAG->getTargetNode(PPC::ADDI, dl, MVT::i32, Op,
- getI32Imm(1)), 0);
- SDValue AN = SDValue(CurDAG->getTargetNode(PPC::AND, dl, MVT::i32, AD,
- Op), 0);
+ SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+ getI32Imm(1)), 0);
+ SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+ Op), 0);
SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
}
case ISD::SETGT: {
SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
- Op = SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4),
+ Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4),
0);
return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
getI32Imm(1));
@@ -748,10 +729,10 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
InFlag).getValue(1);
if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1)
- IntCR = SDValue(CurDAG->getTargetNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
- CCReg), 0);
+ IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+ CCReg), 0);
else
- IntCR = SDValue(CurDAG->getTargetNode(PPC::MFCR, dl, MVT::i32, CCReg), 0);
+ IntCR = SDValue(CurDAG->getMachineNode(PPC::MFCR, dl, MVT::i32, CCReg), 0);
SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
getI32Imm(31), getI32Imm(31) };
@@ -760,7 +741,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
// Get the specified bit.
SDValue Tmp =
- SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
if (Inv) {
assert(OtherCondIdx == -1 && "Can't have split plus negation");
return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
@@ -772,7 +753,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDValue Op) {
// Get the other bit of the comparison.
Ops[1] = getI32Imm((32-(3-OtherCondIdx)) & 31);
SDValue OtherCond =
- SDValue(CurDAG->getTargetNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
return CurDAG->SelectNodeTo(N, PPC::OR, MVT::i32, Tmp, OtherCond);
}
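
The rotate amount (32-(3-Idx)) & 31 fed to RLWINM above can be checked in isolation. Assuming the standard mfcr layout, cr7 occupies the low nibble of the result (LT, GT, EQ, UN at bit positions 3 down to 0), so the rotate brings condition bit Idx to bit 0, where the 31..31 mask isolates it:

    #include <cassert>
    #include <cstdint>

    // Rotate-left helper; N == 0 is special-cased to avoid the undefined
    // 32-bit shift in V >> 32.
    static uint32_t rotl32(uint32_t V, unsigned N) {
      N &= 31;
      return N == 0 ? V : (V << N) | (V >> (32 - N));
    }

    int main() {
      for (unsigned Idx = 0; Idx != 4; ++Idx) {
        uint32_t IntCR = 1u << (3 - Idx); // only cr7's bit Idx set after mfcr
        assert((rotl32(IntCR, (32 - (3 - Idx)) & 31) & 1) == 1);
      }
      return 0;
    }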
@@ -825,17 +806,17 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
// Simple value.
if (isInt16(Imm)) {
// Just the Lo bits.
- Result = CurDAG->getTargetNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
} else if (Lo) {
// Handle the Hi bits.
unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
- Result = CurDAG->getTargetNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+ Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
// And Lo bits.
- Result = CurDAG->getTargetNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
} else {
// Just the Hi bits.
- Result = CurDAG->getTargetNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
}
// If no shift, we're done.
@@ -843,19 +824,20 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
// Shift for next step if the upper 32-bits were not zero.
if (Imm) {
- Result = CurDAG->getTargetNode(PPC::RLDICR, dl, MVT::i64,
- SDValue(Result, 0),
- getI32Imm(Shift), getI32Imm(63 - Shift));
+ Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+ SDValue(Result, 0),
+ getI32Imm(Shift),
+ getI32Imm(63 - Shift));
}
// Add in the last bits as required.
if ((Hi = (Remainder >> 16) & 0xFFFF)) {
- Result = CurDAG->getTargetNode(PPC::ORIS8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Hi));
+ Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Hi));
}
if ((Lo = Remainder & 0xFFFF)) {
- Result = CurDAG->getTargetNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
}
return Result;
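
The Lo/Hi/Shift/Remainder setup sits above this hunk, but the sequence being built is the classic five-instruction PPC64 materialization; a standalone model of the full (no-zero-halfword) case, with the selected opcodes noted in comments:

    #include <cassert>
    #include <cstdint>

    // Materializes a 64-bit immediate the way the selected LIS8/ORI8/RLDICR/
    // ORIS8/ORI8 sequence does; the real selector above elides steps whose
    // halfword is zero.
    int main() {
      uint64_t Imm = 0x123456789ABCDEF0ULL;
      uint16_t TopHi = Imm >> 48, TopLo = Imm >> 32;
      uint16_t BotHi = Imm >> 16, BotLo = Imm;
      // lis sign-extends, but rldicr's clear of the low 32 bits fixes that up.
      uint64_t R = (uint64_t)(int64_t)(int32_t)((uint32_t)TopHi << 16); // lis8
      R |= TopLo;                 // ori8  (low halfword of the high word)
      R <<= 32;                   // rldicr r,r,32,31
      R |= (uint32_t)BotHi << 16; // oris8
      R |= BotLo;                 // ori8
      assert(R == Imm);
      return 0;
    }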
@@ -875,18 +857,18 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
if (N->hasOneUse())
return CurDAG->SelectNodeTo(N, Opc, Op.getValueType(), TFI,
getSmallIPtrImm(0));
- return CurDAG->getTargetNode(Opc, dl, Op.getValueType(), TFI,
- getSmallIPtrImm(0));
+ return CurDAG->getMachineNode(Opc, dl, Op.getValueType(), TFI,
+ getSmallIPtrImm(0));
}
case PPCISD::MFCR: {
SDValue InFlag = N->getOperand(1);
// Use MFOCRF if supported.
if (PPCSubTarget.isGigaProcessor())
- return CurDAG->getTargetNode(PPC::MFOCRF, dl, MVT::i32,
- N->getOperand(0), InFlag);
+ return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+ N->getOperand(0), InFlag);
else
- return CurDAG->getTargetNode(PPC::MFCR, dl, MVT::i32, InFlag);
+ return CurDAG->getMachineNode(PPC::MFCR, dl, MVT::i32, InFlag);
}
case ISD::SDIV: {
@@ -900,17 +882,17 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
SDValue N0 = N->getOperand(0);
if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
SDNode *Op =
- CurDAG->getTargetNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
- N0, getI32Imm(Log2_32(Imm)));
+ CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+ N0, getI32Imm(Log2_32(Imm)));
return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
SDValue(Op, 0), SDValue(Op, 1));
} else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
SDNode *Op =
- CurDAG->getTargetNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
- N0, getI32Imm(Log2_32(-Imm)));
+ CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+ N0, getI32Imm(Log2_32(-Imm)));
SDValue PT =
- SDValue(CurDAG->getTargetNode(PPC::ADDZE, dl, MVT::i32,
- SDValue(Op, 0), SDValue(Op, 1)),
+ SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
+ SDValue(Op, 0), SDValue(Op, 1)),
0);
return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
}
@@ -923,7 +905,7 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
case ISD::LOAD: {
// Handle preincrement loads.
LoadSDNode *LD = cast<LoadSDNode>(Op);
- MVT LoadedVT = LD->getMemoryVT();
+ EVT LoadedVT = LD->getMemoryVT();
// Normal loads are handled by code generated from the .td file.
if (LD->getAddressingMode() != ISD::PRE_INC)
@@ -938,8 +920,8 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
if (LD->getValueType(0) != MVT::i64) {
// Handle PPC32 integer and normal FP loads.
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
- switch (LoadedVT.getSimpleVT()) {
- default: assert(0 && "Invalid PPC load type!");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
case MVT::f64: Opcode = PPC::LFDU; break;
case MVT::f32: Opcode = PPC::LFSU; break;
case MVT::i32: Opcode = PPC::LWZU; break;
@@ -950,8 +932,8 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
} else {
assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
- switch (LoadedVT.getSimpleVT()) {
- default: assert(0 && "Invalid PPC load type!");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid PPC load type!");
case MVT::i64: Opcode = PPC::LDU; break;
case MVT::i32: Opcode = PPC::LWZU8; break;
case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
@@ -964,11 +946,11 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Offset, Base, Chain };
// FIXME: PPC64
- return CurDAG->getTargetNode(Opcode, dl, LD->getValueType(0),
- PPCLowering.getPointerTy(),
- MVT::Other, Ops, 3);
+ return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+ PPCLowering.getPointerTy(),
+ MVT::Other, Ops, 3);
} else {
- assert(0 && "R+R preindex loads not supported yet!");
+ llvm_unreachable("R+R preindex loads not supported yet!");
}
}
@@ -1008,7 +990,7 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
SDValue Ops[] = { N->getOperand(0).getOperand(0),
N->getOperand(0).getOperand(1),
getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
- return CurDAG->getTargetNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+ return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
}
}
@@ -1058,8 +1040,8 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
// FIXME: Implement this optzn for PPC64.
N->getValueType(0) == MVT::i32) {
SDNode *Tmp =
- CurDAG->getTargetNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
- N->getOperand(0), getI32Imm(~0U));
+ CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+ N->getOperand(0), getI32Imm(~0U));
return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
SDValue(Tmp, 0), N->getOperand(0),
SDValue(Tmp, 1));
@@ -1109,51 +1091,10 @@ SDNode *PPCDAGToDAGISel::Select(SDValue Op) {
SDValue Chain = N->getOperand(0);
SDValue Target = N->getOperand(1);
unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
- Chain = SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Other, Target,
- Chain), 0);
+ Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target,
+ Chain), 0);
return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
}
- case ISD::DECLARE: {
- SDValue Chain = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDValue N2 = N->getOperand(2);
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
-
- // FIXME: We need to handle this for VLAs.
- if (!FINode) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
- }
-
- if (N2.getOpcode() == ISD::ADD) {
- if (N2.getOperand(0).getOpcode() == ISD::ADD &&
- N2.getOperand(0).getOperand(0).getOpcode() == PPCISD::GlobalBaseReg &&
- N2.getOperand(0).getOperand(1).getOpcode() == PPCISD::Hi &&
- N2.getOperand(1).getOpcode() == PPCISD::Lo)
- N2 = N2.getOperand(0).getOperand(1).getOperand(0);
- else if (N2.getOperand(0).getOpcode() == ISD::ADD &&
- N2.getOperand(0).getOperand(0).getOpcode() == PPCISD::GlobalBaseReg &&
- N2.getOperand(0).getOperand(1).getOpcode() == PPCISD::Lo &&
- N2.getOperand(1).getOpcode() == PPCISD::Hi)
- N2 = N2.getOperand(0).getOperand(1).getOperand(0);
- else if (N2.getOperand(0).getOpcode() == PPCISD::Hi &&
- N2.getOperand(1).getOpcode() == PPCISD::Lo)
- N2 = N2.getOperand(0).getOperand(0);
- }
-
- // If we don't have a global address here, the debug info is mangled, just
- // drop it.
- if (!isa<GlobalAddressSDNode>(N2)) {
- ReplaceUses(Op.getValue(0), Chain);
- return NULL;
- }
- int FI = cast<FrameIndexSDNode>(N1)->getIndex();
- GlobalValue *GV = cast<GlobalAddressSDNode>(N2)->getGlobal();
- SDValue Tmp1 = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
- SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GV, TLI.getPointerTy());
- return CurDAG->SelectNodeTo(N, TargetInstrInfo::DECLARE,
- MVT::Other, Tmp1, Tmp2, Chain);
- }
}
return SelectCode(Op);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1c6b287..3920b38 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -31,21 +31,24 @@
#include "llvm/Intrinsics.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/DerivedTypes.h"
using namespace llvm;
-static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, EVT &ValVT,
+ EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, EVT &ValVT,
+ EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
@@ -54,8 +57,15 @@ static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
cl::desc("enable preincrement load/store generation on PPC (experimental)"),
cl::Hidden);
+static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
+ if (TM.getSubtargetImpl()->isDarwin())
+ return new TargetLoweringObjectFileMachO();
+ return new TargetLoweringObjectFileELF();
+}
+
+
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
- : TargetLowering(TM), PPCSubTarget(*TM.getSubtargetImpl()) {
+ : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
setPow2DivIsCheap();
@@ -193,9 +203,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
- // RET must be custom lowered, to meet ABI requirements.
- setOperationAction(ISD::RET , MVT::Other, Custom);
-
// TRAP is legal.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -205,8 +212,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
- // VAARG is custom lowered with the SVR4 ABI
- if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI())
+ // VAARG is custom lowered with the 32-bit SVR4 ABI.
+ if ( TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
+ && !TM.getSubtarget<PPCSubtarget>().isPPC64())
setOperationAction(ISD::VAARG, MVT::Other, Custom);
else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
@@ -276,7 +284,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// will selectively turn on ones that can be effectively codegen'd.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD , VT, Legal);
@@ -412,6 +420,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
+ case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
@@ -421,6 +430,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::STD_32: return "PPCISD::STD_32";
case PPCISD::CALL_SVR4: return "PPCISD::CALL_SVR4";
case PPCISD::CALL_Darwin: return "PPCISD::CALL_Darwin";
+ case PPCISD::NOP: return "PPCISD::NOP";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL_Darwin: return "PPCISD::BCTRL_Darwin";
case PPCISD::BCTRL_SVR4: return "PPCISD::BCTRL_SVR4";
@@ -438,12 +448,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::MTFSB1: return "PPCISD::MTFSB1";
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::MTFSF: return "PPCISD::MTFSF";
- case PPCISD::TAILCALL: return "PPCISD::TAILCALL";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
}
}
-MVT PPCTargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i32;
}
@@ -900,7 +909,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
- Base = SDValue(DAG.getTargetNode(Opc, dl, CN->getValueType(0), Base), 0);
+ Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
return true;
}
}
@@ -1012,7 +1021,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
- Base = SDValue(DAG.getTargetNode(Opc, dl, CN->getValueType(0), Base),0);
+ Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0);
return true;
}
}
@@ -1038,7 +1047,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (!EnablePPCPreinc) return false;
SDValue Ptr;
- MVT VT;
+ EVT VT;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
@@ -1086,7 +1095,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
Constant *C = CP->getConstVal();
SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
@@ -1120,7 +1129,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
}
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
SDValue Zero = DAG.getConstant(0, PtrVT);
@@ -1154,13 +1163,13 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) {
- assert(0 && "TLS not implemented for PPC.");
+ llvm_unreachable("TLS not implemented for PPC.");
return SDValue(); // Not reached
}
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) {
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
GlobalValue *GV = GSDN->getGlobal();
SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
@@ -1170,6 +1179,13 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
const TargetMachine &TM = DAG.getTarget();
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the GlobalValue is stored in the TOC.
+ if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ return DAG.getNode(PPCISD::TOC_ENTRY, dl, MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
+
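Under the 64-bit SVR4 ABI the address of a global is never rebuilt from Hi/Lo halves; it is loaded out of the TOC, anchored by X2, the dedicated TOC pointer register. A minimal sketch of the same lowering shape (the helper is illustrative; the node and registers are the in-tree ones):

    // 64-bit SVR4: reach globals through the TOC. PPCISD::TOC_ENTRY is
    // later selected to a load of the TOC entry, conceptually
    // "ld rD, sym@toc(r2)".
    static SDValue lowerViaTOC(GlobalValue *GV, int64_t Offset,
                               SelectionDAG &DAG, DebugLoc dl) {
      SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i64, Offset);
      return DAG.getNode(PPCISD::TOC_ENTRY, dl, MVT::i64, GA,
                         DAG.getRegister(PPC::X2, MVT::i64));
    }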
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, GA, Zero);
SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, GA, Zero);
@@ -1191,7 +1207,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
- if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV))
+ if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM))
return Lo;
// If the global is weak or external, we have to go through the lazy
@@ -1208,7 +1224,7 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
// fold the new nodes.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (C->isNullValue() && CC == ISD::SETEQ) {
- MVT VT = Op.getOperand(0).getValueType();
+ EVT VT = Op.getOperand(0).getValueType();
SDValue Zext = Op.getOperand(0);
if (VT.bitsLT(MVT::i32)) {
VT = MVT::i32;
@@ -1232,9 +1248,9 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
// condition register, reading it back out, and masking the correct bit. The
// normal approach here uses sub to do this instead of xor. Using xor exposes
// the result to other bit-twiddling opportunities.
- MVT LHSVT = Op.getOperand(0).getValueType();
+ EVT LHSVT = Op.getOperand(0).getValueType();
if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
Op.getOperand(1));
return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
@@ -1249,7 +1265,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
unsigned VarArgsNumFPR,
const PPCSubtarget &Subtarget) {
- assert(0 && "VAARG not yet implemented for the SVR4 ABI!");
+ llvm_unreachable("VAARG not yet implemented for the SVR4 ABI!");
return SDValue(); // Not reached
}
@@ -1260,10 +1276,11 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
const Type *IntPtrTy =
- DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType();
+ DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
+ *DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -1281,8 +1298,9 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, Op.getValueType().getTypeForMVT(), false, false,
- false, false, 0, CallingConv::C, false,
+ LowerCallTo(Chain, Op.getValueType().getTypeForEVT(*DAG.getContext()),
+ false, false, false, false, 0, CallingConv::C, false,
+ /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__trampoline_setup", PtrVT),
Args, DAG, dl);
@@ -1300,16 +1318,16 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
DebugLoc dl = Op.getDebugLoc();
- if (Subtarget.isDarwinABI()) {
+ if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
}
- // For the SVR4 ABI we follow the layout of the va_list struct.
+ // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
// We suppose the given va_list is already allocated.
//
// typedef struct {
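The struct the comment goes on to describe (elided by the hunk) is the ABI-defined 32-bit SVR4 va_list; for reference, its layout per the SysV ABI PowerPC processor supplement, shown here as a sketch rather than as part of the patch:

    typedef struct {
      unsigned char gpr;        // index of the next GPR to use (0..8)
      unsigned char fpr;        // index of the next FPR to use (0..8)
      unsigned short reserved;
      void *overflow_arg_area;  // next argument passed on the stack
      void *reg_save_area;      // where r3-r10 / f1-f8 were spilled
    } va_list[1];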
@@ -1338,7 +1356,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
SDValue ArgFPR = DAG.getConstant(VarArgsNumFPR, MVT::i32);
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue StackOffsetFI = DAG.getFrameIndex(VarArgsStackOffset, PtrVT);
SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
@@ -1380,15 +1398,15 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
#include "PPCGenCallingConv.inc"
-static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
return true;
}
-static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, EVT &ValVT,
+ EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -1414,8 +1432,8 @@ static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
return false;
}
-static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
+static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, EVT &ValVT,
+ EVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
@@ -1442,29 +1460,20 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
}
/// GetFPR - Get the set of FP registers that should be allocated for arguments,
-/// depending on which subtarget is selected.
-static const unsigned *GetFPR(const PPCSubtarget &Subtarget) {
- if (Subtarget.isDarwinABI()) {
- static const unsigned FPR[] = {
- PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
- PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
- };
- return FPR;
- }
-
-
+/// on Darwin.
+static const unsigned *GetFPR() {
static const unsigned FPR[] = {
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
- PPC::F8
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
};
+
return FPR;
}
/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
-static unsigned CalculateStackSlotSize(SDValue Arg, ISD::ArgFlagsTy Flags,
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
- MVT ArgVT = Arg.getValueType();
unsigned ArgSize = ArgVT.getSizeInBits()/8;
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
@@ -1474,14 +1483,31 @@ static unsigned CalculateStackSlotSize(SDValue Arg, ISD::ArgFlagsTy Flags,
}
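CalculateStackSlotSize now takes the EVT directly instead of extracting it from an SDValue, so the new Ins/Outs-based callers can use it before any DAG node exists. Assuming the usual round-up-to-pointer-size step in the elided part of the function, the arithmetic works out as:

    // Worked example, 32-bit case (PtrByteSize == 4):
    //   MVT::f64          -> 8 bytes -> stays 8    (two pointer-sized slots)
    //   MVT::i8           -> 1 byte  -> becomes 4  (one slot)
    //   byval of 10 bytes -> 10      -> becomes 12 (three slots)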
SDValue
-PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
- SelectionDAG &DAG,
- int &VarArgsFrameIndex,
- int &VarArgsStackOffset,
- unsigned &VarArgsNumGPR,
- unsigned &VarArgsNumFPR,
- const PPCSubtarget &Subtarget) {
- // SVR4 ABI Stack Frame Layout:
+PPCTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
+ return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
+ } else {
+ return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
+ }
+}
+
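The new entry point dispatches on the ABI; LowerCall below follows the same shape:

    //   32-bit SVR4           -> ..._SVR4 variant
    //   Darwin or 64-bit SVR4 -> ..._Darwin variant (64-bit SVR4 reuses the
    //                            Darwin argument path; its TOC and NOP
    //                            handling is layered on elsewhere in this
    //                            patch, in LowerGlobalAddress and FinishCall)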
+SDValue
+PPCTargetLowering::LowerFormalArguments_SVR4(
+ SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ // 32-bit SVR4 ABI Stack Frame Layout:
// +-----------------------------------+
// +--> | Back chain |
// | +-----------------------------------+
@@ -1512,25 +1538,21 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- SmallVector<SDValue, 8> ArgValues;
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Potential tail calls could cause overwriting of argument stack slots.
- unsigned CC = MF.getFunction()->getCallingConv();
- bool isImmutable = !(PerformTailCallOpt && (CC==CallingConv::Fast));
+ bool isImmutable = !(PerformTailCallOpt && (CallConv==CallingConv::Fast));
unsigned PtrByteSize = 4;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(PPCFrameInfo::getLinkageSize(false, false), PtrByteSize);
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_PPC_SVR4);
+ CCInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -1538,11 +1560,11 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
// Arguments stored in registers.
if (VA.isRegLoc()) {
TargetRegisterClass *RC;
- MVT ValVT = VA.getValVT();
+ EVT ValVT = VA.getValVT();
- switch (ValVT.getSimpleVT()) {
+ switch (ValVT.getSimpleVT().SimpleTy) {
default:
- assert(0 && "ValVT not supported by FORMAL_ARGUMENTS Lowering");
+ llvm_unreachable("ValVT not supported by formal arguments lowering");
case MVT::i32:
RC = PPC::GPRCRegisterClass;
break;
@@ -1562,9 +1584,9 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
// Transform the arguments stored in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, ValVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT);
- ArgValues.push_back(ArgValue);
+ InVals.push_back(ArgValue);
} else {
// Argument stored in memory.
assert(VA.isMemLoc());
@@ -1575,7 +1597,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValues.push_back(DAG.getLoad(VA.getValVT(), dl, Root, FIN, NULL, 0));
+ InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
}
}
@@ -1583,12 +1605,13 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
// Aggregates passed by value are stored in the local variable space of the
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
- CCState CCByValInfo(CC, isVarArg, getTargetMachine(), ByValArgLocs);
+ CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(),
+ ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
- CCByValInfo.AnalyzeFormalArguments(Op.getNode(), CC_PPC_SVR4_ByVal);
+ CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4_ByVal);
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
@@ -1632,7 +1655,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
// Make room for NumGPArgRegs and NumFPArgRegs.
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
- NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
+ NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
VarArgsStackOffset = MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
CCInfo.getNextStackOffset());
@@ -1645,7 +1668,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
unsigned GPRIndex = 0;
for (; GPRIndex != VarArgsNumGPR; ++GPRIndex) {
SDValue Val = DAG.getRegister(GPArgRegs[GPRIndex], PtrVT);
- SDValue Store = DAG.getStore(Root, dl, Val, FIN, NULL, 0);
+ SDValue Store = DAG.getStore(Chain, dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
@@ -1658,7 +1681,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
for (; GPRIndex != NumGPArgRegs; ++GPRIndex) {
unsigned VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
@@ -1666,18 +1689,18 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
- // FIXME SVR4: We only need to save FP argument registers if CR bit 6 is
- // set.
+ // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+ // is set.
// The double arguments are stored to the VarArgsFrameIndex
// on the stack.
unsigned FPRIndex = 0;
for (FPRIndex = 0; FPRIndex != VarArgsNumFPR; ++FPRIndex) {
SDValue Val = DAG.getRegister(FPArgRegs[FPRIndex], MVT::f64);
- SDValue Store = DAG.getStore(Root, dl, Val, FIN, NULL, 0);
+ SDValue Store = DAG.getStore(Chain, dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
- SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
+ SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
@@ -1685,47 +1708,40 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4(SDValue Op,
for (; FPRIndex != NumFPArgRegs; ++FPRIndex) {
unsigned VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::f64);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
// Increment the address by eight for the next argument to store
- SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
+ SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
}
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl,
- MVT::Other, &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl,
+ MVT::Other, &MemOps[0], MemOps.size());
-
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
SDValue
-PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
- SelectionDAG &DAG,
- int &VarArgsFrameIndex,
- const PPCSubtarget &Subtarget) {
+PPCTargetLowering::LowerFormalArguments_Darwin(
+ SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// TODO: add description of PPC stack frame format, or at least some docs.
//
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- SmallVector<SDValue, 8> ArgValues;
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
- unsigned CC = MF.getFunction()->getCallingConv();
- bool isImmutable = !(PerformTailCallOpt && (CC==CallingConv::Fast));
+ bool isImmutable = !(PerformTailCallOpt && (CallConv==CallingConv::Fast));
unsigned PtrByteSize = isPPC64 ? 8 : 4;
unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, true);
@@ -1741,7 +1757,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
- static const unsigned *FPR = GetFPR(Subtarget);
+ static const unsigned *FPR = GetFPR();
static const unsigned VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
@@ -1765,12 +1781,11 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
// entire point of the following loop.
unsigned VecArgOffset = ArgOffset;
if (!isVarArg && !isPPC64) {
- for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues()-1; ArgNo != e;
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
++ArgNo) {
- MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
- ISD::ArgFlagsTy Flags =
- cast<ARG_FLAGSSDNode>(Op.getOperand(ArgNo+3))->getArgFlags();
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Flags.isByVal()) {
// ObjSize is the true size, ArgSize rounded up to multiple of regs.
@@ -1781,8 +1796,8 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
continue;
}
- switch(ObjectVT.getSimpleVT()) {
- default: assert(0 && "Unhandled argument type!");
+ switch(ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
case MVT::i32:
case MVT::f32:
VecArgOffset += isPPC64 ? 8 : 4;
@@ -1811,15 +1826,13 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
- for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
- ArgNo != e; ++ArgNo) {
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
- MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+ EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
- ISD::ArgFlagsTy Flags =
- cast<ARG_FLAGSSDNode>(Op.getOperand(ArgNo+3))->getArgFlags();
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
unsigned CurArgOffset = ArgOffset;
@@ -1828,13 +1841,13 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
if (isVarArg || isPPC64) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += CalculateStackSlotSize(Op.getValue(ArgNo),
+ MinReservedArea += CalculateStackSlotSize(ObjectVT,
Flags,
PtrByteSize);
} else nAltivecParamsAtEnd++;
} else
// Calculate min reserved area.
- MinReservedArea += CalculateStackSlotSize(Op.getValue(ArgNo),
+ MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
Flags,
PtrByteSize);
@@ -1852,11 +1865,11 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
// The value of the object is its address.
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValues.push_back(FIN);
+ InVals.push_back(FIN);
if (ObjSize==1 || ObjSize==2) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
NULL, 0, ObjSize==1 ? MVT::i8 : MVT::i16 );
MemOps.push_back(Store);
@@ -1875,7 +1888,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -1888,13 +1901,13 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
continue;
}
- switch (ObjectVT.getSimpleVT()) {
- default: assert(0 && "Unhandled argument type!");
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
case MVT::i32:
if (!isPPC64) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
++GPR_idx;
} else {
needsLoad = true;
@@ -1908,7 +1921,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
case MVT::i64: // PPC64
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32) {
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
@@ -1949,7 +1962,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
else
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
} else {
needsLoad = true;
@@ -1966,7 +1979,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
// except in varargs functions.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
- ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
if (isVarArg) {
while ((ArgOffset % 16) != 0) {
ArgOffset += PtrByteSize;
@@ -1974,7 +1987,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
GPR_idx++;
}
ArgOffset += 16;
- GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs);
+ GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
}
++VR_idx;
} else {
@@ -2000,10 +2013,10 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
CurArgOffset + (ArgSize - ObjSize),
isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
}
- ArgValues.push_back(ArgVal);
+ InVals.push_back(ArgVal);
}
// Set the size that is at least reserved in caller of this function. Tail
@@ -2045,7 +2058,7 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
// Increment the address by four for the next argument to store
@@ -2055,14 +2068,10 @@ PPCTargetLowering::LowerFORMAL_ARGUMENTS_Darwin(SDValue Op,
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl,
- MVT::Other, &MemOps[0], MemOps.size());
-
- ArgValues.push_back(Root);
+ Chain = DAG.getNode(ISD::TokenFactor, dl,
+ MVT::Other, &MemOps[0], MemOps.size());
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size());
+ return Chain;
}
/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
@@ -2072,13 +2081,14 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
bool isPPC64,
bool isVarArg,
unsigned CC,
- CallSDNode *TheCall,
+ const SmallVectorImpl<ISD::OutputArg>
+ &Outs,
unsigned &nAltivecParamsAtEnd) {
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
unsigned NumBytes = PPCFrameInfo::getLinkageSize(isPPC64, true);
- unsigned NumOps = TheCall->getNumArgs();
+ unsigned NumOps = Outs.size();
unsigned PtrByteSize = isPPC64 ? 8 : 4;
// Add up all the space actually used.
@@ -2089,9 +2099,9 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
// 16-byte aligned.
nAltivecParamsAtEnd = 0;
for (unsigned i = 0; i != NumOps; ++i) {
- SDValue Arg = TheCall->getArg(i);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
- MVT ArgVT = Arg.getValueType();
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Arg.getValueType();
// Varargs Altivec parameters are padded to a 16 byte boundary.
if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) {
@@ -2104,7 +2114,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
// Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
NumBytes = ((NumBytes+15)/16)*16;
}
- NumBytes += CalculateStackSlotSize(Arg, Flags, PtrByteSize);
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
}
// Allow for Altivec parameters at the end, if needed.
@@ -2149,40 +2159,37 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool IsTailCall,
return SPDiff;
}
-/// IsEligibleForTailCallElimination - Check to see whether the next instruction
-/// following the call is a return. A function is eligible if caller/callee
-/// calling conventions match, currently only fastcc supports tail calls, and
-/// the function CALL is immediatly followed by a RET.
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
bool
-PPCTargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
- SDValue Ret,
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
// Variable argument functions are not supported.
- if (!PerformTailCallOpt || TheCall->isVarArg())
+ if (isVarArg)
return false;
- if (CheckTailCallReturnConstraints(TheCall, Ret)) {
- MachineFunction &MF = DAG.getMachineFunction();
- unsigned CallerCC = MF.getFunction()->getCallingConv();
- unsigned CalleeCC = TheCall->getCallingConv();
- if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
- // Functions containing by val parameters are not supported.
- for (unsigned i = 0; i != TheCall->getNumArgs(); i++) {
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
- if (Flags.isByVal()) return false;
- }
+ MachineFunction &MF = DAG.getMachineFunction();
+ CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ // Functions containing by val parameters are not supported.
+ for (unsigned i = 0; i != Ins.size(); i++) {
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ if (Flags.isByVal()) return false;
+ }
- SDValue Callee = TheCall->getCallee();
- // Non PIC/GOT tail calls are supported.
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
- return true;
+ // Non PIC/GOT tail calls are supported.
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+ return true;
- // At the moment we can only do local tail calls (in same module, hidden
- // or protected) if we are generating PIC.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- return G->getGlobal()->hasHiddenVisibility()
- || G->getGlobal()->hasProtectedVisibility();
- }
+ // At the moment we can only do local tail calls (in the same module, hidden
+ // or protected) if we are generating PIC.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return G->getGlobal()->hasHiddenVisibility()
+ || G->getGlobal()->hasProtectedVisibility();
}
return false;
@@ -2251,13 +2258,13 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
isDarwinABI);
int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
NewRetAddrLoc);
- MVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+ EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
PseudoSourceValue::getFixedStack(NewRetAddr), 0);
- // When using the SVR4 ABI there is no need to move the FP stack slot
- // as the FP is never overwritten.
+ // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+ // slot as the FP is never overwritten.
if (isDarwinABI) {
int NewFPLoc =
SPDiff + PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI);
@@ -2279,7 +2286,7 @@ CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
int Offset = ArgOffset + SPDiff;
uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
- MVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+ EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue FIN = DAG.getFrameIndex(FI, VT);
TailCallArgumentInfo Info;
Info.Arg = Arg;
@@ -2300,13 +2307,13 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
DebugLoc dl) {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
- MVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+ EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, NULL, 0);
Chain = SDValue(LROpOut.getNode(), 1);
- // When using the SVR4 ABI there is no need to load the FP stack slot
- // as the FP is never overwritten.
+ // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+ // slot as the FP is never overwritten.
if (isDarwinABI) {
FPOpOut = getFramePointerFrameIndex(DAG);
FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, NULL, 0);
@@ -2340,7 +2347,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
bool isVector, SmallVector<SDValue, 8> &MemOpChains,
SmallVector<TailCallArgumentInfo, 8>& TailCallArguments,
DebugLoc dl) {
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
if (!isTailCall) {
if (isVector) {
SDValue StackPtr;
@@ -2389,9 +2396,9 @@ static
unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
- SmallVector<SDValue, 8> &Ops, std::vector<MVT> &NodeTys,
+ SmallVector<SDValue, 8> &Ops, std::vector<EVT> &NodeTys,
bool isSVR4ABI) {
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
NodeTys.push_back(MVT::Other); // Returns a chain
NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
@@ -2444,102 +2451,145 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
return CallOpc;
}
-static SDValue LowerCallReturn(SDValue Op, SelectionDAG &DAG, TargetMachine &TM,
- CallSDNode *TheCall, SDValue Chain,
- SDValue InFlag) {
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = TheCall->getDebugLoc();
- SmallVector<SDValue, 16> ResultVals;
+SDValue
+PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CallerCC = DAG.getMachineFunction().getFunction()->getCallingConv();
- CCState CCRetInfo(CallerCC, isVarArg, TM, RVLocs);
- CCRetInfo.AnalyzeCallResult(TheCall, RetCC_PPC);
+ CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- MVT VT = VA.getValVT();
+ EVT VT = VA.getValVT();
assert(VA.isRegLoc() && "Can only return in registers!");
Chain = DAG.getCopyFromReg(Chain, dl,
VA.getLocReg(), VT, InFlag).getValue(1);
- ResultVals.push_back(Chain.getValue(0));
+ InVals.push_back(Chain.getValue(0));
InFlag = Chain.getValue(2);
}
- // If the function returns void, just return the chain.
- if (RVLocs.empty())
- return Chain;
-
- // Otherwise, merge everything together with a MERGE_VALUES node.
- ResultVals.push_back(Chain);
- SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size());
- return Res.getValue(Op.getResNo());
+ return Chain;
}
-static
-SDValue FinishCall(SelectionDAG &DAG, CallSDNode *TheCall, TargetMachine &TM,
- SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
- SDValue Op, SDValue InFlag, SDValue Chain, SDValue &Callee,
- int SPDiff, unsigned NumBytes) {
- unsigned CC = TheCall->getCallingConv();
- DebugLoc dl = TheCall->getDebugLoc();
- bool isTailCall = TheCall->isTailCall()
- && CC == CallingConv::Fast && PerformTailCallOpt;
-
- std::vector<MVT> NodeTys;
+SDValue
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
+ bool isTailCall, bool isVarArg,
+ SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8>
+ &RegsToPass,
+ SDValue InFlag, SDValue Chain,
+ SDValue &Callee,
+ int SPDiff, unsigned NumBytes,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<SDValue> &InVals) {
+ std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
isTailCall, RegsToPass, Ops, NodeTys,
- TM.getSubtarget<PPCSubtarget>().isSVR4ABI());
+ PPCSubTarget.isSVR4ABI());
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCRegisterInfo::eliminateCallFramePseudoInstr.
int BytesCalleePops =
- (CC==CallingConv::Fast && PerformTailCallOpt) ? NumBytes : 0;
+ (CallConv==CallingConv::Fast && PerformTailCallOpt) ? NumBytes : 0;
if (InFlag.getNode())
Ops.push_back(InFlag);
// Emit tail call.
if (isTailCall) {
- assert(InFlag.getNode() &&
- "Flag must be set. Depend on flag being set in LowerRET");
- Chain = DAG.getNode(PPCISD::TAILCALL, dl,
- TheCall->getVTList(), &Ops[0], Ops.size());
- return SDValue(Chain.getNode(), Op.getResNo());
+ // If this is the first return lowered for this function, add the regs
+ // to the liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ assert(((Callee.getOpcode() == ISD::Register &&
+ cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+ Callee.getOpcode() == ISD::TargetExternalSymbol ||
+ Callee.getOpcode() == ISD::TargetGlobalAddress ||
+ isa<ConstantSDNode>(Callee)) &&
+ "Expecting a global address, external symbol, absolute value or register");
+
+ return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size());
}
Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
+ // Add a NOP immediately after the branch instruction when using the 64-bit
+ // SVR4 ABI. At link time, if caller and callee are in different modules and
+ // thus have different TOCs, the call will be replaced with a call to a stub
+ // function which saves the current TOC, loads the TOC of the callee and
+ // branches to the callee. The NOP will be replaced with a load instruction
+ // which restores the TOC of the caller from the TOC save slot of the current
+ // stack frame. If caller and callee belong to the same module (and have the
+ // same TOC), the NOP will remain unchanged.
+ if (!isTailCall && PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ // Insert NOP.
+ InFlag = DAG.getNode(PPCISD::NOP, dl, MVT::Flag, InFlag);
+ }
+
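In instruction terms, the sequence the comment above describes looks roughly like this for a cross-module call (a sketch; the save-slot offset is the 64-bit SVR4 ABI's TOC save slot):

    //   bl   callee        // may be redirected to a TOC-saving glue stub
    //   nop                // rewritten by the linker to "ld r2, 40(r1)",
    //                      // restoring the caller's TOC after the call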
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(BytesCalleePops, true),
InFlag);
- if (TheCall->getValueType(0) != MVT::Other)
+ if (!Ins.empty())
InFlag = Chain.getValue(1);
- return LowerCallReturn(Op, DAG, TM, TheCall, Chain, InFlag);
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
+ return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, Ins,
+ dl, DAG, InVals);
+ } else {
+ return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, Ins,
+ dl, DAG, InVals);
+ }
}
-SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget,
- TargetMachine &TM) {
- // See PPCTargetLowering::LowerFORMAL_ARGUMENTS_SVR4() for a description
- // of the SVR4 ABI stack frame layout.
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- bool isVarArg = TheCall->isVarArg();
- unsigned CC = TheCall->getCallingConv();
- assert((CC == CallingConv::C ||
- CC == CallingConv::Fast) && "Unknown calling convention!");
- bool isTailCall = TheCall->isTailCall()
- && CC == CallingConv::Fast && PerformTailCallOpt;
- SDValue Callee = TheCall->getCallee();
- DebugLoc dl = TheCall->getDebugLoc();
-
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+SDValue
+PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+ // of the 32-bit SVR4 ABI stack frame layout.
+
+ assert((!isTailCall ||
+ (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
+ "IsEligibleForTailCallOptimization missed a case!");
+
+ assert((CallConv == CallingConv::C ||
+ CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
unsigned PtrByteSize = 4;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2549,7 +2599,7 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
// and restoring the caller's stack pointer in this function's epilog. This is
// done because the tail-called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
- if (PerformTailCallOpt && CC==CallingConv::Fast)
+ if (PerformTailCallOpt && CallConv==CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
@@ -2558,7 +2608,8 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(PPCFrameInfo::getLinkageSize(false, false), PtrByteSize);
@@ -2567,15 +2618,14 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
- unsigned NumArgs = TheCall->getNumArgs();
- unsigned NumFixedArgs = TheCall->getNumFixedArgs();
+ unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
- MVT ArgVT = TheCall->getArg(i).getValueType();
- ISD::ArgFlagsTy ArgFlags = TheCall->getArgFlags(i);
+ EVT ArgVT = Outs[i].Val.getValueType();
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
- if (i < NumFixedArgs) {
+ if (Outs[i].IsFixed) {
Result = CC_PPC_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
CCInfo);
} else {
@@ -2584,24 +2634,27 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
}
if (Result) {
- cerr << "Call operand #" << i << " has unhandled type "
- << ArgVT.getMVTString() << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "Call operand #" << i << " has unhandled type "
+ << ArgVT.getEVTString() << "\n";
+#endif
+ llvm_unreachable(0);
}
}
} else {
// All arguments are treated the same.
- CCInfo.AnalyzeCallOperands(TheCall, CC_PPC_SVR4);
+ CCInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4);
}
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
- CCState CCByValInfo(CC, isVarArg, getTargetMachine(), ByValArgLocs);
+ CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), ByValArgLocs,
+ *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
- CCByValInfo.AnalyzeCallOperands(TheCall, CC_PPC_SVR4_ByVal);
+ CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4_ByVal);
// Size of the linkage area, parameter list area and the part of the local
// space variable where copies of aggregates which are passed by value are
@@ -2637,8 +2690,8 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
i != e;
++i) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = TheCall->getArg(i);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
@@ -2712,7 +2765,7 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
// Set CR6 to true if this is a vararg call.
if (isVarArg) {
- SDValue SetCR(DAG.getTargetNode(PPC::CRSET, dl, MVT::i32), 0);
+ SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
Chain = DAG.getCopyToReg(Chain, dl, PPC::CR1EQ, SetCR, InFlag);
InFlag = Chain.getValue(1);
}
@@ -2722,24 +2775,23 @@ SDValue PPCTargetLowering::LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
false, TailCallArguments);
}
- return FinishCall(DAG, TheCall, TM, RegsToPass, Op, InFlag, Chain, Callee,
- SPDiff, NumBytes);
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+ Ins, InVals);
}
-SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget,
- TargetMachine &TM) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- bool isVarArg = TheCall->isVarArg();
- unsigned CC = TheCall->getCallingConv();
- bool isTailCall = TheCall->isTailCall()
- && CC == CallingConv::Fast && PerformTailCallOpt;
- SDValue Callee = TheCall->getCallee();
- unsigned NumOps = TheCall->getNumArgs();
- DebugLoc dl = TheCall->getDebugLoc();
-
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ unsigned NumOps = Outs.size();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
@@ -2750,7 +2802,7 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
// and restoring the caller's stack pointer in this function's epilog. This is
// done because the tail-called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
- if (PerformTailCallOpt && CC==CallingConv::Fast)
+ if (PerformTailCallOpt && CallConv==CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
unsigned nAltivecParamsAtEnd = 0;
@@ -2759,13 +2811,19 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
unsigned NumBytes =
- CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CC, TheCall,
+ CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv,
+ Outs,
nAltivecParamsAtEnd);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
@@ -2801,7 +2859,7 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
- static const unsigned *FPR = GetFPR(Subtarget);
+ static const unsigned *FPR = GetFPR();
static const unsigned VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
@@ -2818,9 +2876,8 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
- bool inMem = false;
- SDValue Arg = TheCall->getArg(i);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
@@ -2843,7 +2900,7 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
if (Size==1 || Size==2) {
// Very small objects are passed right-justified.
// Everything else is passed left-justified.
- MVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+ EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
NULL, 0, VT);
@@ -2895,8 +2952,8 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
continue;
}
- switch (Arg.getValueType().getSimpleVT()) {
- default: assert(0 && "Unexpected ValueType for argument!");
+ switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
@@ -2905,7 +2962,6 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, false, MemOpChains,
TailCallArguments, dl);
- inMem = true;
}
ArgOffset += PtrByteSize;
break;
@@ -2945,7 +3001,6 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, false, MemOpChains,
TailCallArguments, dl);
- inMem = true;
}
if (isPPC64)
ArgOffset += 8;
@@ -3017,8 +3072,8 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
ArgOffset = ((ArgOffset+15)/16)*16;
ArgOffset += 12*16;
for (unsigned i = 0; i != NumOps; ++i) {
- SDValue Arg = TheCall->getArg(i);
- MVT ArgType = Arg.getValueType();
+ SDValue Arg = Outs[i].Val;
+ EVT ArgType = Arg.getValueType();
if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
if (++j > NumVRs) {
@@ -3051,18 +3106,21 @@ SDValue PPCTargetLowering::LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
FPOp, true, TailCallArguments);
}
- return FinishCall(DAG, TheCall, TM, RegsToPass, Op, InFlag, Chain, Callee,
- SPDiff, NumBytes);
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+ Ins, InVals);
}
-SDValue PPCTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG,
- TargetMachine &TM) {
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
- CCState CCInfo(CC, isVarArg, TM, RVLocs);
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_PPC);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
@@ -3071,37 +3129,6 @@ SDValue PPCTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG,
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- SDValue Chain = Op.getOperand(0);
-
- Chain = GetPossiblePreceedingTailCall(Chain, PPCISD::TAILCALL);
- if (Chain.getOpcode() == PPCISD::TAILCALL) {
- SDValue TailCall = Chain;
- SDValue TargetAddress = TailCall.getOperand(1);
- SDValue StackAdjustment = TailCall.getOperand(2);
-
- assert(((TargetAddress.getOpcode() == ISD::Register &&
- cast<RegisterSDNode>(TargetAddress)->getReg() == PPC::CTR) ||
- TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
- TargetAddress.getOpcode() == ISD::TargetGlobalAddress ||
- isa<ConstantSDNode>(TargetAddress)) &&
- "Expecting an global address, external symbol, absolute value or register");
-
- assert(StackAdjustment.getOpcode() == ISD::Constant &&
- "Expecting a const value");
-
- SmallVector<SDValue,8> Operands;
- Operands.push_back(Chain.getOperand(0));
- Operands.push_back(TargetAddress);
- Operands.push_back(StackAdjustment);
- // Copy registers used by the call. Last operand is a flag so it is not
- // copied.
- for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
- Operands.push_back(Chain.getOperand(i));
- }
- return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Operands[0],
- Operands.size());
- }
-
SDValue Flag;
// Copy the result values into the output registers.
@@ -3109,7 +3136,7 @@ SDValue PPCTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG,
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
Flag = Chain.getValue(1);
}
@@ -3125,7 +3152,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
DebugLoc dl = Op.getDebugLoc();
// Get the correct type for pointers.
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Construct the stack pointer operand.
bool IsPPC64 = Subtarget.isPPC64();
@@ -3153,7 +3180,7 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool IsPPC64 = PPCSubTarget.isPPC64();
bool isDarwinABI = PPCSubTarget.isDarwinABI();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -3177,7 +3204,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool IsPPC64 = PPCSubTarget.isPPC64();
bool isDarwinABI = PPCSubTarget.isDarwinABI();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -3207,7 +3234,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DebugLoc dl = Op.getDebugLoc();
// Get the correct type for pointers.
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, PtrVT), Size);
@@ -3232,8 +3259,8 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
// Cannot handle SETEQ/SETNE.
if (CC == ISD::SETEQ || CC == ISD::SETNE) return Op;
- MVT ResVT = Op.getValueType();
- MVT CmpVT = Op.getOperand(0).getValueType();
+ EVT ResVT = Op.getValueType();
+ EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
DebugLoc dl = Op.getDebugLoc();
@@ -3302,8 +3329,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
- switch (Op.getValueType().getSimpleVT()) {
- default: assert(0 && "Unhandled FP_TO_INT type in custom expander!");
+ switch (Op.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
PPCISD::FCTIDZ,
@@ -3350,20 +3377,23 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
// 64-bit registers. In particular, sign extend the input value into the
// 64-bit register with extsw, store the WHOLE 64-bit value into the stack
// then lfd it and fcfid it.
- MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *FrameInfo = MF.getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(8, 8);
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(PPCISD::EXTSW_32, dl, MVT::i32,
Op.getOperand(0));
// STD the extended value into the stack slot.
- MachineMemOperand MO(PseudoSourceValue::getFixedStack(FrameIdx),
- MachineMemOperand::MOStore, 0, 8, 8);
- SDValue Store = DAG.getNode(PPCISD::STD_32, dl, MVT::Other,
- DAG.getEntryNode(), Ext64, FIdx,
- DAG.getMemOperand(MO));
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx),
+ MachineMemOperand::MOStore, 0, 8, 8);
+ SDValue Ops[] = { DAG.getEntryNode(), Ext64, FIdx };
+ SDValue Store =
+ DAG.getMemIntrinsicNode(PPCISD::STD_32, dl, DAG.getVTList(MVT::Other),
+ Ops, array_lengthof(Ops), MVT::i64, MMO);
// Load the value as a double.
SDValue Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, NULL, 0);
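The hunk above is the template for this patch's memory-node updates: the by-value MachineMemOperand passed in the operand list gives way to a MachineFunction-allocated MachineMemOperand attached to a memory intrinsic node. A minimal sketch of the pattern using the same calls, with Chain/Value/Ptr standing in for whatever the caller has in hand (those names are illustrative, not from the patch):

    MachineMemOperand *MMO =
      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx),
                              MachineMemOperand::MOStore, /*Offset=*/0,
                              /*Size=*/8, /*Alignment=*/8);
    SDValue Ops[] = { Chain, Value, Ptr };   // chain, value to store, address
    SDValue Node =
      DAG.getMemIntrinsicNode(PPCISD::STD_32, dl, DAG.getVTList(MVT::Other),
                              Ops, array_lengthof(Ops), MVT::i64, MMO);

Using array_lengthof keeps the operand count in sync with Ops, matching the STBRX combine later in this file.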
@@ -3396,9 +3426,9 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
*/
MachineFunction &MF = DAG.getMachineFunction();
- MVT VT = Op.getValueType();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
- std::vector<MVT> NodeTys;
+ EVT VT = Op.getValueType();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ std::vector<EVT> NodeTys;
SDValue MFFSreg, InFlag;
// Save FP Control Word to register
@@ -3437,7 +3467,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
assert(Op.getNumOperands() == 3 &&
@@ -3449,7 +3479,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) {
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
- MVT AmtVT = Amt.getValueType();
+ EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, AmtVT), Amt);
@@ -3466,7 +3496,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) {
}
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
@@ -3478,7 +3508,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) {
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
- MVT AmtVT = Amt.getValueType();
+ EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, AmtVT), Amt);
@@ -3496,7 +3526,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) {
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
@@ -3506,7 +3536,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) {
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
- MVT AmtVT = Amt.getValueType();
+ EVT AmtVT = Amt.getValueType();
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
DAG.getConstant(BitWidth, AmtVT), Amt);
@@ -3529,21 +3559,21 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) {
/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize. Cast the result to VT.
-static SDValue BuildSplatI(int Val, unsigned SplatSize, MVT VT,
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
SelectionDAG &DAG, DebugLoc dl) {
assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
- static const MVT VTys[] = { // canonical VT to use for each size.
+ static const EVT VTys[] = { // canonical VT to use for each size.
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
};
- MVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+ EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
// Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
if (Val == -1)
SplatSize = 1;
- MVT CanonicalVT = VTys[SplatSize-1];
+ EVT CanonicalVT = VTys[SplatSize-1];
// Build a canonical splat for this value.
SDValue Elt = DAG.getConstant(Val, MVT::i32);
@@ -3558,7 +3588,7 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, MVT VT,
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
SelectionDAG &DAG, DebugLoc dl,
- MVT DestVT = MVT::Other) {
+ EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, MVT::i32), LHS, RHS);
@@ -3568,7 +3598,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
SDValue Op2, SelectionDAG &DAG,
- DebugLoc dl, MVT DestVT = MVT::Other) {
+ DebugLoc dl, EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
@@ -3578,7 +3608,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
- MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ EVT VT, SelectionDAG &DAG, DebugLoc dl) {
// Force LHS/RHS to be the right type.
LHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, LHS);
RHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, RHS);
@@ -3789,7 +3819,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
int ShufIdxs[16];
switch (OpNum) {
- default: assert(0 && "Unknown i32 permute!");
+ default: llvm_unreachable("Unknown i32 permute!");
case OP_VMRGHW:
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
@@ -3825,7 +3855,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
case OP_VSLDOI12:
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
}
- MVT VT = OpLHS.getValueType();
+ EVT VT = OpLHS.getValueType();
OpLHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpLHS);
OpRHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpRHS);
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
@@ -3842,7 +3872,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
@@ -3939,7 +3969,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
- MVT EltVT = V1.getValueType().getVectorElementType();
+ EVT EltVT = V1.getValueType().getVectorElementType();
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
SmallVector<SDValue, 16> ResultMask;
@@ -4026,7 +4056,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(3), // RHS
DAG.getConstant(CompareOpc, MVT::i32)
};
- std::vector<MVT> VTs;
+ std::vector<EVT> VTs;
VTs.push_back(Op.getOperand(2).getValueType());
VTs.push_back(MVT::Flag);
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
@@ -4076,7 +4106,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16);
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
@@ -4141,8 +4171,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
}
return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
} else {
- assert(0 && "Unknown mul to lower!");
- abort();
+ llvm_unreachable("Unknown mul to lower!");
}
}
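The assert(0)/abort() pairs removed throughout this patch all become single llvm_unreachable calls. A minimal standalone sketch of the idiom (the switch and its cases are placeholders, not code from this tree):

    #include "llvm/Support/ErrorHandling.h"

    static unsigned lowerKind(unsigned Opcode) {
      switch (Opcode) {
      case 0: return 1;   // placeholder cases, illustrative only
      case 1: return 2;
      default:
        // One call replaces 'assert(0 && "..."); abort();': the message is
        // preserved, and the path stays guarded even in builds where
        // assertions are compiled out.
        llvm_unreachable("Unknown opcode!");
      }
    }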
@@ -4150,7 +4179,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- default: assert(0 && "Wasn't expecting to be able to lower this!");
+ default: llvm_unreachable("Wasn't expecting to be able to lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -4165,24 +4194,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
return LowerVAARG(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
- case ISD::FORMAL_ARGUMENTS:
- if (PPCSubTarget.isSVR4ABI()) {
- return LowerFORMAL_ARGUMENTS_SVR4(Op, DAG, VarArgsFrameIndex,
- VarArgsStackOffset, VarArgsNumGPR,
- VarArgsNumFPR, PPCSubTarget);
- } else {
- return LowerFORMAL_ARGUMENTS_Darwin(Op, DAG, VarArgsFrameIndex,
- PPCSubTarget);
- }
-
- case ISD::CALL:
- if (PPCSubTarget.isSVR4ABI()) {
- return LowerCALL_SVR4(Op, DAG, PPCSubTarget, getTargetMachine());
- } else {
- return LowerCALL_Darwin(Op, DAG, PPCSubTarget, getTargetMachine());
- }
-
- case ISD::RET: return LowerRET(Op, DAG, getTargetMachine());
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
@@ -4234,7 +4245,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
// This sequence changes FPSCR to do round-to-zero, adds the two halves
// of the long double, and puts FPSCR back the way it was. We do not
// actually model FPSCR.
- std::vector<MVT> NodeTys;
+ std::vector<EVT> NodeTys;
SDValue Ops[4], Result, MFFSreg, InFlag, FPreg;
NodeTys.push_back(MVT::f64); // Return register
@@ -4480,7 +4491,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
// To "insert" these instructions we actually have to insert their
@@ -4516,9 +4528,18 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
- // Update machine-CFG edges by transferring all successors of the current
+ // Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- sinkMBB->transferSuccessors(BB);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
+ // Next, remove all successors of the current block.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
@@ -4812,7 +4833,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB = exitMBB;
BuildMI(BB, dl, TII->get(PPC::SRW),dest).addReg(TmpReg).addReg(ShiftReg);
} else {
- assert(0 && "Unexpected instr type to insert");
+ llvm_unreachable("Unexpected instr type to insert");
}
F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
@@ -4903,7 +4924,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
- if (N->getOperand(1).getOpcode() == ISD::BSWAP &&
+ if (cast<StoreSDNode>(N)->isUnindexed() &&
+ N->getOperand(1).getOpcode() == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
(N->getOperand(1).getValueType() == MVT::i32 ||
N->getOperand(1).getValueType() == MVT::i16)) {
@@ -4912,9 +4934,15 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
- return DAG.getNode(PPCISD::STBRX, dl, MVT::Other, N->getOperand(0),
- BSwapOp, N->getOperand(2), N->getOperand(3),
- DAG.getValueType(N->getOperand(1).getValueType()));
+ SDValue Ops[] = {
+ N->getOperand(0), BSwapOp, N->getOperand(2),
+ DAG.getValueType(N->getOperand(1).getValueType())
+ };
+ return
+ DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+ Ops, array_lengthof(Ops),
+ cast<StoreSDNode>(N)->getMemoryVT(),
+ cast<StoreSDNode>(N)->getMemOperand());
}
break;
case ISD::BSWAP:
@@ -4925,17 +4953,15 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
- std::vector<MVT> VTs;
- VTs.push_back(MVT::i32);
- VTs.push_back(MVT::Other);
- SDValue MO = DAG.getMemOperand(LD->getMemOperand());
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
- MO, // MemOperand
DAG.getValueType(N->getValueType(0)) // VT
};
- SDValue BSLoad = DAG.getNode(PPCISD::LBRX, dl, VTs, Ops, 4);
+ SDValue BSLoad =
+ DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops, 3,
+ LD->getMemoryVT(), LD->getMemOperand());
// If this is an i16 load, insert the truncate.
SDValue ResVal = BSLoad;
@@ -5035,7 +5061,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
// Create the PPCISD altivec 'dot' comparison node.
- std::vector<MVT> VTs;
+ std::vector<EVT> VTs;
SDValue Ops[] = {
LHS.getOperand(2), // LHS of compare
LHS.getOperand(3), // RHS of compare
@@ -5090,7 +5116,7 @@ void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
default: break;
case PPCISD::LBRX: {
// lhbrx is known to have the top bits cleared out.
- if (cast<VTSDNode>(Op.getOperand(3))->getVT() == MVT::i16)
+ if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
KnownZero = 0xFFFF0000;
break;
}
@@ -5138,7 +5164,7 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass*>
PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
@@ -5187,7 +5213,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
if (!CST) return; // Must be an immediate to match.
unsigned Value = CST->getZExtValue();
switch (Letter) {
- default: assert(0 && "Unknown constraint letter!");
+ default: llvm_unreachable("Unknown constraint letter!");
case 'I': // "I" is a signed 16-bit constant.
if ((short)Value == (int)Value)
Result = DAG.getTargetConstant(Value, Op.getValueType());
@@ -5304,7 +5330,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
return SDValue();
- MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = PtrVT == MVT::i64;
MachineFunction &MF = DAG.getMachineFunction();
@@ -5326,7 +5352,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
return false;
}
-MVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
bool isSrcConst, bool isSrcStr,
SelectionDAG &DAG) const {
if (this->PPCSubTarget.isPPC64()) {
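Most of the churn in this file is the MVT-to-EVT migration. A hedged sketch of how lowering code inspects types after the split: EVT carries either a simple MVT or a context-extended type, so integer switches go through the SimpleTy enum, exactly as the FP_TO_INT hunk above does (the helper below is illustrative, not part of the patch):

    static bool isSimpleIntType(SDValue Op) {
      EVT VT = Op.getValueType();
      if (!VT.isSimple())
        return false;                    // extended types need their own path
      switch (VT.getSimpleVT().SimpleTy) {
      case MVT::i32:
      case MVT::i64:
        return true;                     // the types this sketch handles
      default:
        return false;
      }
    }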
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 962bbb1..ac72d87 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -41,8 +41,7 @@ namespace llvm {
FCTIDZ, FCTIWZ,
/// STFIWX - The STFIWX instruction. The first operand is an input token
- /// chain, then an f64 value to store, then an address to store it to,
- /// then a SRCVALUE for the address.
+ /// chain, then an f64 value to store, then an address to store it to.
STFIWX,
// VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
@@ -60,6 +59,8 @@ namespace llvm {
/// though these are usually folded into other nodes.
Hi, Lo,
+ TOC_ENTRY,
+
/// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
/// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
/// compute an allocation on the stack.
@@ -78,12 +79,12 @@ namespace llvm {
/// registers.
EXTSW_32,
- /// STD_32 - This is the STD instruction for use with "32-bit" registers.
- STD_32,
-
/// CALL - A direct function call.
CALL_Darwin, CALL_SVR4,
+ /// NOP - Special NOP which follows 64-bit SVR4 calls.
+ NOP,
+
/// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
/// MTCTR instruction.
MTCTR,
@@ -119,18 +120,6 @@ namespace llvm {
/// an optional input flag argument.
COND_BRANCH,
- /// CHAIN = STBRX CHAIN, GPRC, Ptr, SRCVALUE, Type - This is a
- /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
- /// the GPRC input, then stores it through Ptr. Type can be either i16 or
- /// i32.
- STBRX,
-
- /// GPRC, CHAIN = LBRX CHAIN, Ptr, SRCVALUE, Type - This is a
- /// byte-swapping load instruction. It loads "Type" bits, byte swaps it,
- /// then puts it in the bottom bits of the GPRC. TYPE can be either i16
- /// or i32.
- LBRX,
-
// The following 5 instructions are used only as part of the
// long double-to-int conversion sequence.
@@ -160,14 +149,27 @@ namespace llvm {
/// indexed. This is used to implement atomic operations.
STCX,
- /// TAILCALL - Indicates a tail call should be taken.
- TAILCALL,
/// TC_RETURN - A tail call return.
/// operand #0 chain
/// operand #1 callee (register or absolute)
/// operand #2 stack adjustment
/// operand #3 optional in flag
- TC_RETURN
+ TC_RETURN,
+
+ /// STD_32 - This is the STD instruction for use with "32-bit" registers.
+ STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+ /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+ /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
+ /// the GPRC input, then stores it through Ptr. Type can be either i16 or
+ /// i32.
+ STBRX,
+
+ /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+ /// byte-swapping load instruction. It loads "Type" bits, byte-swaps the
+ /// value, then puts it in the bottom bits of the GPRC. Type can be either
+ /// i16 or i32.
+ LBRX
};
}
@@ -232,7 +234,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual MVT getSetCCResultType(MVT VT) const;
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
@@ -286,7 +288,8 @@ namespace llvm {
unsigned Depth = 0) const;
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
MachineBasicBlock *MBB, bool is64Bit,
unsigned BinOpcode) const;
@@ -297,7 +300,7 @@ namespace llvm {
ConstraintType getConstraintType(const std::string &Constraint) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
@@ -327,16 +330,16 @@ namespace llvm {
/// the offset of the target addressing mode.
virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
- /// IsEligibleForTailCallOptimization - Check whether the call is eligible
- /// for tail call optimization. Target which want to do tail call
- /// optimization should implement this function.
- virtual bool IsEligibleForTailCallOptimization(CallSDNode *TheCall,
- SDValue Ret,
- SelectionDAG &DAG) const;
+ virtual bool
+ IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
- virtual MVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+ virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
bool isSrcConst, bool isSrcStr,
SelectionDAG &DAG) const;
@@ -370,20 +373,6 @@ namespace llvm {
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG, int VarArgsFrameIndex,
int VarArgsStackOffset, unsigned VarArgsNumGPR,
unsigned VarArgsNumFPR, const PPCSubtarget &Subtarget);
- SDValue LowerFORMAL_ARGUMENTS_SVR4(SDValue Op, SelectionDAG &DAG,
- int &VarArgsFrameIndex,
- int &VarArgsStackOffset,
- unsigned &VarArgsNumGPR,
- unsigned &VarArgsNumFPR,
- const PPCSubtarget &Subtarget);
- SDValue LowerFORMAL_ARGUMENTS_Darwin(SDValue Op, SelectionDAG &DAG,
- int &VarArgsFrameIndex,
- const PPCSubtarget &Subtarget);
- SDValue LowerCALL_Darwin(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget, TargetMachine &TM);
- SDValue LowerCALL_SVR4(SDValue Op, SelectionDAG &DAG,
- const PPCSubtarget &Subtarget, TargetMachine &TM);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM);
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget);
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
@@ -400,6 +389,71 @@ namespace llvm {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG);
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool isTailCall,
+ bool isVarArg,
+ SelectionDAG &DAG,
+ SmallVector<std::pair<unsigned, SDValue>, 8>
+ &RegsToPass,
+ SDValue InFlag, SDValue Chain,
+ SDValue &Callee,
+ int SPDiff, unsigned NumBytes,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
+ SDValue
+ LowerFormalArguments_Darwin(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue
+ LowerFormalArguments_SVR4(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ SDValue
+ LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue
+ LowerCall_SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
};
}
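The header now declares the per-target calling-convention hooks (LowerFormalArguments, LowerCall, LowerReturn) in place of the retired LowerFORMAL_ARGUMENTS/LowerCALL/LowerRET entry points. As a hedged reconstruction only, here is the shape a LowerReturn body takes, consistent with the LowerRET hunk earlier in this diff (RetCC_PPC is the tablegen-generated assignment rule; the body is illustrative, not the committed one):

    SDValue
    PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   DebugLoc dl, SelectionDAG &DAG) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                     *DAG.getContext());
      CCInfo.AnalyzeReturn(Outs, RetCC_PPC);

      SDValue Flag;
      for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
        // Copy each result into its assigned register, as the earlier hunk
        // does with Outs[i].Val.
        Chain = DAG.getCopyToReg(Chain, dl, RVLocs[i].getLocReg(),
                                 Outs[i].Val, Flag);
        Flag = Chain.getValue(1);
      }
      if (Flag.getNode())
        return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
      return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain);
    }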
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 3823e53..0f68fb9 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -68,7 +68,7 @@ let isCall = 1, PPC970_Unit = 7,
F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
LR8,CTR8,
- CR0,CR1,CR5,CR6,CR7] in {
+ CR0,CR1,CR5,CR6,CR7,CARRY] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_Darwin : IForm<18, 0, 1,
@@ -94,7 +94,7 @@ let isCall = 1, PPC970_Unit = 7,
F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
LR8,CTR8,
- CR0,CR1,CR5,CR6,CR7] in {
+ CR0,CR1,CR5,CR6,CR7,CARRY] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_ELF : IForm<18, 0, 1,
@@ -123,6 +123,8 @@ def : Pat<(PPCcall_SVR4 (i64 tglobaladdr:$dst)),
(BL8_ELF tglobaladdr:$dst)>;
def : Pat<(PPCcall_SVR4 (i64 texternalsym:$dst)),
(BL8_ELF texternalsym:$dst)>;
+def : Pat<(PPCnop),
+ (NOP)>;
// Atomic operations
let usesCustomDAGSchedInserter = 1 in {
@@ -327,14 +329,15 @@ def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
"add $rT, $rA, $rB", IntGeneral,
[(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>;
+let Defs = [CARRY] in {
def ADDC8 : XOForm_1<31, 10, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
"addc $rT, $rA, $rB", IntGeneral,
[(set G8RC:$rT, (addc G8RC:$rA, G8RC:$rB))]>,
PPC970_DGroup_Cracked;
-def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
- "adde $rT, $rA, $rB", IntGeneral,
- [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>;
-
+def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+ "addic $rD, $rA, $imm", IntGeneral,
+ [(set G8RC:$rD, (addc G8RC:$rA, immSExt16:$imm))]>;
+}
def ADDI8 : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
"addi $rD, $rA, $imm", IntGeneral,
[(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
@@ -342,36 +345,41 @@ def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm),
"addis $rD, $rA, $imm", IntGeneral,
[(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>;
+let Defs = [CARRY] in {
def SUBFIC8: DForm_2< 8, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
"subfic $rD, $rA, $imm", IntGeneral,
[(set G8RC:$rD, (subc immSExt16:$imm, G8RC:$rA))]>;
-def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
- "subf $rT, $rA, $rB", IntGeneral,
- [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
def SUBFC8 : XOForm_1<31, 8, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
"subfc $rT, $rA, $rB", IntGeneral,
[(set G8RC:$rT, (subc G8RC:$rB, G8RC:$rA))]>,
PPC970_DGroup_Cracked;
-
-def SUBFE8 : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
- "subfe $rT, $rA, $rB", IntGeneral,
- [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>;
+}
+def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "subf $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
+def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+ "neg $rT, $rA", IntGeneral,
+ [(set G8RC:$rT, (ineg G8RC:$rA))]>;
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "adde $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>;
def ADDME8 : XOForm_3<31, 234, 0, (outs G8RC:$rT), (ins G8RC:$rA),
"addme $rT, $rA", IntGeneral,
[(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>;
def ADDZE8 : XOForm_3<31, 202, 0, (outs G8RC:$rT), (ins G8RC:$rA),
"addze $rT, $rA", IntGeneral,
[(set G8RC:$rT, (adde G8RC:$rA, 0))]>;
-def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
- "neg $rT, $rA", IntGeneral,
- [(set G8RC:$rT, (ineg G8RC:$rA))]>;
+def SUBFE8 : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+ "subfe $rT, $rA, $rB", IntGeneral,
+ [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>;
def SUBFME8 : XOForm_3<31, 232, 0, (outs G8RC:$rT), (ins G8RC:$rA),
"subfme $rT, $rA", IntGeneral,
[(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>;
def SUBFZE8 : XOForm_3<31, 200, 0, (outs G8RC:$rT), (ins G8RC:$rA),
"subfze $rT, $rA", IntGeneral,
[(set G8RC:$rT, (sube 0, G8RC:$rA))]>;
-
+}
def MULHD : XOForm_1<31, 73, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
@@ -396,9 +404,11 @@ def SLD : XForm_6<31, 27, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
def SRD : XForm_6<31, 539, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
"srd $rA, $rS, $rB", IntRotateD,
[(set G8RC:$rA, (PPCsrl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+let Defs = [CARRY] in {
def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
"srad $rA, $rS, $rB", IntRotateD,
[(set G8RC:$rA, (PPCsra G8RC:$rS, GPRC:$rB))]>, isPPC64;
+}
def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS),
"extsb $rA, $rS", IntGeneral,
@@ -418,9 +428,11 @@ def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
"extsw $rA, $rS", IntGeneral,
[(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64;
+let Defs = [CARRY] in {
def SRADI : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
"sradi $rA, $rS, $SH", IntRotateD,
[(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
+}
def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
"cntlzd $rA, $rS", IntGeneral,
[(set G8RC:$rA, (ctlz G8RC:$rS))]>;
@@ -543,6 +555,10 @@ let canFoldAsLoad = 1, PPC970_Unit = 2 in {
def LD : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
"ld $rD, $src", LdStLD,
[(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+def LDtoc: DSForm_1<58, 0, (outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+ "ld $rD, $disp($reg)", LdStLD,
+ [(set G8RC:$rD,
+ (PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
def LDX : XForm_1<31, 21, (outs G8RC:$rD), (ins memrr:$src),
"ldx $rD, $src", LdStLD,
[(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
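LDtoc gives the 64-bit SVR4 ABI a pattern for TOC-relative loads of global addresses. A hypothetical lowering fragment that would feed it (GV, dl, and DAG are assumed in scope; this sketches the intended node shape, not code from the patch): the global is wrapped in a PPCISD::TOC_ENTRY anchored on X2, the reserved TOC pointer, which the pattern above then matches to an 'ld' from the TOC:

    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();   // i64 here
    SDValue GA  = DAG.getTargetGlobalAddress(GV, PtrVT);
    SDValue TOC = DAG.getRegister(PPC::X2, MVT::i64);         // TOC pointer
    SDValue Val = DAG.getNode(PPCISD::TOC_ENTRY, dl, MVT::i64, GA, TOC);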
diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
index 1de6911..b424d11 100644
--- a/lib/Target/PowerPC/PPCInstrBuilder.h
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -29,7 +29,7 @@ namespace llvm {
/// reference has base register as the FrameIndex offset until it is resolved.
/// This allows a constant offset to be specified as well...
///
-inline const MachineInstrBuilder&
+static inline const MachineInstrBuilder&
addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
bool mem = true) {
if (mem)
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 87c612a..0083598 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -20,7 +20,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
extern cl::opt<bool> EnablePPC32RS; // FIXME (64-bit): See PPCRegisterInfo.cpp.
@@ -485,8 +487,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
.addReg(PPC::R0)
.addReg(PPC::R0));
} else {
- assert(0 && "Unknown regclass!");
- abort();
+ llvm_unreachable("Unknown regclass!");
}
return false;
@@ -509,45 +510,6 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MBB.insert(MI, NewMIs[i]);
}
-void PPCInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const{
- if (Addr[0].isFI()) {
- if (StoreRegToStackSlot(MF, SrcReg, isKill,
- Addr[0].getIndex(), RC, NewMIs)) {
- PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- FuncInfo->setSpillsCR();
- }
-
- return;
- }
-
- DebugLoc DL = DebugLoc::getUnknownLoc();
- unsigned Opc = 0;
- if (RC == PPC::GPRCRegisterClass) {
- Opc = PPC::STW;
- } else if (RC == PPC::G8RCRegisterClass) {
- Opc = PPC::STD;
- } else if (RC == PPC::F8RCRegisterClass) {
- Opc = PPC::STFD;
- } else if (RC == PPC::F4RCRegisterClass) {
- Opc = PPC::STFS;
- } else if (RC == PPC::VRRCRegisterClass) {
- Opc = PPC::STVX;
- } else {
- assert(0 && "Unknown regclass!");
- abort();
- }
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc))
- .addReg(SrcReg, getKillRegState(isKill));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
void
PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
unsigned DestReg, int FrameIdx,
@@ -634,8 +596,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(PPC::R0)
.addReg(PPC::R0));
} else {
- assert(0 && "Unknown regclass!");
- abort();
+ llvm_unreachable("Unknown regclass!");
}
}
@@ -653,41 +614,6 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MBB.insert(MI, NewMIs[i]);
}
-void PPCInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs)const{
- if (Addr[0].isFI()) {
- LoadRegFromStackSlot(MF, DebugLoc::getUnknownLoc(),
- DestReg, Addr[0].getIndex(), RC, NewMIs);
- return;
- }
-
- unsigned Opc = 0;
- if (RC == PPC::GPRCRegisterClass) {
- assert(DestReg != PPC::LR && "Can't handle this yet!");
- Opc = PPC::LWZ;
- } else if (RC == PPC::G8RCRegisterClass) {
- assert(DestReg != PPC::LR8 && "Can't handle this yet!");
- Opc = PPC::LD;
- } else if (RC == PPC::F8RCRegisterClass) {
- Opc = PPC::LFD;
- } else if (RC == PPC::F4RCRegisterClass) {
- Opc = PPC::LFS;
- } else if (RC == PPC::VRRCRegisterClass) {
- Opc = PPC::LVX;
- } else {
- assert(0 && "Unknown regclass!");
- abort();
- }
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
-}
-
/// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
/// copy instructions, turning them into load/store instructions.
MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -842,7 +768,7 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
case PPC::INLINEASM: { // Inline Asm: Variable size.
const MachineFunction *MF = MI->getParent()->getParent();
const char *AsmStr = MI->getOperand(0).getSymbolName();
- return MF->getTarget().getTargetAsmInfo()->getInlineAsmLength(AsmStr);
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
case PPC::DBG_LABEL:
case PPC::EH_LABEL:
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 492634c..bb0dc15a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -121,20 +121,10 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
/// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
/// copy instructions, turning them into load/store instructions.
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 7af59a2..dc5db6f 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -35,11 +35,11 @@ def SDT_PPCcondbr : SDTypeProfile<0, 3, [
SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
]>;
-def SDT_PPClbrx : SDTypeProfile<1, 3, [
- SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
]>;
-def SDT_PPCstbrx : SDTypeProfile<0, 4, [
- SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+ SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
]>;
def SDT_PPClarx : SDTypeProfile<1, 1, [
@@ -53,6 +53,8 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
SDTCisPtrTy<0>, SDTCisVT<1, i32>
]>;
+def SDT_PPCnop : SDTypeProfile<0, 0, []>;
+
//===----------------------------------------------------------------------===//
// PowerPC specific DAG Nodes.
//
@@ -85,6 +87,7 @@ def PPCfsel : SDNode<"PPCISD::FSEL",
def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
@@ -111,6 +114,7 @@ def PPCcall_Darwin : SDNode<"PPCISD::CALL_Darwin", SDT_PPCCall,
[SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
def PPCcall_SVR4 : SDNode<"PPCISD::CALL_SVR4", SDT_PPCCall,
[SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInFlag, SDNPOutFlag]>;
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
[SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
def PPCbctrl_Darwin : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
@@ -125,9 +129,6 @@ def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
[SDNPHasChain, SDNPOptInFlag]>;
-def PPCtailcall : SDNode<"PPCISD::TAILCALL", SDT_PPCCall,
- [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
-
def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutFlag]>;
@@ -309,6 +310,10 @@ def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits.
let PrintMethod = "printMemRegImmShifted";
let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
}
+def tocentry : Operand<iPTR> {
+ let PrintMethod = "printTOCEntryLabel";
+ let MIOperandInfo = (ops i32imm:$imm);
+}
// PowerPC Predicate operand. 20 = (0<<5)|20 = always, CR0 is a dummy reg
// that doesn't matter.
@@ -421,7 +426,7 @@ let isCall = 1, PPC970_Unit = 7,
LR,CTR,
CR0,CR1,CR5,CR6,CR7,
CR0LT,CR0GT,CR0EQ,CR0UN,CR1LT,CR1GT,CR1EQ,CR1UN,CR5LT,CR5GT,CR5EQ,
- CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN] in {
+ CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN,CARRY] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL_Darwin : IForm<18, 0, 1,
@@ -448,7 +453,7 @@ let isCall = 1, PPC970_Unit = 7,
LR,CTR,
CR0,CR1,CR5,CR6,CR7,
CR0LT,CR0GT,CR0EQ,CR0UN,CR1LT,CR1GT,CR1EQ,CR1UN,CR5LT,CR5GT,CR5EQ,
- CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN] in {
+ CR5UN,CR6LT,CR6GT,CR6EQ,CR6UN,CR7LT,CR7GT,CR7EQ,CR7UN,CARRY] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL_SVR4 : IForm<18, 0, 1,
@@ -736,10 +741,10 @@ def LWZX : XForm_1<31, 23, (outs GPRC:$rD), (ins memrr:$src),
def LHBRX : XForm_1<31, 790, (outs GPRC:$rD), (ins memrr:$src),
"lhbrx $rD, $src", LdStGeneral,
- [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i16))]>;
+ [(set GPRC:$rD, (PPClbrx xoaddr:$src, i16))]>;
def LWBRX : XForm_1<31, 534, (outs GPRC:$rD), (ins memrr:$src),
"lwbrx $rD, $src", LdStGeneral,
- [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i32))]>;
+ [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
"lfsx $frD, $src", LdStLFDU,
@@ -832,11 +837,11 @@ def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB),
}
def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst),
"sthbrx $rS, $dst", LdStGeneral,
- [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i16)]>,
+ [(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>,
PPC970_DGroup_Cracked;
def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
"stwbrx $rS, $dst", LdStGeneral,
- [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i32)]>,
+ [(PPCstbrx GPRC:$rS, xoaddr:$dst, i32)]>,
PPC970_DGroup_Cracked;
def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
@@ -864,6 +869,7 @@ let PPC970_Unit = 1 in { // FXU Operations.
def ADDI : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
"addi $rD, $rA, $imm", IntGeneral,
[(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
+let Defs = [CARRY] in {
def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
"addic $rD, $rA, $imm", IntGeneral,
[(set GPRC:$rD, (addc GPRC:$rA, immSExt16:$imm))]>,
@@ -871,6 +877,7 @@ def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
"addic. $rD, $rA, $imm", IntGeneral,
[]>;
+}
def ADDIS : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm),
"addis $rD, $rA, $imm", IntGeneral,
[(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>;
@@ -881,9 +888,11 @@ def LA : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym),
def MULLI : DForm_2< 7, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
"mulli $rD, $rA, $imm", IntMulLI,
[(set GPRC:$rD, (mul GPRC:$rA, immSExt16:$imm))]>;
+let Defs = [CARRY] in {
def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
"subfic $rD, $rA, $imm", IntGeneral,
[(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
+}
let isReMaterializable = 1 in {
def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
@@ -956,15 +965,19 @@ def SLW : XForm_6<31, 24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
def SRW : XForm_6<31, 536, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
"srw $rA, $rS, $rB", IntGeneral,
[(set GPRC:$rA, (PPCsrl GPRC:$rS, GPRC:$rB))]>;
+let Defs = [CARRY] in {
def SRAW : XForm_6<31, 792, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
"sraw $rA, $rS, $rB", IntShift,
[(set GPRC:$rA, (PPCsra GPRC:$rS, GPRC:$rB))]>;
}
+}
let PPC970_Unit = 1 in { // FXU Operations.
+let Defs = [CARRY] in {
def SRAWI : XForm_10<31, 824, (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH),
"srawi $rA, $rS, $SH", IntShift,
[(set GPRC:$rA, (sra GPRC:$rS, (i32 imm:$SH)))]>;
+}
def CNTLZW : XForm_11<31, 26, (outs GPRC:$rA), (ins GPRC:$rS),
"cntlzw $rA, $rS", IntGeneral,
[(set GPRC:$rA, (ctlz GPRC:$rS))]>;
@@ -1159,13 +1172,12 @@ let PPC970_Unit = 1 in { // FXU Operations.
def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
"add $rT, $rA, $rB", IntGeneral,
[(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;
+let Defs = [CARRY] in {
def ADDC : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
"addc $rT, $rA, $rB", IntGeneral,
[(set GPRC:$rT, (addc GPRC:$rA, GPRC:$rB))]>,
PPC970_DGroup_Cracked;
-def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
- "adde $rT, $rA, $rB", IntGeneral,
- [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>;
+}
def DIVW : XOForm_1<31, 491, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
"divw $rT, $rA, $rB", IntDivW,
[(set GPRC:$rT, (sdiv GPRC:$rA, GPRC:$rB))]>,
@@ -1186,22 +1198,28 @@ def MULLW : XOForm_1<31, 235, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
def SUBF : XOForm_1<31, 40, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
"subf $rT, $rA, $rB", IntGeneral,
[(set GPRC:$rT, (sub GPRC:$rB, GPRC:$rA))]>;
+let Defs = [CARRY] in {
def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
"subfc $rT, $rA, $rB", IntGeneral,
[(set GPRC:$rT, (subc GPRC:$rB, GPRC:$rA))]>,
PPC970_DGroup_Cracked;
-def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
- "subfe $rT, $rA, $rB", IntGeneral,
- [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>;
+}
+def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+ "neg $rT, $rA", IntGeneral,
+ [(set GPRC:$rT, (ineg GPRC:$rA))]>;
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "adde $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>;
def ADDME : XOForm_3<31, 234, 0, (outs GPRC:$rT), (ins GPRC:$rA),
"addme $rT, $rA", IntGeneral,
[(set GPRC:$rT, (adde GPRC:$rA, immAllOnes))]>;
def ADDZE : XOForm_3<31, 202, 0, (outs GPRC:$rT), (ins GPRC:$rA),
"addze $rT, $rA", IntGeneral,
[(set GPRC:$rT, (adde GPRC:$rA, 0))]>;
-def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
- "neg $rT, $rA", IntGeneral,
- [(set GPRC:$rT, (ineg GPRC:$rA))]>;
+def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+ "subfe $rT, $rA, $rB", IntGeneral,
+ [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>;
def SUBFME : XOForm_3<31, 232, 0, (outs GPRC:$rT), (ins GPRC:$rA),
"subfme $rT, $rA", IntGeneral,
[(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>;
@@ -1209,6 +1227,7 @@ def SUBFZE : XOForm_3<31, 200, 0, (outs GPRC:$rT), (ins GPRC:$rA),
"subfze $rT, $rA", IntGeneral,
[(set GPRC:$rT, (sube 0, GPRC:$rA))]>;
}
+}
// A-Form instructions. Most of the instructions executed in the FPU are of
// this type.
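The new Defs = [CARRY] / Uses = [CARRY] annotations make the XER carry bit visible to the register allocator and scheduler. What they protect is the addc/adde pairing: the carry produced by the low half of a wide add must survive until the high half consumes it. A DAG-level sketch (LHSLo/LHSHi/RHSLo/RHSHi are assumed i32 values, not names from the patch):

    SDVTList VTs   = DAG.getVTList(MVT::i32, MVT::Flag);
    SDValue  LoSum = DAG.getNode(ISD::ADDC, dl, VTs, LHSLo, RHSLo);
    SDValue  HiSum = DAG.getNode(ISD::ADDE, dl, VTs, LHSHi, RHSHi,
                                 LoSum.getValue(1));   // consumes the carry

With CARRY modeled explicitly, the backend can see that SRAWI, SRAD, and calls clobber that bit and keep them from landing between the pair.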
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index 7486d74..ef25d92 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -18,6 +18,8 @@
#include "llvm/Function.h"
#include "llvm/System/Memory.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
static TargetJITInfo::JITCompilerFn JITCompilerFunction;
@@ -197,8 +199,7 @@ asm(
);
#else
void PPC32CompilationCallback() {
- assert(0 && "This is not a power pc, you can't execute this!");
- abort();
+ llvm_unreachable("This is not a power pc, you can't execute this!");
}
#endif
@@ -264,8 +265,7 @@ asm(
);
#else
void PPC64CompilationCallback() {
- assert(0 && "This is not a power pc, you can't execute this!");
- abort();
+ llvm_unreachable("This is not a power pc, you can't execute this!");
}
#endif
@@ -383,7 +383,7 @@ void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
switch ((PPC::RelocationType)MR->getRelocationType()) {
- default: assert(0 && "Unknown relocation type!");
+ default: llvm_unreachable("Unknown relocation type!");
case PPC::reloc_pcrel_bx:
// PC-relative relocation for b and bl instructions.
ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
new file mode 100644
index 0000000..c87879b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
@@ -0,0 +1,58 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the PPCMCAsmInfoDarwin and
+// PPCLinuxMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+using namespace llvm;
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
+ PCSymbol = ".";
+ CommentString = ";";
+ ExceptionsType = ExceptionHandling::Dwarf;
+
+ if (!is64Bit)
+ Data64bitsDirective = 0; // We can't emit a 64-bit unit in PPC32 mode.
+ AssemblerDialect = 1; // New-Style mnemonics.
+}
+
+PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+ CommentString = "#";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ UsedDirective = "\t# .no_dead_strip\t";
+ WeakRefDirective = "\t.weak\t";
+
+ // Uses '.section' before '.bss' directive
+ UsesELFSectionDirectiveForBSS = true;
+
+ // Debug Information
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Exceptions handling
+ if (!is64Bit)
+ ExceptionsType = ExceptionHandling::Dwarf;
+ AbsoluteEHSectionOffsets = false;
+
+ ZeroDirective = "\t.space\t";
+ SetDirective = "\t.set";
+ Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
+ AlignmentIsInBytes = false;
+ LCOMMDirective = "\t.lcomm\t";
+ AssemblerDialect = 0; // Old-Style mnemonics.
+}
+
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.h b/lib/Target/PowerPC/PPCMCAsmInfo.h
new file mode 100644
index 0000000..96ae6fb
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- PPCMCAsmInfo.h - PPC asm properties -----------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PPCMCAsmInfoDarwin and
+// PPCLinuxMCAsmInfo classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+ struct PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit PPCMCAsmInfoDarwin(bool is64Bit);
+ };
+
+ struct PPCLinuxMCAsmInfo : public MCAsmInfo {
+ explicit PPCLinuxMCAsmInfo(bool is64Bit);
+ };
+
+} // namespace llvm
+
+#endif
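A hypothetical helper (not part of the patch) showing how a target machine might choose between the two new flavours; the function name and parameters are invented for illustration:

    #include "PPCMCAsmInfo.h"

    static const llvm::MCAsmInfo *createPPCMCAsmInfo(bool isDarwin,
                                                     bool is64Bit) {
      if (isDarwin)
        return new llvm::PPCMCAsmInfoDarwin(is64Bit); // ';' comments, new-style mnemonics
      return new llvm::PPCLinuxMCAsmInfo(is64Bit);    // '#' comments, old-style mnemonics
    }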
diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.cpp b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
index 3bfa6d7..4c14454 100644
--- a/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
@@ -16,6 +16,7 @@
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/MachORelocation.h"
#include "llvm/Support/OutputBuffer.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cstdio>
using namespace llvm;
@@ -46,9 +47,9 @@ unsigned PPCMachOWriterInfo::GetTargetRelocation(MachineRelocation &MR,
Addr = (uintptr_t)MR.getResultPointer() + ToAddr;
switch ((PPC::RelocationType)MR.getRelocationType()) {
- default: assert(0 && "Unknown PPC relocation type!");
+ default: llvm_unreachable("Unknown PPC relocation type!");
case PPC::reloc_absolute_low_ix:
- assert(0 && "Unhandled PPC relocation type!");
+ llvm_unreachable("Unhandled PPC relocation type!");
break;
case PPC::reloc_vanilla:
{
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp
index 08a2812..12bb0a1 100644
--- a/lib/Target/PowerPC/PPCPredicates.cpp
+++ b/lib/Target/PowerPC/PPCPredicates.cpp
@@ -12,12 +12,13 @@
//===----------------------------------------------------------------------===//
#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cassert>
using namespace llvm;
PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
switch (Opcode) {
- default: assert(0 && "Unknown PPC branch opcode!");
+ default: llvm_unreachable("Unknown PPC branch opcode!");
case PPC::PRED_EQ: return PPC::PRED_NE;
case PPC::PRED_NE: return PPC::PRED_EQ;
case PPC::PRED_LT: return PPC::PRED_GE;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 97b1c57..cf5c7c0 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -37,7 +37,9 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include <cstdlib>
@@ -111,8 +113,7 @@ unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
case R30: case X30: case F30: case V30: case CR7EQ: return 30;
case R31: case X31: case F31: case V31: case CR7UN: return 31;
default:
- cerr << "Unhandled reg in PPCRegisterInfo::getRegisterNumbering!\n";
- abort();
+ llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
}
}
@@ -139,11 +140,11 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
-const TargetRegisterClass *PPCRegisterInfo::getPointerRegClass() const {
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(unsigned Kind) const {
if (Subtarget.isPPC64())
return &PPC::G8RCRegClass;
- else
- return &PPC::GPRCRegClass;
+ return &PPC::GPRCRegClass;
}
const unsigned*
@@ -173,7 +174,8 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
PPC::LR, 0
};
-
+
+ // 32-bit SVR4 calling convention.
static const unsigned SVR4_CalleeSavedRegs[] = {
PPC::R14, PPC::R15,
PPC::R16, PPC::R17, PPC::R18, PPC::R19,
@@ -199,7 +201,7 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
- PPC::LR, 0
+ 0
};
// 64-bit Darwin calling convention.
static const unsigned Darwin64_CalleeSavedRegs[] = {
@@ -226,12 +228,41 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
PPC::LR8, 0
};
+
+ // 64-bit SVR4 calling convention.
+ static const unsigned SVR4_64_CalleeSavedRegs[] = {
+ PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31,
+
+ PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+ PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+ PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+ PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+ PPC::F30, PPC::F31,
+
+ PPC::CR2, PPC::CR3, PPC::CR4,
+
+ PPC::VRSAVE,
+
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+
+ 0
+ };
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegs :
Darwin32_CalleeSavedRegs;
-
- return SVR4_CalleeSavedRegs;
+
+ return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegs : SVR4_CalleeSavedRegs;
}
const TargetRegisterClass* const*
@@ -266,6 +297,7 @@ PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&PPC::GPRCRegClass, 0
};
+ // 32-bit SVR4 calling convention.
static const TargetRegisterClass * const SVR4_CalleeSavedRegClasses[] = {
&PPC::GPRCRegClass,&PPC::GPRCRegClass,
&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
@@ -294,7 +326,7 @@ PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
&PPC::CRBITRCRegClass,
- &PPC::GPRCRegClass, 0
+ 0
};
// 64-bit Darwin calling convention.
@@ -326,12 +358,45 @@ PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&PPC::G8RCRegClass, 0
};
+
+ // 64-bit SVR4 calling convention.
+ static const TargetRegisterClass * const SVR4_64_CalleeSavedRegClasses[] = {
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+ &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+ &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+
+ &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+
+ &PPC::VRSAVERCRegClass,
+
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+ &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+ &PPC::CRBITRCRegClass,
+
+ 0
+ };
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegClasses :
Darwin32_CalleeSavedRegClasses;
- return SVR4_CalleeSavedRegClasses;
+ return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegClasses
+ : SVR4_CalleeSavedRegClasses;
}
// needsFP - Return true if the specified function should have a dedicated frame
@@ -363,9 +428,9 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(PPC::R13); // Small Data Area pointer register
}
- // On PPC64, r13 is the thread pointer. Never allocate this register. Note
- // that this is over conservative, as it also prevents allocation of R31 when
- // the FP is not needed.
+ // On PPC64, r13 is the thread pointer. Never allocate this register.
+ // Note that this is overconservative, as it also prevents allocation of R31
+ // when the FP is not needed.
if (Subtarget.isPPC64()) {
Reserved.set(PPC::R13);
Reserved.set(PPC::R31);
@@ -377,6 +442,11 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(PPC::X1);
Reserved.set(PPC::X13);
Reserved.set(PPC::X31);
+
+ // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+ if (Subtarget.isSVR4ABI()) {
+ Reserved.set(PPC::X2);
+ }
}
if (needsFP(MF))
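
As a minimal sketch of the new reservation (assuming a 64-bit SVR4 subtarget; TRI and MF are placeholder names, not part of the patch):

  BitVector Reserved = TRI->getReservedRegs(MF);
  // X2 now tests as reserved: it holds the TOC pointer under 64-bit SVR4.
  bool TOCReserved = Reserved.test(PPC::X2);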
@@ -457,7 +527,7 @@ static
unsigned findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS,
const TargetRegisterClass *RC, int SPAdj) {
assert(RS && "Register scavenging must be on");
- unsigned Reg = RS->FindUnusedReg(RC, true);
+ unsigned Reg = RS->FindUnusedReg(RC);
// FIXME: move ARM callee-saved reg scan to target independent code, then
// search for already spilled CS register here.
if (Reg == 0)
@@ -629,8 +699,10 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
MBB.erase(II);
}
-void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+unsigned
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
// Get the instruction.
@@ -669,14 +741,14 @@ void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (FPSI && FrameIndex == FPSI &&
(OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
lowerDynamicAlloc(II, SPAdj, RS);
- return;
+ return 0;
}
// Special case for pseudo-op SPILL_CR.
if (EnableRegisterScavenging) // FIXME (64-bit): Enable by default.
if (OpC == PPC::SPILL_CR) {
lowerCRSpilling(II, FrameIndex, SPAdj, RS);
- return;
+ return 0;
}
// Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
@@ -718,7 +790,7 @@ void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (isIXAddr)
Offset >>= 2; // The actual encoded value has the low two bits zero.
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
- return;
+ return 0;
}
// The offset doesn't fit into a single register, scavenge one to build the
@@ -758,6 +830,7 @@ void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned StackReg = MI.getOperand(FIOperandNo).getReg();
MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false);
+ return 0;
}
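
The PPC implementation returns 0 on every path shown above. A hedged sketch of the updated call shape (the meaning of a non-zero return and of the Value out-parameter lives in target-independent code not shown in this diff):

  unsigned R = TRI->eliminateFrameIndex(II, /*SPAdj=*/0, /*Value=*/NULL, RS);
  assert(R == 0 && "PPC rewrites the frame index in place");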
/// VRRegNo - Map from a numbered VR register to its enum value.
@@ -910,7 +983,7 @@ void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const {
// don't have a frame pointer, calls, or dynamic alloca then we do not need
// to adjust the stack pointer (we fit in the Red Zone).
bool DisableRedZone = MF.getFunction()->hasFnAttr(Attribute::NoRedZone);
- // FIXME SVR4 The SVR4 ABI has no red zone.
+ // FIXME SVR4 The 32-bit SVR4 ABI has no red zone.
if (!DisableRedZone &&
FrameSize <= 224 && // Fits in red zone.
!MFI->hasVarSizedObjects() && // No dynamic alloca.
@@ -1005,7 +1078,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (!Subtarget.isSVR4ABI()) {
return;
}
-
+
// Get callee saved register information.
MachineFrameInfo *FFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();
@@ -1016,16 +1089,19 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
unsigned MinGPR = PPC::R31;
+ unsigned MinG8R = PPC::X31;
unsigned MinFPR = PPC::F31;
unsigned MinVR = PPC::V31;
bool HasGPSaveArea = false;
+ bool HasG8SaveArea = false;
bool HasFPSaveArea = false;
bool HasCRSaveArea = false;
bool HasVRSAVESaveArea = false;
bool HasVRSaveArea = false;
SmallVector<CalleeSavedInfo, 18> GPRegs;
+ SmallVector<CalleeSavedInfo, 18> G8Regs;
SmallVector<CalleeSavedInfo, 18> FPRegs;
SmallVector<CalleeSavedInfo, 18> VRegs;
@@ -1041,6 +1117,14 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (Reg < MinGPR) {
MinGPR = Reg;
}
+ } else if (RC == PPC::G8RCRegisterClass) {
+ HasG8SaveArea = true;
+
+ G8Regs.push_back(CSI[i]);
+
+ if (Reg < MinG8R) {
+ MinG8R = Reg;
+ }
} else if (RC == PPC::F8RCRegisterClass) {
HasFPSaveArea = true;
@@ -1064,7 +1148,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
MinVR = Reg;
}
} else {
- assert(0 && "Unknown RegisterClass!");
+ llvm_unreachable("Unknown RegisterClass!");
}
}
@@ -1103,7 +1187,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
// General register save area starts right below the Floating-point
// register save area.
- if (HasGPSaveArea) {
+ if (HasGPSaveArea || HasG8SaveArea) {
// Move general register save area spill slots down, taking into account
// the size of the Floating-point register save area.
for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
@@ -1112,7 +1196,22 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
}
- LowerBound -= (31 - getRegisterNumbering(MinGPR) + 1) * 4;
+ // Move the 64-bit general register save area spill slots down as well,
+ // again taking into account the size of the Floating-point save area.
+ for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
+ int FI = G8Regs[i].getFrameIdx();
+
+ FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+ }
+
+ unsigned MinReg = std::min<unsigned>(getRegisterNumbering(MinGPR),
+ getRegisterNumbering(MinG8R));
+
+ if (Subtarget.isPPC64()) {
+ LowerBound -= (31 - MinReg + 1) * 8;
+ } else {
+ LowerBound -= (31 - MinReg + 1) * 4;
+ }
}
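
Worked example: if the lowest-numbered saved 32-bit GPR is r24 and the lowest saved 64-bit GPR is x26, then MinReg = min(24, 26) = 24, so a 64-bit target lowers the bound by (31 - 24 + 1) * 8 = 64 bytes, whereas a 32-bit target would subtract (31 - 24 + 1) * 4 = 32 bytes.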
// The CR save area is below the general register save area.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index ddaefdd..1689bc2 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -37,7 +37,7 @@ public:
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
- virtual const TargetRegisterClass *getPointerRegClass() const;
+ virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const;
/// Code Generation virtual methods...
const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
@@ -66,8 +66,9 @@ public:
int SPAdj, RegScavenger *RS) const;
void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex,
int SPAdj, RegScavenger *RS) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
/// determineFrameLayout - Determine the size of the frame and maximum call
/// frame size.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index bac8e3a..049e893 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -255,6 +255,11 @@ def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>;
// VRsave register
def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[107]>;
+// Carry bit. In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1); this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
+
// FP rounding mode: bits 30 and 31 of the FP status and control register
// This is not allocated as a normal register; it appears only in
// Uses and Defs. The ABI says it needs to be preserved by a function,
@@ -280,7 +285,8 @@ def GPRC : RegisterClass<"PPC", [i32], 32,
let MethodBodies = [{
GPRCClass::iterator
GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
- // In Linux, r2 is reserved for the OS.
+ // 32-bit SVR4 ABI: r2 is reserved for the OS.
+ // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
return begin()+1;
@@ -291,7 +297,7 @@ def GPRC : RegisterClass<"PPC", [i32], 32,
// On PPC64, r13 is the thread pointer. Never allocate this register.
// Note that this is overconservative, as it also prevents allocation of
// R31 when the FP is not needed.
- // When using the SVR4 ABI, r13 is reserved for the Small Data Area
+ // When using the 32-bit SVR4 ABI, r13 is reserved for the Small Data Area
// pointer.
const PPCSubtarget &Subtarget
= MF.getTarget().getSubtarget<PPCSubtarget>();
@@ -318,6 +324,10 @@ def G8RC : RegisterClass<"PPC", [i64], 64,
let MethodBodies = [{
G8RCClass::iterator
G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
+ // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
+ if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
+ return begin()+1;
+
return begin();
}
G8RCClass::iterator
@@ -372,4 +382,6 @@ def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2,
def CTRRC : RegisterClass<"PPC", [i32], 32, [CTR]>;
def CTRRC8 : RegisterClass<"PPC", [i64], 64, [CTR8]>;
def VRSAVERC : RegisterClass<"PPC", [i32], 32, [VRSAVE]>;
-
+def CARRYRC : RegisterClass<"PPC", [i32], 32, [CARRY]> {
+ let CopyCost = -1;
+}
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 425d8e6..f75e781 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -13,7 +13,7 @@
#include "PPCSubtarget.h"
#include "PPC.h"
-#include "llvm/Module.h"
+#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
#include "PPCGenSubtarget.inc"
#include <cstdlib>
@@ -57,10 +57,9 @@ static const char *GetCurrentPowerPCCPU() {
#endif
-PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M,
- const std::string &FS, bool is64Bit)
- : TM(tm)
- , StackAlignment(16)
+PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
+ bool is64Bit)
+ : StackAlignment(16)
, DarwinDirective(PPC::DIR_NONE)
, IsGigaProcessor(false)
, Has64BitSupport(false)
@@ -95,7 +94,6 @@ PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M,
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
- const std::string &TT = M.getTargetTriple();
if (TT.length() > 7) {
// Determine which version of darwin this is.
size_t DarwinPos = TT.find("-darwin");
@@ -105,24 +103,11 @@ PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M,
else
DarwinVers = 8; // Minimum supported darwin is Tiger.
}
- } else if (TT.empty()) {
- // Try to autosense the subtarget from the host compiler.
-#if defined(__APPLE__)
-#if __APPLE_CC__ > 5400
- DarwinVers = 9; // GCC 5400+ is Leopard.
-#else
- DarwinVers = 8; // Minimum supported darwin is Tiger.
-#endif
-#endif
}
// Set up darwin-specific properties.
- if (isDarwin()) {
+ if (isDarwin())
HasLazyResolverStubs = true;
- AsmFlavor = NewMnemonic;
- } else {
- AsmFlavor = OldMnemonic;
- }
}
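
The retained triple parsing reduces to roughly the following (a sketch assuming the version digit directly follows "-darwin"; the triple string is hypothetical):

  std::string TT = "powerpc-apple-darwin9";
  size_t Pos = TT.find("-darwin");
  unsigned DarwinVers = 8;                    // default: Tiger
  if (Pos != std::string::npos && TT.size() > Pos + 7 && TT[Pos + 7] == '9')
    DarwinVers = 9;                           // Leopard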
/// SetJITMode - This is called to inform the subtarget info that we are
@@ -138,7 +123,8 @@ void PPCSubtarget::SetJITMode() {
/// hasLazyResolverStub - Return true if accesses to the specified global have
/// to go through a dyld lazy resolution stub. This means that an extra load
/// is required to get the address of the global.
-bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV,
+ const TargetMachine &TM) const {
// We never have stubs if HasLazyResolverStubs=false or if in static mode.
if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
return false;
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index f633cc6..02c8ad7 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -39,18 +39,11 @@ namespace PPC {
};
}
-class Module;
class GlobalValue;
class TargetMachine;
class PPCSubtarget : public TargetSubtarget {
-public:
- enum AsmWriterFlavorTy {
- OldMnemonic, NewMnemonic, Unset
- };
protected:
- const TargetMachine &TM;
-
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned StackAlignment;
@@ -61,9 +54,6 @@ protected:
/// Which cpu directive was used.
unsigned DarwinDirective;
- /// AsmFlavor - Which PPC asm dialect to use.
- AsmWriterFlavorTy AsmFlavor;
-
/// Used by the ISel to turn in optimizations for POWER4-derived architectures
bool IsGigaProcessor;
bool Has64BitSupport;
@@ -79,10 +69,9 @@ protected:
unsigned char DarwinVers; // Is any darwin-ppc platform.
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- PPCSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS, bool is64Bit);
+ PPCSubtarget(const std::string &TT, const std::string &FS, bool is64Bit);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -132,7 +121,8 @@ public:
/// hasLazyResolverStub - Return true if accesses to the specified global have
/// to go through a dyld lazy resolution stub. This means that an extra load
/// is required to get the address of the global.
- bool hasLazyResolverStub(const GlobalValue *GV) const;
+ bool hasLazyResolverStub(const GlobalValue *GV,
+ const TargetMachine &TM) const;
// Specific obvious features.
bool hasFSQRT() const { return HasFSQRT; }
@@ -148,12 +138,9 @@ public:
/// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
unsigned getDarwinVers() const { return DarwinVers; }
- bool isDarwinABI() const { return isDarwin() || IsPPC64; }
- bool isSVR4ABI() const { return !isDarwin() && !IsPPC64; }
+ bool isDarwinABI() const { return isDarwin(); }
+ bool isSVR4ABI() const { return !isDarwin(); }
- unsigned getAsmFlavor() const {
- return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
- }
};
} // End llvm namespace
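
With the AsmFlavor machinery gone, the subtarget is queried purely through the two ABI predicates. A minimal usage sketch (TM is an assumed in-scope TargetMachine):

  const PPCSubtarget &ST = TM.getSubtarget<PPCSubtarget>();
  if (ST.isSVR4ABI() && ST.isPPC64()) {
    // 64-bit ELF: X2 is the TOC pointer; CR2-CR4 and V20-V31 are callee-saved.
  }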
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index e9073d6..3371954 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -12,96 +12,38 @@
//===----------------------------------------------------------------------===//
#include "PPC.h"
-#include "PPCTargetAsmInfo.h"
+#include "PPCMCAsmInfo.h"
#include "PPCTargetMachine.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/FormattedStream.h"
using namespace llvm;
-/// PowerPCTargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int PowerPCTargetMachineModule;
-int PowerPCTargetMachineModule = 0;
-
-// Register the targets
-static RegisterTarget<PPC32TargetMachine>
-X("ppc32", "PowerPC 32");
-static RegisterTarget<PPC64TargetMachine>
-Y("ppc64", "PowerPC 64");
-
-// Force static initialization.
-extern "C" void LLVMInitializePowerPCTarget() { }
-
-// No assembler printer by default
-PPCTargetMachine::AsmPrinterCtorFn PPCTargetMachine::AsmPrinterCtor = 0;
-
-const TargetAsmInfo *PPCTargetMachine::createTargetAsmInfo() const {
- if (Subtarget.isDarwin())
- return new PPCDarwinTargetAsmInfo(*this);
- else
- return new PPCLinuxTargetAsmInfo(*this);
-}
-
-unsigned PPC32TargetMachine::getJITMatchQuality() {
-#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
- if (sizeof(void*) == 4)
- return 10;
-#endif
- return 0;
-}
-unsigned PPC64TargetMachine::getJITMatchQuality() {
-#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
- if (sizeof(void*) == 8)
- return 10;
-#endif
- return 0;
-}
-
-unsigned PPC32TargetMachine::getModuleMatchQuality(const Module &M) {
- // We strongly match "powerpc-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "powerpc-")
- return 20;
-
- // If the target triple is something non-powerpc, we don't match.
- if (!TT.empty()) return 0;
+static const MCAsmInfo *createMCAsmInfo(const Target &T,
+ const StringRef &TT) {
+ Triple TheTriple(TT);
+ bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+ if (TheTriple.getOS() == Triple::Darwin)
+ return new PPCMCAsmInfoDarwin(isPPC64);
+ return new PPCLinuxMCAsmInfo(isPPC64);
- if (M.getEndianness() == Module::BigEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
- return getJITMatchQuality()/2;
}
-unsigned PPC64TargetMachine::getModuleMatchQuality(const Module &M) {
- // We strongly match "powerpc64-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 10 && std::string(TT.begin(), TT.begin()+10) == "powerpc64-")
- return 20;
-
- if (M.getEndianness() == Module::BigEndian &&
- M.getPointerSize() == Module::Pointer64)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
+extern "C" void LLVMInitializePowerPCTarget() {
+ // Register the targets
+ RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
+ RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
- return getJITMatchQuality()/2;
+ RegisterAsmInfoFn C(ThePPC32Target, createMCAsmInfo);
+ RegisterAsmInfoFn D(ThePPC64Target, createMCAsmInfo);
}
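
Once registered, clients reach the target through the registry rather than through module match quality. A hedged sketch of the same-era lookup API (signatures from memory, so treat as approximate):

  std::string Err;
  const Target *T =
      TargetRegistry::lookupTarget("powerpc64-unknown-linux-gnu", Err);
  if (T) {
    TargetMachine *TM =
        T->createTargetMachine("powerpc64-unknown-linux-gnu", /*FS=*/"");
  }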
-PPCTargetMachine::PPCTargetMachine(const Module &M, const std::string &FS,
- bool is64Bit)
- : Subtarget(*this, M, FS, is64Bit),
+PPCTargetMachine::PPCTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, is64Bit),
DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
FrameInfo(*this, is64Bit), JITInfo(*this, is64Bit), TLInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()), MachOWriterInfo(*this) {
@@ -118,13 +60,15 @@ PPCTargetMachine::PPCTargetMachine(const Module &M, const std::string &FS,
/// groups, which typically degrades performance.
bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
-PPC32TargetMachine::PPC32TargetMachine(const Module &M, const std::string &FS)
- : PPCTargetMachine(M, FS, false) {
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : PPCTargetMachine(T, TT, FS, false) {
}
-PPC64TargetMachine::PPC64TargetMachine(const Module &M, const std::string &FS)
- : PPCTargetMachine(M, FS, true) {
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : PPCTargetMachine(T, TT, FS, true) {
}
@@ -146,20 +90,36 @@ bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM,
return false;
}
-bool PPCTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ MachineCodeEmitter &MCE) {
+ // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
+ // FIXME: This should be moved to TargetJITInfo!!
+ if (Subtarget.isPPC64()) {
+ // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
+ // instructions to materialize arbitrary global variable + function +
+ // constant pool addresses.
+ setRelocationModel(Reloc::PIC_);
+ // Temporary workaround for the inability of PPC64 JIT to handle jump
+ // tables.
+ DisableJumpTables = true;
+ } else {
+ setRelocationModel(Reloc::Static);
+ }
+
+ // Inform the subtarget that we are in JIT mode. FIXME: does this break macho
+ // writing?
+ Subtarget.SetJITMode();
+
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCCodeEmitterPass(*this, MCE));
return false;
}
bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE) {
+ JITCodeEmitter &JCE) {
// The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
// FIXME: This should be moved to TargetJITInfo!!
if (Subtarget.isPPC64()) {
@@ -179,19 +139,14 @@ bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
Subtarget.SetJITMode();
// Machine code emitter pass for PowerPC.
- PM.add(createPPCCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ PM.add(createPPCJITCodeEmitterPass(*this, JCE));
return false;
}
bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE) {
+ ObjectCodeEmitter &OCE) {
// The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
// FIXME: This should be moved to TargetJITInfo!!
if (Subtarget.isPPC64()) {
@@ -211,43 +166,33 @@ bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
Subtarget.SetJITMode();
// Machine code emitter pass for PowerPC.
- PM.add(createPPCJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ PM.add(createPPCObjectCodeEmitterPass(*this, OCE));
return false;
}
bool PPCTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// Machine code emitter pass for PowerPC.
PM.add(createPPCCodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool PPCTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// Machine code emitter pass for PowerPC.
PM.add(createPPCJITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
+ return false;
+}
+bool PPCTargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ // Machine code emitter pass for PowerPC.
+ PM.add(createPPCObjectCodeEmitterPass(*this, OCE));
return false;
}
+
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index c693bf4..3399ac8 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -39,18 +39,9 @@ class PPCTargetMachine : public LLVMTargetMachine {
InstrItineraryData InstrItins;
PPCMachOWriterInfo MachOWriterInfo;
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- PPCTargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- PPCTargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+ PPCTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit);
virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const PPCFrameInfo *getFrameInfo() const { return &FrameInfo; }
@@ -71,26 +62,24 @@ public:
return &MachOWriterInfo;
}
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
-
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE);
+ ObjectCodeEmitter &OCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE);
+ JITCodeEmitter &JCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE);
virtual bool getEnableTailMergeDefault() const;
};
@@ -98,20 +87,16 @@ public:
///
class PPC32TargetMachine : public PPCTargetMachine {
public:
- PPC32TargetMachine(const Module &M, const std::string &FS);
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
+ PPC32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
};
/// PPC64TargetMachine - PowerPC 64-bit target machine.
///
class PPC64TargetMachine : public PPCTargetMachine {
public:
- PPC64TargetMachine(const Module &M, const std::string &FS);
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
+ PPC64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index 6e9e6c7..f5e50fc 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -149,7 +149,7 @@ http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
Implement Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implementing divide as multiply by reciprocal when it has
-more than one use. Itanium will want this too.
+more than one use. Itanium would want this too.
===-------------------------------------------------------------------------===
diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..058d599
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPowerPCInfo
+ PowerPCTargetInfo.cpp
+ )
+
+add_dependencies(LLVMPowerPCInfo PowerPCCodeGenTable_gen)
diff --git a/lib/Target/PowerPC/TargetInfo/Makefile b/lib/Target/PowerPC/TargetInfo/Makefile
new file mode 100644
index 0000000..a101aa4
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PowerPC/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 0000000..ad607d0
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePPC32Target, llvm::ThePPC64Target;
+
+extern "C" void LLVMInitializePowerPCTargetInfo() {
+ RegisterTarget<Triple::ppc, /*HasJIT=*/true>
+ X(ThePPC32Target, "ppc32", "PowerPC 32");
+
+ RegisterTarget<Triple::ppc64, /*HasJIT=*/true>
+ Y(ThePPC64Target, "ppc64", "PowerPC 64");
+}
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index f68cf0e..89ea9d0 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -197,13 +197,6 @@ _bar: addic r3,r3,-1
//===---------------------------------------------------------------------===//
-Legalize should lower ctlz like this:
- ctlz(x) = popcnt((x-1) & ~x)
-
-on targets that have popcnt but not ctlz. itanium, what else?
-
-//===---------------------------------------------------------------------===//
-
quantum_sigma_x in 462.libquantum contains the following loop:
for(i=0; i<reg->size; i++)
@@ -227,7 +220,20 @@ so cool to turn it into something like:
... which would only do one 32-bit XOR per loop iteration instead of two.
It would also be nice to recognize that reg->size doesn't alias reg->node[i], but
-alas...
+alas.
+
+//===---------------------------------------------------------------------===//
+
+This should be optimized to one 'and' and one 'or', from PR4216:
+
+define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
+entry:
+ %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; <i32> [#uses=1]
+ %0 = and i32 %bf.prev.low, -65536 ; <i32> [#uses=1]
+ %1 = and i32 %bf.prev.lo.cleared10, 40186 ; <i32> [#uses=1]
+ %2 = or i32 %1, %0 ; <i32> [#uses=1]
+ ret i32 %2
+}
//===---------------------------------------------------------------------===//
@@ -335,11 +341,6 @@ when it is declared U32.
//===---------------------------------------------------------------------===//
-Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit
-regs and bswap, like itanium.
-
-//===---------------------------------------------------------------------===//
-
LSR should know what GPR types a target has. This code:
volatile short X, Y; // globals
@@ -349,24 +350,22 @@ void foo(int N) {
for (i = 0; i < N; i++) { X = i; Y = i*4; }
}
-produces two identical IV's (after promotion) on PPC/ARM:
+produces two near-identical IVs (after promotion) on PPC/ARM:
-LBB1_1: @bb.preheader
- mov r3, #0
- mov r2, r3
- mov r1, r3
-LBB1_2: @bb
- ldr r12, LCPI1_0
- ldr r12, [r12]
- strh r2, [r12]
- ldr r12, LCPI1_1
- ldr r12, [r12]
- strh r3, [r12]
- add r1, r1, #1 <- [0,+,1]
- add r3, r3, #4
- add r2, r2, #1 <- [0,+,1]
- cmp r1, r0
- bne LBB1_2 @bb
+LBB1_2:
+ ldr r3, LCPI1_0
+ ldr r3, [r3]
+ strh r2, [r3]
+ ldr r3, LCPI1_1
+ ldr r3, [r3]
+ strh r1, [r3]
+ add r1, r1, #4
+ add r2, r2, #1 <- [0,+,1]
+ sub r0, r0, #1 <- [0,-,1]
+ cmp r0, #0
+ bne LBB1_2
+
+LSR should reuse the "+" IV for the exit test.
//===---------------------------------------------------------------------===//
@@ -600,25 +599,6 @@ implementations of ceil/floor/rint.
//===---------------------------------------------------------------------===//
-This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
-contains a testcase that compiles down to:
-
- %struct.XMM128 = type { <4 x float> }
-..
- %src = alloca %struct.XMM128
-..
- %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
- %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
- store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
- %tmp66 = load <4 x float>* %tmp65, align 16
- %tmp71 = add <4 x float> %tmp66, %tmp66
-
-If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
-into a bitcast of the vector value and a store to the pointer, then the
-store->load could be easily removed.
-
-//===---------------------------------------------------------------------===//
-
Consider:
int test() {
@@ -1123,16 +1103,6 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
//===---------------------------------------------------------------------===//
-We would like to do the following transform in the instcombiner:
-
- -X/C -> X/-C
-
-However, this isn't valid if (-X) overflows. We can implement this when we
-have the concept of a "C signed subtraction" operator that which is undefined
-on overflow.
-
-//===---------------------------------------------------------------------===//
-
This was noticed in the entryblock for grokdeclarator in 403.gcc:
%tmp = icmp eq i32 %decl_context, 4
@@ -1311,6 +1281,8 @@ http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35287 [LPRE crit edge splitting]
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
+
//===---------------------------------------------------------------------===//
Type based alias analysis:
@@ -1318,31 +1290,25 @@ http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
//===---------------------------------------------------------------------===//
-When GVN/PRE finds a store of float* to a must aliases pointer when expecting
-an int*, it should turn it into a bitcast. This is a nice generalization of
-the SROA hack that would apply to other cases, e.g.:
-
-int foo(int C, int *P, float X) {
- if (C) {
- bar();
- *P = 42;
- } else
- *(float*)P = X;
-
- return *P;
-}
-
-
-One example (that requires crazy phi translation) is:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
-
-//===---------------------------------------------------------------------===//
-
A/B get pinned to the stack because we turn an if/then into a select instead
of PRE'ing the load/store. This may be fixable in instcombine:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
+struct X { int i; };
+int foo (int x) {
+ struct X a;
+ struct X b;
+ struct X *p;
+ a.i = 1;
+ b.i = 2;
+ if (x)
+ p = &a;
+ else
+ p = &b;
+ return p->i;
+}
+//===---------------------------------------------------------------------===//
Interesting missed case because of control flow flattening (should be 2 loads):
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
@@ -1675,5 +1641,6 @@ entry:
Instcombine should be able to optimize away the loads (and thus the globals).
+See also PR4973
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/AsmPrinter/Makefile b/lib/Target/Sparc/AsmPrinter/Makefile
index f12a6ac..a856828 100644
--- a/lib/Target/Sparc/AsmPrinter/Makefile
+++ b/lib/Target/Sparc/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===##
+##===- lib/Target/Sparc/AsmPrinter/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
index 71bd0de..a3e5fba 100644
--- a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
@@ -19,18 +19,22 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
-#include "llvm/MDNode.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DwarfWriter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Support/Mangler.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
#include "llvm/Support/MathExtras.h"
#include <cctype>
#include <cstring>
@@ -49,45 +53,36 @@ namespace {
ValueMapTy NumberForBB;
unsigned BBNumber;
public:
- explicit SparcAsmPrinter(raw_ostream &O, TargetMachine &TM,
- const TargetAsmInfo *T, bool V)
+ explicit SparcAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
: AsmPrinter(O, TM, T, V), BBNumber(0) {}
virtual const char *getPassName() const {
return "Sparc Assembly Printer";
}
- void printModuleLevelGV(const GlobalVariable* GVar);
+ void PrintGlobalVariable(const GlobalVariable *GVar);
void printOperand(const MachineInstr *MI, int opNum);
void printMemOperand(const MachineInstr *MI, int opNum,
const char *Modifier = 0);
void printCCOperand(const MachineInstr *MI, int opNum);
- bool printInstruction(const MachineInstr *MI); // autogenerated.
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
bool runOnMachineFunction(MachineFunction &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode);
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+ bool printGetPCX(const MachineInstr *MI, unsigned OpNo);
};
} // end of anonymous namespace
#include "SparcGenAsmWriter.inc"
-/// createSparcCodePrinterPass - Returns a pass that prints the SPARC
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description. This should work
-/// regardless of whether the function is in SSA form.
-///
-FunctionPass *llvm::createSparcCodePrinterPass(raw_ostream &o,
- TargetMachine &tm,
- bool verbose) {
- return new SparcAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
-}
-
-
/// runOnMachineFunction - This uses the printInstruction()
/// method to print assembly for each instruction.
///
@@ -103,17 +98,11 @@ bool SparcAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// BBs the same name. (If you have a better way, please let me know!)
O << "\n\n";
-
- // Print out the label for the function.
- const Function *F = MF.getFunction();
- SwitchToSection(TAI->SectionForGlobal(F));
- EmitAlignment(MF.getAlignment(), F);
- O << "\t.globl\t" << CurrentFnName << '\n';
-
- printVisibility(CurrentFnName, F->getVisibility());
-
- O << "\t.type\t" << CurrentFnName << ", #function\n";
- O << CurrentFnName << ":\n";
+ emitFunctionHeader(MF);
+
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
// Number each basic block so that we can consistently refer to them
// in PC-relative references.
@@ -129,24 +118,65 @@ bool SparcAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I) {
// Print a label for the basic block.
if (I != MF.begin()) {
- printBasicBlockLabel(I, true, true);
- O << '\n';
+ EmitBasicBlockStart(I);
}
for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
II != E; ++II) {
// Print the assembly for the instruction.
+ processDebugLoc(II, true);
printInstruction(II);
+
+ if (VerboseAsm && !II->getDebugLoc().isUnknown())
+ EmitComments(*II);
+ O << '\n';
+ processDebugLoc(II, false);
++EmittedInsts;
}
}
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
// We didn't modify anything.
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
return false;
}
+void SparcAsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+ EmitAlignment(MF.getAlignment(), F);
+
+ switch (F->getLinkage()) {
+ default: llvm_unreachable("Unknown linkage type");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage:
+ // Function is internal.
+ break;
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ // Function is externally visible
+ O << "\t.global\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkerPrivateLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ // Function is weak
+ O << "\t.weak\t" << CurrentFnName << '\n' ;
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ", #function\n";
+ O << CurrentFnName << ":\n";
+}
+
+
void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
const MachineOperand &MO = MI->getOperand (opNum);
- const TargetRegisterInfo &RI = *TM.getRegisterInfo();
bool CloseParen = false;
if (MI->getOpcode() == SP::SETHIi && !MO.isReg() && !MO.isImm()) {
O << "%hi(";
@@ -158,33 +188,27 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
}
switch (MO.getType()) {
case MachineOperand::MO_Register:
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- O << "%" << LowercaseString (RI.get(MO.getReg()).AsmName);
- else
- O << "%reg" << MO.getReg();
+ O << "%" << LowercaseString(getRegisterName(MO.getReg()));
break;
case MachineOperand::MO_Immediate:
O << (int)MO.getImm();
break;
case MachineOperand::MO_MachineBasicBlock:
- printBasicBlockLabel(MO.getMBB());
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress:
- {
- const GlobalValue *GV = MO.getGlobal();
- O << Mang->getValueName(GV);
- }
+ O << Mang->getMangledName(MO.getGlobal());
break;
case MachineOperand::MO_ExternalSymbol:
O << MO.getSymbolName();
break;
case MachineOperand::MO_ConstantPoolIndex:
- O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getIndex();
break;
default:
- O << "<unknown operand type>"; abort (); break;
+ llvm_unreachable("<unknown operand type>");
}
if (CloseParen) O << ")";
}
@@ -218,28 +242,42 @@ void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
}
}
-void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
- int CC = (int)MI->getOperand(opNum).getImm();
- O << SPARCCondCodeToString((SPCC::CondCodes)CC);
-}
+bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum) {
+ std::string operand = "";
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ default: assert(0 && "Operand is not a register ");
+ case MachineOperand::MO_Register:
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Operand is not a physical register ");
+ operand = "%" + LowercaseString(getRegisterName(MO.getReg()));
+ break;
+ }
-bool SparcAsmPrinter::doInitialization(Module &M) {
- Mang = new Mangler(M, "", TAI->getPrivateGlobalPrefix());
- return false; // success
-}
+ unsigned bbNum = NumberForBB[MI->getParent()->getBasicBlock()];
-bool SparcAsmPrinter::doFinalization(Module &M) {
- // Print out module-level global variables here.
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I);
+ O << '\n' << ".LLGETPCH" << bbNum << ":\n";
+ O << "\tcall\t.LLGETPC" << bbNum << '\n' ;
- O << '\n';
+ O << "\t sethi\t"
+ << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), "
+ << operand << '\n' ;
+
+ O << ".LLGETPC" << bbNum << ":\n" ;
+ O << "\tor\t" << operand
+ << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), "
+ << operand << '\n';
+ O << "\tadd\t" << operand << ", %o7, " << operand << '\n';
+
+ return true;
+}
- return AsmPrinter::doFinalization(M);
+void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+ int CC = (int)MI->getOperand(opNum).getImm();
+ O << SPARCCondCodeToString((SPCC::CondCodes)CC);
}
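
For a block numbered 0 with the scratch register %l7, printGetPCX above emits roughly the following sequence (the sethi sits in the call's delay slot, and %o7 holds the call's own address afterwards):

  .LLGETPCH0:
          call    .LLGETPC0
           sethi  %hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH0)), %l7
  .LLGETPC0:
          or      %l7, %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH0)), %l7
          add     %l7, %o7, %l7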
-void SparcAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+void SparcAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
const TargetData *TD = TM.getTargetData();
if (!GVar->hasInitializer())
@@ -250,16 +288,15 @@ void SparcAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
return;
O << "\n\n";
- std::string name = Mang->getValueName(GVar);
+ std::string name = Mang->getMangledName(GVar);
Constant *C = GVar->getInitializer();
- if (isa<MDNode>(C) || isa<MDString>(C))
- return;
unsigned Size = TD->getTypeAllocSize(C->getType());
unsigned Align = TD->getPreferredAlignment(GVar);
printVisibility(name, GVar->getVisibility());
- SwitchToSection(TAI->SectionForGlobal(GVar));
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
if (C->isNullValue() && !GVar->hasSection()) {
if (!GVar->isThreadLocal() &&
@@ -269,8 +306,8 @@ void SparcAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
if (GVar->hasLocalLinkage())
O << "\t.local " << name << '\n';
- O << TAI->getCOMMDirective() << name << ',' << Size;
- if (TAI->getCOMMDirectiveTakesAlignment())
+ O << MAI->getCOMMDirective() << name << ',' << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
O << ',' << (1 << Align);
O << '\n';
@@ -292,27 +329,25 @@ void SparcAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
// their name or something. For now, just emit them as external.
case GlobalValue::ExternalLinkage:
// If external or appending, declare as a global symbol
- O << TAI->getGlobalDirective() << name << '\n';
+ O << MAI->getGlobalDirective() << name << '\n';
// FALL THROUGH
case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
case GlobalValue::InternalLinkage:
break;
case GlobalValue::GhostLinkage:
- cerr << "Should not have any unmaterialized functions!\n";
- abort();
+ llvm_unreachable("Should not have any unmaterialized functions!");
case GlobalValue::DLLImportLinkage:
- cerr << "DLLImport linkage is not supported by this target!\n";
- abort();
+ llvm_unreachable("DLLImport linkage is not supported by this target!");
case GlobalValue::DLLExportLinkage:
- cerr << "DLLExport linkage is not supported by this target!\n";
- abort();
+ llvm_unreachable("DLLExport linkage is not supported by this target!");
default:
- assert(0 && "Unknown linkage type!");
+ llvm_unreachable("Unknown linkage type!");
}
EmitAlignment(Align, GVar);
- if (TAI->hasDotTypeDotSizeDirective()) {
+ if (MAI->hasDotTypeDotSizeDirective()) {
O << "\t.type " << name << ",#object\n";
O << "\t.size " << name << ',' << Size << '\n';
}
@@ -355,13 +390,7 @@ bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-namespace {
- static struct Register {
- Register() {
- SparcTargetMachine::registerAsmPrinter(createSparcCodePrinterPass);
- }
- } Registrator;
-}
-
// Force static initialization.
-extern "C" void LLVMInitializeSparcAsmPrinter() { }
+extern "C" void LLVMInitializeSparcAsmPrinter() {
+ RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index eb045e2..74f320a 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -16,9 +16,9 @@ add_llvm_target(SparcCodeGen
SparcInstrInfo.cpp
SparcISelDAGToDAG.cpp
SparcISelLowering.cpp
+ SparcMCAsmInfo.cpp
SparcRegisterInfo.cpp
SparcSubtarget.cpp
- SparcTargetAsmInfo.cpp
SparcTargetMachine.cpp
)
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
index f72a4c4..88b0927 100644
--- a/lib/Target/Sparc/FPMover.cpp
+++ b/lib/Target/Sparc/FPMover.cpp
@@ -20,6 +20,8 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
STATISTIC(NumFpDs , "Number of instructions translated");
@@ -75,7 +77,7 @@ static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
OddReg = OddHalvesOfPairs[i];
return;
}
- assert(0 && "Can't find reg");
+ llvm_unreachable("Can't find reg");
}
/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
@@ -108,16 +110,16 @@ bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
else if (MI->getOpcode() == SP::FpABSD)
MI->setDesc(TII->get(SP::FABSS));
else
- assert(0 && "Unknown opcode!");
+ llvm_unreachable("Unknown opcode!");
MI->getOperand(0).setReg(EvenDestReg);
MI->getOperand(1).setReg(EvenSrcReg);
- DOUT << "FPMover: the modified instr is: " << *MI;
+ DEBUG(errs() << "FPMover: the modified instr is: " << *MI);
// Insert copy for the other half of the double.
if (DestDReg != SrcDReg) {
MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
.addReg(OddSrcReg);
- DOUT << "FPMover: the inserted instr is: " << *MI;
+ DEBUG(errs() << "FPMover: the inserted instr is: " << *MI);
}
++NumFpDs;
}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index fdf6afa..6714b4d 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile
@@ -16,7 +16,7 @@ BUILT_SOURCES = SparcGenRegisterInfo.h.inc SparcGenRegisterNames.inc \
SparcGenInstrInfo.inc SparcGenAsmWriter.inc \
SparcGenDAGISel.inc SparcGenSubtarget.inc SparcGenCallingConv.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index c7d0ca8..bb5155e1 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -15,19 +15,21 @@
#ifndef TARGET_SPARC_H
#define TARGET_SPARC_H
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
namespace llvm {
class FunctionPass;
class SparcTargetMachine;
- class raw_ostream;
+ class formatted_raw_ostream;
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
- FunctionPass *createSparcCodePrinterPass(raw_ostream &OS, TargetMachine &TM,
- bool Verbose);
FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+
+ extern Target TheSparcTarget;
+
} // end namespace llvm;
// Defines symbolic names for Sparc registers. This defines a mapping from
@@ -83,7 +85,7 @@ namespace llvm {
inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
switch (CC) {
- default: assert(0 && "Unknown condition code");
+ default: llvm_unreachable("Unknown condition code");
case SPCC::ICC_NE: return "ne";
case SPCC::ICC_E: return "e";
case SPCC::ICC_G: return "g";
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index c9bd62d..a1a4a8e 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -17,6 +17,8 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -32,10 +34,13 @@ class SparcDAGToDAGISel : public SelectionDAGISel {
/// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
/// make the right decision when generating code for different targets.
const SparcSubtarget &Subtarget;
+ SparcTargetMachine& TM;
+ MachineBasicBlock *CurBB;
public:
- explicit SparcDAGToDAGISel(SparcTargetMachine &TM)
- : SelectionDAGISel(TM),
- Subtarget(TM.getSubtarget<SparcSubtarget>()) {
+ explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
+ : SelectionDAGISel(tm),
+ Subtarget(tm.getSubtarget<SparcSubtarget>()),
+ TM(tm) {
}
SDNode *Select(SDValue Op);
@@ -61,6 +66,9 @@ public:
// Include the pieces autogenerated from the target description.
#include "SparcGenDAGISel.inc"
+
+private:
+ SDNode* getGlobalBaseReg();
};
} // end anonymous namespace
@@ -68,12 +76,18 @@ public:
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
void SparcDAGToDAGISel::InstructionSelect() {
DEBUG(BB->dump());
-
+ CurBB = BB;
// Select target instructions for the DAG.
SelectRoot(*CurDAG);
CurDAG->RemoveDeadNodes();
}
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+ MachineFunction *MF = CurBB->getParent();
+ unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
bool SparcDAGToDAGISel::SelectADDRri(SDValue Op, SDValue Addr,
SDValue &Base, SDValue &Offset) {
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
@@ -147,6 +161,9 @@ SDNode *SparcDAGToDAGISel::Select(SDValue Op) {
switch (N->getOpcode()) {
default: break;
+ case SPISD::GLOBAL_BASE_REG:
+ return getGlobalBaseReg();
+
case ISD::SDIV:
case ISD::UDIV: {
// FIXME: should use a custom expander to expose the SRA to the dag.
@@ -156,12 +173,12 @@ SDNode *SparcDAGToDAGISel::Select(SDValue Op) {
// Set the Y register to the high-part.
SDValue TopPart;
if (N->getOpcode() == ISD::SDIV) {
- TopPart = SDValue(CurDAG->getTargetNode(SP::SRAri, dl, MVT::i32, DivLHS,
+ TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
CurDAG->getTargetConstant(31, MVT::i32)), 0);
} else {
TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
}
- TopPart = SDValue(CurDAG->getTargetNode(SP::WRYrr, dl, MVT::Flag, TopPart,
+ TopPart = SDValue(CurDAG->getMachineNode(SP::WRYrr, dl, MVT::Flag, TopPart,
CurDAG->getRegister(SP::G0, MVT::i32)), 0);
// FIXME: Handle div by immediate.
@@ -175,8 +192,8 @@ SDNode *SparcDAGToDAGISel::Select(SDValue Op) {
SDValue MulLHS = N->getOperand(0);
SDValue MulRHS = N->getOperand(1);
unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
- SDNode *Mul = CurDAG->getTargetNode(Opcode, dl, MVT::i32, MVT::Flag,
- MulLHS, MulRHS);
+ SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+ MulLHS, MulRHS);
// The high part is in the Y register.
return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
return NULL;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 4c3efde..164770d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -21,7 +21,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -31,18 +33,21 @@ using namespace llvm;
#include "SparcGenCallingConv.inc"
-static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CC, isVarArg, DAG.getTarget(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
+ RVLocs, *DAG.getContext());
- // Analize return values of ISD::RET
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_Sparc32);
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
@@ -52,7 +57,6 @@ static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- SDValue Chain = Op.getOperand(0);
SDValue Flag;
// Copy the result values into the output registers.
@@ -60,10 +64,8 @@ static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums.
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
@@ -74,55 +76,64 @@ static SDValue LowerRET(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain);
}
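
The copy loop above leans on SelectionDAG glue: threading each copy's flag result into the next getCopyToReg keeps the scheduler from hoisting anything between the copies and the return. The idiom in isolation, for one i32 value returned in %i0 (a sketch; RetVal stands in for Outs[0].Val):

    SDValue Flag;                               // null before the first copy
    Chain = DAG.getCopyToReg(Chain, dl, SP::I0, RetVal, Flag);
    Flag  = Chain.getValue(1);                  // glue output of this copy
    // SPISD::RET_FLAG then consumes Chain (and Flag when present), pinning
    // the copy directly against the return.
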
-/// LowerArguments - V8 uses a very simple ABI, where all values are passed in
-/// either one or two GPRs, including FP values. TODO: we should pass FP values
-/// in FP registers for fastcc functions.
-void
-SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &ArgValues,
- DebugLoc dl) {
+/// LowerFormalArguments - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values. TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue
+SparcTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
static const unsigned ArgRegs[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
};
-
const unsigned *CurArgReg = ArgRegs, *ArgRegEnd = ArgRegs+6;
unsigned ArgOffset = 68;
- SDValue Root = DAG.getRoot();
- std::vector<SDValue> OutChains;
-
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- MVT ObjectVT = getValueType(I->getType());
-
- switch (ObjectVT.getSimpleVT()) {
- default: assert(0 && "Unhandled argument type!");
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ SDValue ArgValue;
+ CCValAssign &VA = ArgLocs[i];
+ // FIXME: We ignore the register assignments of AnalyzeFormalArguments
+ // because it doesn't know how to split a double into two i32 registers.
+ EVT ObjectVT = VA.getValVT();
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i8:
case MVT::i16:
case MVT::i32:
- if (I->use_empty()) { // Argument is dead.
+ if (!Ins[i].Used) { // Argument is dead.
if (CurArgReg < ArgRegEnd) ++CurArgReg;
- ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ InVals.push_back(DAG.getUNDEF(ObjectVT));
} else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
- SDValue Arg = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (ObjectVT != MVT::i32) {
unsigned AssertOp = ISD::AssertSext;
Arg = DAG.getNode(AssertOp, dl, MVT::i32, Arg,
DAG.getValueType(ObjectVT));
Arg = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Arg);
}
- ArgValues.push_back(Arg);
+ InVals.push_back(Arg);
} else {
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
SDValue Load;
if (ObjectVT == MVT::i32) {
- Load = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ Load = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
} else {
ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
@@ -130,63 +141,63 @@ SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
unsigned Offset = 4-std::max(1U, ObjectVT.getSizeInBits()/8);
FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
DAG.getConstant(Offset, MVT::i32));
- Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Root, FIPtr,
+ Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
NULL, 0, ObjectVT);
Load = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Load);
}
- ArgValues.push_back(Load);
+ InVals.push_back(Load);
}
ArgOffset += 4;
break;
case MVT::f32:
- if (I->use_empty()) { // Argument is dead.
+ if (!Ins[i].Used) { // Argument is dead.
if (CurArgReg < ArgRegEnd) ++CurArgReg;
- ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ InVals.push_back(DAG.getUNDEF(ObjectVT));
} else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
// FP value is passed in an integer register.
unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
- SDValue Arg = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Arg);
- ArgValues.push_back(Arg);
+ InVals.push_back(Arg);
} else {
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- SDValue Load = DAG.getLoad(MVT::f32, dl, Root, FIPtr, NULL, 0);
- ArgValues.push_back(Load);
+ SDValue Load = DAG.getLoad(MVT::f32, dl, Chain, FIPtr, NULL, 0);
+ InVals.push_back(Load);
}
ArgOffset += 4;
break;
case MVT::i64:
case MVT::f64:
- if (I->use_empty()) { // Argument is dead.
+ if (!Ins[i].Used) { // Argument is dead.
if (CurArgReg < ArgRegEnd) ++CurArgReg;
if (CurArgReg < ArgRegEnd) ++CurArgReg;
- ArgValues.push_back(DAG.getUNDEF(ObjectVT));
+ InVals.push_back(DAG.getUNDEF(ObjectVT));
} else {
SDValue HiVal;
if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg++, VRegHi);
- HiVal = DAG.getCopyFromReg(Root, dl, VRegHi, MVT::i32);
+ HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
} else {
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- HiVal = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
}
SDValue LoVal;
if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR
unsigned VRegLo = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg++, VRegLo);
- LoVal = DAG.getCopyFromReg(Root, dl, VRegLo, MVT::i32);
+ LoVal = DAG.getCopyFromReg(Chain, dl, VRegLo, MVT::i32);
} else {
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
- LoVal = DAG.getLoad(MVT::i32, dl, Root, FIPtr, NULL, 0);
+ LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
}
// Compose the two halves together into an i64 unit.
@@ -197,7 +208,7 @@ SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
if (ObjectVT == MVT::f64)
WholeValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, WholeValue);
- ArgValues.push_back(WholeValue);
+ InVals.push_back(WholeValue);
}
ArgOffset += 8;
break;
@@ -205,10 +216,12 @@ SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
}
// Store remaining ArgRegs to the stack if this is a varargs function.
- if (F.isVarArg()) {
+ if (isVarArg) {
// Remember the vararg offset for the va_start implementation.
VarArgsFrameOffset = ArgOffset;
+ std::vector<SDValue> OutChains;
+
for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
@@ -220,26 +233,31 @@ SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG,
OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, NULL, 0));
ArgOffset += 4;
}
+
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], OutChains.size());
+ }
}
- if (!OutChains.empty())
- DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &OutChains[0], OutChains.size()));
+ return Chain;
}
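
The i64/f64 case above is why the register assignments computed by AnalyzeFormalArguments are ignored (the FIXME near the top of the loop): CC_Sparc32 cannot yet describe a double split across two integer registers. Concretely, for f(double d) the halves arrive in %i0/%i1 (or their stack homes) and are reassembled as in this sketch:

    // HiVal holds the high word (%i0), LoVal the low word (%i1).
    SDValue WholeValue =
      DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
    // Reinterpret the i64 bit pattern as the f64 argument value.
    WholeValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, WholeValue);
    InVals.push_back(WholeValue);
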
-static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- unsigned CallingConv = TheCall->getCallingConv();
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = TheCall->getDebugLoc();
+SDValue
+SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
#if 0
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallingConv, isVarArg, DAG.getTarget(), ArgLocs);
- CCInfo.AnalyzeCallOperands(Op.getNode(), CC_Sparc32);
+ CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
// Get the size of the outgoing arguments stack space requirement.
unsigned ArgsSize = CCInfo.getNextStackOffset();
@@ -249,9 +267,9 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Count the size of the outgoing arguments.
unsigned ArgsSize = 0;
- for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; ++i) {
- switch (TheCall->getArg(i).getValueType().getSimpleVT()) {
- default: assert(0 && "Unknown value type!");
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ switch (Outs[i].Val.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unknown value type!");
case MVT::i1:
case MVT::i8:
case MVT::i16:
@@ -283,13 +301,11 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
-
- // Arguments start after the 5 first operands of ISD::CALL
- SDValue Arg = TheCall->getArg(i);
+ SDValue Arg = Outs[i].Val;
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
@@ -325,13 +341,13 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
};
unsigned ArgOffset = 68;
- for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; ++i) {
- SDValue Val = TheCall->getArg(i);
- MVT ObjectVT = Val.getValueType();
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ SDValue Val = Outs[i].Val;
+ EVT ObjectVT = Val.getValueType();
SDValue ValToStore(0, 0);
unsigned ObjSize;
- switch (ObjectVT.getSimpleVT()) {
- default: assert(0 && "Unhandled argument type!");
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
case MVT::i32:
ObjSize = 4;
@@ -446,7 +462,7 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
- std::vector<MVT> NodeTys;
+ std::vector<EVT> NodeTys;
NodeTys.push_back(MVT::Other); // Returns a chain
NodeTys.push_back(MVT::Flag); // Returns a flag for retval copy to use.
SDValue Ops[] = { Chain, Callee, InFlag };
@@ -459,10 +475,10 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState RVInfo(CallingConv, isVarArg, DAG.getTarget(), RVLocs);
+ CCState RVInfo(CallConv, isVarArg, DAG.getTarget(),
+ RVLocs, *DAG.getContext());
- RVInfo.AnalyzeCallResult(TheCall, RetCC_Sparc32);
- SmallVector<SDValue, 8> ResultVals;
+ RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -475,15 +491,10 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
Chain = DAG.getCopyFromReg(Chain, dl, Reg,
RVLocs[i].getValVT(), InFlag).getValue(1);
InFlag = Chain.getValue(2);
- ResultVals.push_back(Chain.getValue(0));
+ InVals.push_back(Chain.getValue(0));
}
- ResultVals.push_back(Chain);
-
- // Merge everything together with a MERGE_VALUES node.
- return DAG.getNode(ISD::MERGE_VALUES, dl,
- TheCall->getVTList(), &ResultVals[0],
- ResultVals.size());
+ return Chain;
}
@@ -496,7 +507,7 @@ static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG) {
/// condition.
static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown integer condition code!");
+ default: llvm_unreachable("Unknown integer condition code!");
case ISD::SETEQ: return SPCC::ICC_E;
case ISD::SETNE: return SPCC::ICC_NE;
case ISD::SETLT: return SPCC::ICC_L;
@@ -514,7 +525,7 @@ static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
/// FCC condition.
static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
switch (CC) {
- default: assert(0 && "Unknown fp condition code!");
+ default: llvm_unreachable("Unknown fp condition code!");
case ISD::SETEQ:
case ISD::SETOEQ: return SPCC::FCC_E;
case ISD::SETNE:
@@ -538,9 +549,8 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
}
}
-
SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
- : TargetLowering(TM) {
+ : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
// Set up the register classes.
addRegisterClass(MVT::i32, SP::IntRegsRegisterClass);
@@ -635,9 +645,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
- // RET must be custom lowered, to meet ABI requirements
- setOperationAction(ISD::RET , MVT::Other, Custom);
-
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART , MVT::Other, Custom);
// VAARG needs to be lowered to not do unaligned accesses for doubles.
@@ -654,7 +661,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
- setOperationAction(ISD::DECLARE, MVT::Other, Expand);
setStackPointerRegisterToSaveRestore(SP::O6);
@@ -734,17 +740,29 @@ static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
}
}
-static SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) {
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) {
GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
// FIXME there isn't really any debug info here
DebugLoc dl = Op.getDebugLoc();
SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, GA);
SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, GA);
- return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+ SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl,
+ getPointerTy());
+ SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+ SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ GlobalBase, RelAddr);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ AbsAddr, NULL, 0);
}
-static SDValue LowerCONSTANTPOOL(SDValue Op, SelectionDAG &DAG) {
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) {
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
// FIXME there isn't really any debug info here
DebugLoc dl = Op.getDebugLoc();
@@ -752,7 +770,16 @@ static SDValue LowerCONSTANTPOOL(SDValue Op, SelectionDAG &DAG) {
SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, CP);
SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, CP);
- return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+ SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl,
+ getPointerTy());
+ SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+ SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32,
+ GlobalBase, RelAddr);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ AbsAddr, NULL, 0);
}
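
Both PIC paths above follow the same recipe: under PIC the %hi/%lo pair is a GOT-relative offset rather than the symbol's absolute address, so the offset is added to the global base and the real address is loaded from the resulting GOT slot. Condensed (a sketch; the asm comments assume the usual sethi/or printing of SPISD::Hi and SPISD::Lo):

    // Non-PIC:  sethi %hi(sym), %t ;  or %t, %lo(sym), %t     -> address
    // PIC:      ld [%base + (%hi(sym) | %lo(sym))], %dst      -> address
    SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
    SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, GlobalBase, RelAddr);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), AbsAddr,
                       NULL, 0);  // dereference the GOT slot
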
static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
@@ -787,7 +814,7 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
// Get the condition flag.
SDValue CompareFlag;
if (LHS.getValueType() == MVT::i32) {
- std::vector<MVT> VTs;
+ std::vector<EVT> VTs;
VTs.push_back(MVT::i32);
VTs.push_back(MVT::Flag);
SDValue Ops[2] = { LHS, RHS };
@@ -818,7 +845,7 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
SDValue CompareFlag;
if (LHS.getValueType() == MVT::i32) {
- std::vector<MVT> VTs;
+ std::vector<EVT> VTs;
VTs.push_back(LHS.getValueType()); // subcc returns a value
VTs.push_back(MVT::Flag);
SDValue Ops[2] = { LHS, RHS };
@@ -849,7 +876,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
- MVT VT = Node->getValueType(0);
+ EVT VT = Node->getValueType(0);
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
@@ -900,14 +927,14 @@ static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- default: assert(0 && "Should not custom lower this!");
+ default: llvm_unreachable("Should not custom lower this!");
// Frame & Return address. Currently unimplemented
case ISD::RETURNADDR: return SDValue();
case ISD::FRAMEADDR: return SDValue();
case ISD::GlobalTLSAddress:
- assert(0 && "TLS not implemented for Sparc.");
- case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG);
- case ISD::ConstantPool: return LowerCONSTANTPOOL(Op, DAG);
+ llvm_unreachable("TLS not implemented for Sparc.");
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::BR_CC: return LowerBR_CC(Op, DAG);
@@ -915,21 +942,20 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
}
}
MachineBasicBlock *
SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
unsigned BROpcode;
unsigned CC;
DebugLoc dl = MI->getDebugLoc();
// Figure out the conditional branch opcode to use for this select_cc.
switch (MI->getOpcode()) {
- default: assert(0 && "Unknown SELECT_CC!");
+ default: llvm_unreachable("Unknown SELECT_CC!");
case SP::SELECT_CC_Int_ICC:
case SP::SELECT_CC_FP_ICC:
case SP::SELECT_CC_DFP_ICC:
@@ -964,9 +990,18 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
- // Update machine-CFG edges by transferring all successors of the current
+ // Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- sinkMBB->transferSuccessors(BB);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
+  // Next, remove all successors of the current block.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
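
The edits above rebuild the standard select_cc diamond; what is new is the EM map, through which the custom inserter tells the DAG selector that CFG edges formerly leaving BB now leave sinkMBB, so PHI operands in successor blocks can be updated. The resulting shape, as a sketch:

    //   thisMBB:  subcc/fcmps ...; bCC sinkMBB   ; branch if condition true
    //      |  \
    //      |   copy0MBB                          ; fallthrough: "false" value
    //      |  /
    //   sinkMBB:  %res = phi [TrueVal, thisMBB], [FalseVal, copy0MBB]
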
@@ -1011,7 +1046,7 @@ SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass*>
SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
@@ -1024,7 +1059,7 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
std::vector<unsigned> SparcTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() != 1)
return std::vector<unsigned>();
@@ -1050,5 +1085,5 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned SparcTargetLowering::getFunctionAlignment(const Function *) const {
- return 4;
+ return 2;
}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 27ce1b7..55781be 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -35,7 +35,8 @@ namespace llvm {
ITOF, // Int to FP within a FP register.
CALL, // A call instruction.
- RET_FLAG // Return with a flag operand.
+ RET_FLAG, // Return with a flag operand.
+ GLOBAL_BASE_REG // Global base reg for PIC
};
}
@@ -57,25 +58,49 @@ namespace llvm {
const SelectionDAG &DAG,
unsigned Depth = 0) const;
- virtual void LowerArguments(Function &F, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &ArgValues,
- DebugLoc dl);
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
virtual const char *getTargetNodeName(unsigned Opcode) const;
ConstraintType getConstraintType(const std::string &Constraint) const;
std::pair<unsigned, const TargetRegisterClass*>
- getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 12c286a..8667bca 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -17,7 +17,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "SparcGenInstrInfo.inc"
+#include "SparcMachineFunctionInfo.h"
using namespace llvm;
SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
@@ -160,30 +163,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
else
- assert(0 && "Can't store this register to stack slot");
-}
-
-void SparcInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Opc = 0;
- DebugLoc DL = DebugLoc::getUnknownLoc();
- if (RC == SP::IntRegsRegisterClass)
- Opc = SP::STri;
- else if (RC == SP::FPRegsRegisterClass)
- Opc = SP::STFri;
- else if (RC == SP::DFPRegsRegisterClass)
- Opc = SP::STDFri;
- else
- assert(0 && "Can't load this register");
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- MIB.addReg(SrcReg, getKillRegState(isKill));
- NewMIs.push_back(MIB);
- return;
+ llvm_unreachable("Can't store this register to stack slot");
}
void SparcInstrInfo::
@@ -200,28 +180,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
else if (RC == SP::DFPRegsRegisterClass)
BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
else
- assert(0 && "Can't load this register from stack slot");
-}
-
-void SparcInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Opc = 0;
- if (RC == SP::IntRegsRegisterClass)
- Opc = SP::LDri;
- else if (RC == SP::FPRegsRegisterClass)
- Opc = SP::LDFri;
- else if (RC == SP::DFPRegsRegisterClass)
- Opc = SP::LDDFri;
- else
- assert(0 && "Can't load this register");
- DebugLoc DL = DebugLoc::getUnknownLoc();
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
- NewMIs.push_back(MIB);
- return;
+ llvm_unreachable("Can't load this register from stack slot");
}
MachineInstr *SparcInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -278,3 +237,25 @@ MachineInstr *SparcInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NewMI;
}
+
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
+ unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+  // Insert the instruction that sets GlobalBaseReg into the function's first MBB.
+ MachineBasicBlock &FirstMBB = MF->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+ GlobalBaseReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+
+ BuildMI(FirstMBB, MBBI, dl, get(SP::GETPCX), GlobalBaseReg);
+ SparcFI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index ab661b9..345674b 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -81,20 +81,10 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
@@ -107,6 +97,8 @@ public:
MachineInstr* LoadMI) const {
return 0;
}
+
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
};
}
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 2d6c920..44821b8 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -117,7 +117,7 @@ def SPitof : SDNode<"SPISD::ITOF", SDTSPITOF>;
def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInFlag]>;
def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInFlag]>;
-// These are target-independent nodes, but have target-specific formats.
+// These are target-independent nodes, but have target-specific formats.
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -134,6 +134,10 @@ def call : SDNode<"SPISD::CALL", SDT_SPCall,
def retflag : SDNode<"SPISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInFlag]>;
+def getPCX : Operand<i32> {
+ let PrintMethod = "printGetPCX";
+}
+
//===----------------------------------------------------------------------===//
// SPARC Flag Conditions
//===----------------------------------------------------------------------===//
@@ -207,6 +211,11 @@ multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSP<outs, ins, asmstr, pattern>;
+// GETPCX for PIC
+let Defs = [O7], Uses = [O7] in {
+ def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
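
GETPCX both defines and uses %o7 because the conventional way to read the PC on V8 is a call whose return address materializes in %o7; the pseudo's operand is printed by printGetPCX (the PrintMethod declared above, implemented in the asm printer). The emitted sequence looks roughly like this (an illustrative sketch; the exact labels, GOT offsets, and destination register come from printGetPCX, with %g1 standing in for the destination here):

    .LLGETPCH0:
            call  .LLGETPC0                          ! PC of the call -> %o7
            sethi %hi(_GLOBAL_OFFSET_TABLE_+4), %g1
    .LLGETPC0:
            or    %g1, %lo(_GLOBAL_OFFSET_TABLE_+4), %g1
            add   %g1, %o7, %g1                      ! %g1 now holds the GOT base
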
let Defs = [O6], Uses = [O6] in {
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
"!ADJCALLSTACKDOWN $amt",
@@ -431,18 +440,23 @@ def LEA_ADDri : F3_2<2, 0b000000,
(outs IntRegs:$dst), (ins MEMri:$addr),
"add ${addr:arith}, $dst",
[(set IntRegs:$dst, ADDRri:$addr)]>;
-
-defm ADDCC : F3_12<"addcc", 0b010000, addc>;
+
+let Defs = [ICC] in
+ defm ADDCC : F3_12<"addcc", 0b010000, addc>;
+
defm ADDX : F3_12<"addx", 0b001000, adde>;
// Section B.15 - Subtract Instructions, p. 110
defm SUB : F3_12 <"sub" , 0b000100, sub>;
defm SUBX : F3_12 <"subx" , 0b001100, sube>;
-defm SUBCC : F3_12 <"subcc", 0b010100, SPcmpicc>;
-def SUBXCCrr: F3_1<2, 0b011100,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- "subxcc $b, $c, $dst", []>;
+let Defs = [ICC] in {
+ defm SUBCC : F3_12 <"subcc", 0b010100, SPcmpicc>;
+
+ def SUBXCCrr: F3_1<2, 0b011100,
+ (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+ "subxcc $b, $c, $dst", []>;
+}
// Section B.18 - Multiply Instructions, p. 113
defm UMUL : F3_12np<"umul", 0b001010>;
@@ -471,11 +485,12 @@ let isBarrier = 1 in
def BA : BranchSP<0b1000, (ins brtarget:$dst),
"ba $dst",
[(br bb:$dst)]>;
-
+
// FIXME: the encoding for the JIT should look at the condition field.
-def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "b$cc $dst",
- [(SPbricc bb:$dst, imm:$cc)]>;
+let Uses = [ICC] in
+ def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+ "b$cc $dst",
+ [(SPbricc bb:$dst, imm:$cc)]>;
// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
@@ -489,9 +504,10 @@ class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
}
// FIXME: the encoding for the JIT should look at the condition field.
-def FBCOND : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "fb$cc $dst",
- [(SPbrfcc bb:$dst, imm:$cc)]>;
+let Uses = [FCC] in
+ def FBCOND : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+ "fb$cc $dst",
+ [(SPbrfcc bb:$dst, imm:$cc)]>;
// Section B.24 - Call and Link Instruction, p. 125
@@ -633,15 +649,16 @@ def FDIVD : F3_3<2, 0b110100, 0b001001110,
// Note 2: the result of a FCMP is not available until the 2nd cycle
// after the instr is retired, but there is no interlock. This behavior
// is modelled with a forced noop after the instruction.
-def FCMPS : F3_3<2, 0b110101, 0b001010001,
- (outs), (ins FPRegs:$src1, FPRegs:$src2),
- "fcmps $src1, $src2\n\tnop",
- [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>;
-def FCMPD : F3_3<2, 0b110101, 0b001010010,
- (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fcmpd $src1, $src2\n\tnop",
- [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>;
-
+let Defs = [FCC] in {
+ def FCMPS : F3_3<2, 0b110101, 0b001010001,
+ (outs), (ins FPRegs:$src1, FPRegs:$src2),
+ "fcmps $src1, $src2\n\tnop",
+ [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>;
+ def FCMPD : F3_3<2, 0b110101, 0b001010010,
+ (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fcmpd $src1, $src2\n\tnop",
+ [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>;
+}
//===----------------------------------------------------------------------===//
// V9 Instructions
@@ -754,8 +771,6 @@ def : Pat<(call tglobaladdr:$dst),
def : Pat<(call texternalsym:$dst),
(CALL texternalsym:$dst)>;
-def : Pat<(ret), (RETL)>;
-
// Map integer extload's to zextloads.
def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
diff --git a/lib/Target/Sparc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/SparcMCAsmInfo.cpp
new file mode 100644
index 0000000..b67537c
--- /dev/null
+++ b/lib/Target/Sparc/SparcMCAsmInfo.cpp
@@ -0,0 +1,38 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, const StringRef &TT) {
+ Data16bitsDirective = "\t.half\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = 0; // .xword is only supported by V9.
+ ZeroDirective = "\t.skip\t";
+ CommentString = "!";
+ COMMDirectiveTakesAlignment = true;
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+
+ SunStyleELFSectionSwitchSyntax = true;
+ UsesELFSectionDirectiveForBSS = true;
+
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ PrivateGlobalPrefix = ".L";
+}
+
diff --git a/lib/Target/Sparc/SparcMCAsmInfo.h b/lib/Target/Sparc/SparcMCAsmInfo.h
new file mode 100644
index 0000000..12d6ef4
--- /dev/null
+++ b/lib/Target/Sparc/SparcMCAsmInfo.h
@@ -0,0 +1,28 @@
+//=====-- SparcMCAsmInfo.h - Sparc asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+ struct SparcELFMCAsmInfo : public MCAsmInfo {
+ explicit SparcELFMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 0000000..e457235
--- /dev/null
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,32 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Sparc specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SPARCMACHINEFUNCTIONINFO_H
+#define SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+ class SparcMachineFunctionInfo : public MachineFunctionInfo {
+ private:
+ unsigned GlobalBaseReg;
+ public:
+ SparcMachineFunctionInfo() : GlobalBaseReg(0) {}
+ SparcMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0) {}
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+ };
+}
+
+#endif
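
This is the per-function cache consulted by SparcInstrInfo::getGlobalBaseReg above; MachineFunction::getInfo constructs the object lazily the first time it is requested. Typical access pattern (a sketch mirroring SparcInstrInfo.cpp):

    SparcMachineFunctionInfo *FI = MF->getInfo<SparcMachineFunctionInfo>();
    if (unsigned Base = FI->getGlobalBaseReg())
      return Base;               // GETPCX already emitted for this function
    // ...otherwise create the vreg, emit GETPCX, then FI->setGlobalBaseReg(Base).
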
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 59efb19..7883260 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Type.h"
#include "llvm/ADT/BitVector.h"
@@ -75,8 +76,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+unsigned
+SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
@@ -112,6 +115,7 @@ void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(i).ChangeToRegister(SP::G1, false);
MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1));
}
+ return 0;
}
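
For context, the tail shown above is the out-of-range path of eliminateFrameIndex: SPARC memory instructions take a 13-bit signed immediate, so when the FP-relative offset does not fit, the upper bits are built in %g1 first and the memory operand is rewritten against %g1 with only the low 10 bits (%lo) left as the immediate. Roughly (a sketch; the in-range case just folds the offset into the instruction):

    //   sethi %hi(Offset), %g1       ! upper 22 bits of the offset
    //   add   %g1, %fp, %g1          ! emitted just before this point
    //   ld    [%g1 + %lo(Offset)], ...
    MI.getOperand(i).ChangeToRegister(SP::G1, false);
    MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1));
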
void SparcRegisterInfo::
@@ -168,28 +172,25 @@ void SparcRegisterInfo::emitEpilogue(MachineFunction &MF,
}
unsigned SparcRegisterInfo::getRARegister() const {
- assert(0 && "What is the return address register");
- return 0;
+ return SP::I7;
}
unsigned SparcRegisterInfo::getFrameRegister(MachineFunction &MF) const {
- assert(0 && "What is the frame register");
- return SP::G1;
+ return SP::I6;
}
unsigned SparcRegisterInfo::getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
+ llvm_unreachable("What is the exception register");
return 0;
}
unsigned SparcRegisterInfo::getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
+ llvm_unreachable("What is the exception handler register");
return 0;
}
int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "What is the dwarf register number");
- return -1;
+ return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
}
#include "SparcGenRegisterInfo.inc"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index fc863f3..753b1c0 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -43,8 +43,9 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index e3a50ca..2b05c19 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -16,6 +16,10 @@ class SparcReg<string n> : Register<n> {
let Namespace = "SP";
}
+class SparcCtrlReg<string n>: Register<n> {
+ let Namespace = "SP";
+}
+
// Registers are identified with 5-bit ID numbers.
// Ri - 32-bit integer registers
class Ri<bits<5> num, string n> : SparcReg<n> {
@@ -31,6 +35,10 @@ class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
let SubRegs = subregs;
}
+// Control Registers
+def ICC : SparcCtrlReg<"ICC">;
+def FCC : SparcCtrlReg<"FCC">;
+
// Integer registers
def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
@@ -46,7 +54,7 @@ def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
-def O6 : Ri<14, "O6">, DwarfRegNum<[14]>;
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
@@ -62,7 +70,7 @@ def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
-def I6 : Ri<30, "I6">, DwarfRegNum<[30]>;
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
// Floating-point registers
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index aaddbff..8a88cc0 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -22,7 +22,7 @@ namespace {
cl::desc("Enable V9 instructions in the V8 target"));
}
-SparcSubtarget::SparcSubtarget(const Module &M, const std::string &FS) {
+SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &FS) {
// Set the default features.
IsV9 = false;
V8DeprecatedInsts = false;
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index e5a5ba4..4377034 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -18,14 +18,13 @@
#include <string>
namespace llvm {
- class Module;
-
+
class SparcSubtarget : public TargetSubtarget {
bool IsV9;
bool V8DeprecatedInsts;
bool IsVIS;
public:
- SparcSubtarget(const Module &M, const std::string &FS);
+ SparcSubtarget(const std::string &TT, const std::string &FS);
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 1343bcc..3a38115 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -10,63 +10,30 @@
//
//===----------------------------------------------------------------------===//
-#include "SparcTargetAsmInfo.h"
+#include "SparcMCAsmInfo.h"
#include "SparcTargetMachine.h"
#include "Sparc.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-// Register the target.
-static RegisterTarget<SparcTargetMachine> X("sparc", "SPARC");
+extern "C" void LLVMInitializeSparcTarget() {
+ // Register the target.
+ RegisterTargetMachine<SparcTargetMachine> X(TheSparcTarget);
+ RegisterAsmInfo<SparcELFMCAsmInfo> Y(TheSparcTarget);
-// No assembler printer by default
-SparcTargetMachine::AsmPrinterCtorFn SparcTargetMachine::AsmPrinterCtor = 0;
-
-
-// Force static initialization.
-extern "C" void LLVMInitializeSparcTarget() { }
-
-const TargetAsmInfo *SparcTargetMachine::createTargetAsmInfo() const {
- // FIXME: Handle Solaris subtarget someday :)
- return new SparcELFTargetAsmInfo(*this);
}
/// SparcTargetMachine ctor - Create an ILP32 architecture model
///
-SparcTargetMachine::SparcTargetMachine(const Module &M, const std::string &FS)
- : DataLayout("E-p:32:32-f128:128:128"),
- Subtarget(M, FS), TLInfo(*this), InstrInfo(Subtarget),
+SparcTargetMachine::SparcTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ DataLayout("E-p:32:32-f128:128:128"),
+ Subtarget(TT, FS), TLInfo(*this), InstrInfo(Subtarget),
FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) {
}
-unsigned SparcTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "sparc-")
- return 20;
-
- // If the target triple is something non-sparc, we don't match.
- if (!TT.empty()) return 0;
-
- if (M.getEndianness() == Module::BigEndian &&
- M.getPointerSize() == Module::Pointer32)
-#ifdef __sparc__
- return 20; // BE/32 ==> Prefer sparc on sparc
-#else
- return 5; // BE/32 ==> Prefer ppc elsewhere
-#endif
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
-
-#if defined(__sparc__)
- return 10;
-#else
- return 0;
-#endif
-}
-
bool SparcTargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
PM.add(createSparcISelDag(*this));
@@ -82,14 +49,3 @@ bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM,
PM.add(createSparcDelaySlotFillerPass(*this));
return true;
}
-
-bool SparcTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
- return false;
-}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index ee55d3c..cce5510 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -23,27 +23,15 @@
namespace llvm {
-class Module;
-
class SparcTargetMachine : public LLVMTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
SparcSubtarget Subtarget;
SparcTargetLowering TLInfo;
SparcInstrInfo InstrInfo;
TargetFrameInfo FrameInfo;
-
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- TargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- SparcTargetMachine(const Module &M, const std::string &FS);
+ SparcTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
@@ -55,18 +43,10 @@ public:
return const_cast<SparcTargetLowering*>(&TLInfo);
}
virtual const TargetData *getTargetData() const { return &DataLayout; }
- static unsigned getModuleMatchQuality(const Module &M);
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
-
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..870b56a
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSparcInfo
+ SparcTargetInfo.cpp
+ )
+
+add_dependencies(LLVMSparcInfo SparcCodeGenTable_gen)
diff --git a/lib/Target/Sparc/TargetInfo/Makefile b/lib/Target/Sparc/TargetInfo/Makefile
new file mode 100644
index 0000000..641ed87
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Sparc/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 0000000..5d697bd
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSparcTarget;
+
+extern "C" void LLVMInitializeSparcTargetInfo() {
+ RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
+}
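
Together with LLVMInitializeSparcTarget in SparcTargetMachine.cpp above, this replaces the old getModuleMatchQuality heuristics: targets are now looked up by triple through the registry. Typical client flow (a sketch; error handling elided):

    LLVMInitializeSparcTargetInfo();  // registers TheSparcTarget under "sparc"
    LLVMInitializeSparcTarget();      // attaches TargetMachine/MCAsmInfo ctors
    std::string Err;
    const Target *T =
      TargetRegistry::lookupTarget("sparc-unknown-linux-gnu", Err);
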
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
index f937025..664a43c 100644
--- a/lib/Target/SubtargetFeature.cpp
+++ b/lib/Target/SubtargetFeature.cpp
@@ -12,10 +12,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Target/SubtargetFeature.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Streams.h"
#include <algorithm>
-#include <ostream>
#include <cassert>
#include <cctype>
using namespace llvm;
@@ -145,22 +144,22 @@ static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
// Print the CPU table.
- cerr << "Available CPUs for this target:\n\n";
+ errs() << "Available CPUs for this target:\n\n";
for (size_t i = 0; i != CPUTableSize; i++)
- cerr << " " << CPUTable[i].Key
+ errs() << " " << CPUTable[i].Key
<< std::string(MaxCPULen - std::strlen(CPUTable[i].Key), ' ')
<< " - " << CPUTable[i].Desc << ".\n";
- cerr << "\n";
+ errs() << "\n";
// Print the Feature table.
- cerr << "Available features for this target:\n\n";
+ errs() << "Available features for this target:\n\n";
for (size_t i = 0; i != FeatTableSize; i++)
- cerr << " " << FeatTable[i].Key
+ errs() << " " << FeatTable[i].Key
<< std::string(MaxFeatLen - std::strlen(FeatTable[i].Key), ' ')
<< " - " << FeatTable[i].Desc << ".\n";
- cerr << "\n";
+ errs() << "\n";
- cerr << "Use +feature to enable a feature, or -feature to disable it.\n"
+ errs() << "Use +feature to enable a feature, or -feature to disable it.\n"
<< "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
exit(1);
}
@@ -283,10 +282,9 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
}
} else {
- cerr << "'" << Features[0]
- << "' is not a recognized processor for this target"
- << " (ignoring processor)"
- << "\n";
+ errs() << "'" << Features[0]
+ << "' is not a recognized processor for this target"
+ << " (ignoring processor)\n";
}
// Iterate through each feature
for (size_t i = 1; i < Features.size(); i++) {
@@ -314,10 +312,9 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
}
} else {
- cerr << "'" << Feature
- << "' is not a recognized feature for this target"
- << " (ignoring feature)"
- << "\n";
+ errs() << "'" << Feature
+ << "' is not a recognized feature for this target"
+ << " (ignoring feature)\n";
}
}
@@ -340,25 +337,23 @@ void *SubtargetFeatures::getInfo(const SubtargetInfoKV *Table,
if (Entry) {
return Entry->Value;
} else {
- cerr << "'" << Features[0]
- << "' is not a recognized processor for this target"
- << " (ignoring processor)"
- << "\n";
+ errs() << "'" << Features[0]
+ << "' is not a recognized processor for this target"
+ << " (ignoring processor)\n";
return NULL;
}
}
/// print - Print feature string.
///
-void SubtargetFeatures::print(std::ostream &OS) const {
- for (size_t i = 0; i < Features.size(); i++) {
+void SubtargetFeatures::print(raw_ostream &OS) const {
+ for (size_t i = 0, e = Features.size(); i != e; ++i)
OS << Features[i] << " ";
- }
OS << "\n";
}
/// dump - Dump feature info.
///
void SubtargetFeatures::dump() const {
- print(*cerr.stream());
+ print(errs());
}
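
The strings handled above follow the format from the Help text: the first entry names the CPU, and later entries are "+feature"/"-feature" toggles. A small usage sketch (assuming the AddFeature(name, enabled) helper):

    SubtargetFeatures F("v9");               // CPU name comes first
    F.AddFeature("deprecated-v8", false);    // appends "-deprecated-v8"
    F.print(errs());                         // "v9 -deprecated-v8 " + newline
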
diff --git a/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt b/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..c6be83a
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZAsmPrinter
+ SystemZAsmPrinter.cpp
+ )
+add_dependencies(LLVMSystemZAsmPrinter SystemZCodeGenTable_gen)
diff --git a/lib/Target/SystemZ/AsmPrinter/Makefile b/lib/Target/SystemZ/AsmPrinter/Makefile
new file mode 100644
index 0000000..9a350df
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/SystemZ/AsmPrinter/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZAsmPrinter
+
+# Hack: we need to include 'main' SystemZ target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp
new file mode 100644
index 0000000..a128992
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp
@@ -0,0 +1,391 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly writer ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the SystemZ assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+ class VISIBILITY_HIDDEN SystemZAsmPrinter : public AsmPrinter {
+ public:
+ SystemZAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *MAI, bool V)
+ : AsmPrinter(O, TM, MAI, V) {}
+
+ virtual const char *getPassName() const {
+ return "SystemZ Assembly Printer";
+ }
+
+ void printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printPCRelImmOperand(const MachineInstr *MI, int OpNum);
+ void printRIAddrOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printRRIAddrOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier = 0);
+ void printS16ImmOperand(const MachineInstr *MI, int OpNum) {
+ O << (int16_t)MI->getOperand(OpNum).getImm();
+ }
+ void printS32ImmOperand(const MachineInstr *MI, int OpNum) {
+ O << (int32_t)MI->getOperand(OpNum).getImm();
+ }
+
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printMachineInstruction(const MachineInstr * MI);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &F);
+ void PrintGlobalVariable(const GlobalVariable* GVar);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ }
+ };
+} // end of anonymous namespace
+
+#include "SystemZGenAsmWriter.inc"
+
+void SystemZAsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ unsigned FnAlign = MF.getAlignment();
+ const Function *F = MF.getFunction();
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+
+ EmitAlignment(FnAlign, F);
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ case Function::LinkerPrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ O << "\t.type\t" << CurrentFnName << ",@function\n"
+ << CurrentFnName << ":\n";
+}
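+// A sketch of the header emitted above for an externally visible function
+// named "foo" (hypothetical name): after the section switch and alignment,
+//   .globl foo
+//   .type  foo,@function
+// foo: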
+
+bool SystemZAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+  // Print the 'header' of the function
+ emitFunctionHeader(MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ EmitBasicBlockStart(I);
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II)
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // We didn't modify anything
+ return false;
+}
+
+void SystemZAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ processDebugLoc(MI, true);
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+
+ processDebugLoc(MI, false);
+}
+
+void SystemZAsmPrinter::printPCRelImmOperand(const MachineInstr *MI, int OpNum){
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getMangledName(GV);
+
+ O << Name;
+
+ // Assemble calls via PLT for externally visible symbols if PIC.
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ !GV->hasHiddenVisibility() && !GV->hasProtectedVisibility() &&
+ !GV->hasLocalLinkage())
+ O << "@PLT";
+
+ printOffset(MO.getOffset());
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ std::string Name(MAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ O << Name;
+
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ O << "@PLT";
+
+ return;
+ }
+ default:
+ assert(0 && "Not implemented yet!");
+ }
+}
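+// A sketch of the output above: a PIC call to an external, non-hidden global
+// "memcpy" (hypothetical example) prints as "memcpy@PLT"; under non-PIC
+// relocation models the "@PLT" suffix is omitted.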
+
+
+void SystemZAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should be already mapped!");
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", 6) == 0) {
+ if (strncmp(Modifier + 7, "even", 4) == 0)
+ Reg = TRI->getSubReg(Reg, SystemZ::SUBREG_EVEN);
+ else if (strncmp(Modifier + 7, "odd", 3) == 0)
+ Reg = TRI->getSubReg(Reg, SystemZ::SUBREG_ODD);
+ else
+ assert(0 && "Invalid subreg modifier");
+ }
+
+ O << '%' << getRegisterName(Reg);
+ return;
+ }
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ return;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getMangledName(GV);
+
+ O << Name;
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ std::string Name(MAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ O << Name;
+ break;
+ }
+ default:
+ assert(0 && "Not implemented yet!");
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case SystemZII::MO_NO_FLAG:
+ break;
+ case SystemZII::MO_GOTENT: O << "@GOTENT"; break;
+ case SystemZII::MO_PLT: O << "@PLT"; break;
+ }
+
+ printOffset(MO.getOffset());
+}
+
+void SystemZAsmPrinter::printRIAddrOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &Base = MI->getOperand(OpNum);
+
+ // Print displacement operand.
+ printOperand(MI, OpNum+1);
+
+ // Print base operand (if any)
+ if (Base.getReg()) {
+ O << '(';
+ printOperand(MI, OpNum);
+ O << ')';
+ }
+}
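+// For example (hypothetical operands), a displacement of 8 with a base
+// register such as %r15 prints as "8(%r15)"; with no base register, just "8"
+// is printed.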
+
+void SystemZAsmPrinter::printRRIAddrOperand(const MachineInstr *MI, int OpNum,
+ const char* Modifier) {
+ const MachineOperand &Base = MI->getOperand(OpNum);
+ const MachineOperand &Index = MI->getOperand(OpNum+2);
+
+ // Print displacement operand.
+ printOperand(MI, OpNum+1);
+
+ // Print base operand (if any)
+ if (Base.getReg()) {
+ O << '(';
+ printOperand(MI, OpNum);
+ if (Index.getReg()) {
+ O << ',';
+ printOperand(MI, OpNum+2);
+ }
+ O << ')';
+ } else
+ assert(!Index.getReg() && "Should allocate base register first!");
+}
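+// For example (hypothetical operands), displacement 16 with both registers
+// present prints as "16(base,index)"; the index is only printed when a base
+// register exists, which the assert above enforces.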
+
+void SystemZAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+    return;   // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar))
+ return;
+
+ std::string name = Mang->getMangledName(GVar);
+ Constant *C = GVar->getInitializer();
+ unsigned Size = TD->getTypeAllocSize(C->getType());
+ unsigned Align = std::max(1U, TD->getPreferredAlignmentLog(GVar));
+
+ printVisibility(name, GVar->getVisibility());
+
+ O << "\t.type\t" << name << ",@object\n";
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GVar, Mang,
+ TM));
+
+ if (C->isNullValue() && !GVar->hasSection() &&
+ !GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << '\n';
+
+ O << MAI->getCOMMDirective() << name << ',' << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+
+ if (VerboseAsm) {
+ O << "\t\t" << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
+ }
+ O << '\n';
+ return;
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ O << "\t.weak\t" << name << '\n';
+ break;
+ case GlobalValue::DLLExportLinkage:
+ case GlobalValue::AppendingLinkage:
+  // FIXME: appending linkage variables should go into a section named after
+  // them or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+  // Use 16-bit alignment by default to simplify a bunch of stuff.
+ EmitAlignment(Align, GVar, 1);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
+ }
+ O << '\n';
+ if (MAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << name << ", " << Size << '\n';
+
+ EmitGlobalConstant(C);
+}
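+// A rough sketch of the output for an initialized, internal global "bar"
+// (hypothetical name): the ".type bar,@object" directive and section switch,
+// then either a ".local"/".comm" pair for zero initializers, or alignment,
+// "bar:", an optional ".size", and the constant data itself.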
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+ RegisterAsmPrinter<SystemZAsmPrinter> X(TheSystemZTarget);
+}
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
new file mode 100644
index 0000000..81e51d8
--- /dev/null
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS SystemZ.td)
+
+tablegen(SystemZGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SystemZGenRegisterNames.inc -gen-register-enums)
+tablegen(SystemZGenRegisterInfo.inc -gen-register-desc)
+tablegen(SystemZGenInstrNames.inc -gen-instr-enums)
+tablegen(SystemZGenInstrInfo.inc -gen-instr-desc)
+tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
+tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
+tablegen(SystemZGenCallingConv.inc -gen-callingconv)
+tablegen(SystemZGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(SystemZCodeGen
+ SystemZISelDAGToDAG.cpp
+ SystemZISelLowering.cpp
+ SystemZInstrInfo.cpp
+ SystemZMCAsmInfo.cpp
+ SystemZRegisterInfo.cpp
+ SystemZSubtarget.cpp
+ SystemZTargetMachine.cpp
+ )
+
+target_link_libraries (LLVMSystemZCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
new file mode 100644
index 0000000..f1097eb
--- /dev/null
+++ b/lib/Target/SystemZ/Makefile
@@ -0,0 +1,22 @@
+##===- lib/Target/SystemZ/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMSystemZCodeGen
+TARGET = SystemZ
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SystemZGenRegisterInfo.h.inc SystemZGenRegisterNames.inc \
+ SystemZGenRegisterInfo.inc SystemZGenInstrNames.inc \
+ SystemZGenInstrInfo.inc SystemZGenAsmWriter.inc \
+ SystemZGenDAGISel.inc SystemZGenSubtarget.inc SystemZGenCallingConv.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 0000000..ea5240a
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,61 @@
+//=-- SystemZ.h - Top-level interface for SystemZ representation -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_H
+#define LLVM_TARGET_SystemZ_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+ class SystemZTargetMachine;
+ class FunctionPass;
+ class formatted_raw_ostream;
+
+ namespace SystemZCC {
+    // SystemZ-specific condition codes. These correspond to the
+    // SYSTEMZ_*_COND values in SystemZInstrInfo.td and must be kept in sync.
+ enum CondCodes {
+ O = 0,
+ H = 1,
+ NLE = 2,
+ L = 3,
+ NHE = 4,
+ LH = 5,
+ NE = 6,
+ E = 7,
+ NLH = 8,
+ HE = 9,
+ NL = 10,
+ LE = 11,
+ NH = 12,
+ NO = 13,
+ INVALID = -1
+ };
+ }
+
+ FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+ extern Target TheSystemZTarget;
+
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#include "SystemZGenRegisterNames.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#include "SystemZGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 0000000..4c08c08
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,61 @@
+//===- SystemZ.td - Describe the SystemZ Target Machine ------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the SystemZ target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+//===----------------------------------------------------------------------===//
+def FeatureZ10 : SubtargetFeature<"z10", "HasZ10Insts", "true",
+ "Support Z10 instructions">;
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"z9", []>;
+def : Proc<"z10", [FeatureZ10]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZInstrInfo.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def SystemZ : Target {
+ let InstructionSet = SystemZInstrInfo;
+}
+
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 0000000..c799a9e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,46 @@
+//=- SystemZCallingConv.td - Calling Conventions for SystemZ -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the SystemZ architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SystemZ Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // i64 is returned in register R2
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+ // f32 / f64 are returned in F0
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0L, F2L, F4L, F6L]>>
+]>;
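+// For example, a function returning i32 has the value promoted to i64 and
+// returned in R2D; an f64 result comes back in F0L.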
+
+//===----------------------------------------------------------------------===//
+// SystemZ Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // The first 5 integer arguments of non-varargs functions are passed in
+ // integer registers.
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+ // The first 4 floating point arguments of non-varargs functions are passed
+ // in FP registers.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0L, F2L, F4L, F6L]>>,
+
+ // Integer values get stored in stack slots that are 8 bytes in
+ // size and 8-byte aligned.
+ CCIfType<[i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
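+// Worked example: a call with six i64 arguments passes the first five in
+// R2D..R6D and spills the sixth to an 8-byte aligned, 8-byte stack slot.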
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 0000000..028ee89
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,829 @@
+//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZISelLowering.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static const unsigned subreg_even32 = 1;
+static const unsigned subreg_odd32 = 2;
+static const unsigned subreg_even = 3;
+static const unsigned subreg_odd = 4;
+
+namespace {
+  /// SystemZRRIAddressMode - This corresponds to rriaddr, but uses SDValues
+ /// instead of register numbers for the leaves of the matched tree.
+ struct SystemZRRIAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ struct { // This is really a union, discriminated by BaseType!
+ SDValue Reg;
+ int FrameIndex;
+ } Base;
+
+ SDValue IndexReg;
+ int64_t Disp;
+ bool isRI;
+
+ SystemZRRIAddressMode(bool RI = false)
+ : BaseType(RegBase), IndexReg(), Disp(0), isRI(RI) {
+ }
+
+ void dump() {
+ errs() << "SystemZRRIAddressMode " << this << '\n';
+ if (BaseType == RegBase) {
+ errs() << "Base.Reg ";
+ if (Base.Reg.getNode() != 0)
+ Base.Reg.getNode()->dump();
+ else
+ errs() << "nul";
+ errs() << '\n';
+ } else {
+ errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+ }
+ if (!isRI) {
+ errs() << "IndexReg ";
+ if (IndexReg.getNode() != 0) IndexReg.getNode()->dump();
+ else errs() << "nul";
+ }
+ errs() << " Disp " << Disp << '\n';
+ }
+ };
+}
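+// For illustration (hypothetical nodes): an address like
+// (add (add %reg1, %reg2), 16) is represented with Base.Reg = %reg1,
+// IndexReg = %reg2 and Disp = 16. With isRI set, the index register must
+// stay empty, leaving only base + displacement forms.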
+
+/// SystemZDAGToDAGISel - SystemZ specific code to select SystemZ machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+ class SystemZDAGToDAGISel : public SelectionDAGISel {
+ SystemZTargetLowering &Lowering;
+ const SystemZSubtarget &Subtarget;
+
+ void getAddressOperandsRI(const SystemZRRIAddressMode &AM,
+ SDValue &Base, SDValue &Disp);
+ void getAddressOperands(const SystemZRRIAddressMode &AM,
+ SDValue &Base, SDValue &Disp,
+ SDValue &Index);
+
+ public:
+ SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel),
+ Lowering(*TM.getTargetLowering()),
+ Subtarget(*TM.getSubtargetImpl()) { }
+
+ virtual void InstructionSelect();
+
+ virtual const char *getPassName() const {
+ return "SystemZ DAG->DAG Pattern Instruction Selection";
+ }
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDValue getI8Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDValue getI16Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i16);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ // Include the pieces autogenerated from the target description.
+ #include "SystemZGenDAGISel.inc"
+
+ private:
+ bool SelectAddrRI12Only(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp);
+ bool SelectAddrRI12(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp,
+ bool is12BitOnly = false);
+ bool SelectAddrRI(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp);
+ bool SelectAddrRRI12(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+ bool SelectAddrRRI20(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+ bool SelectLAAddr(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+
+ SDNode *Select(SDValue Op);
+
+ bool TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+
+ bool MatchAddress(SDValue N, SystemZRRIAddressMode &AM,
+ bool is12Bit, unsigned Depth = 0);
+ bool MatchAddressBase(SDValue N, SystemZRRIAddressMode &AM);
+ bool MatchAddressRI(SDValue N, SystemZRRIAddressMode &AM,
+ bool is12Bit);
+
+ #ifndef NDEBUG
+ unsigned Indent;
+ #endif
+ };
+} // end anonymous namespace
+
+/// createSystemZISelDag - This pass converts a legalized DAG into a
+/// SystemZ-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+/// isImmSExt20 - This method tests whether the given value can be accurately
+/// represented as a sign extension from a 20-bit value. If so, this returns
+/// true and sets the immediate.
+static bool isImmSExt20(int64_t Val, int64_t &Imm) {
+ if (Val >= -524288 && Val <= 524287) {
+ Imm = Val;
+ return true;
+ }
+ return false;
+}
+
+/// isImmZExt12 - This method tests whether the given value can be accurately
+/// represented as a zero extension from a 12-bit value. If so, this returns
+/// true and sets the immediate.
+static bool isImmZExt12(int64_t Val, int64_t &Imm) {
+ if (Val >= 0 && Val <= 0xFFF) {
+ Imm = Val;
+ return true;
+ }
+ return false;
+}
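+// The ranges above match the two displacement encodings: a signed 20-bit
+// field covers [-2^19, 2^19 - 1] = [-524288, 524287], while an unsigned
+// 12-bit field covers [0, 2^12 - 1] = [0, 4095].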
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode.
+bool SystemZDAGToDAGISel::MatchAddress(SDValue N, SystemZRRIAddressMode &AM,
+ bool is12Bit, unsigned Depth) {
+ DebugLoc dl = N.getDebugLoc();
+ DEBUG(errs() << "MatchAddress: "; AM.dump());
+ // Limit recursion.
+ if (Depth > 5)
+ return MatchAddressBase(N, AM);
+
+ // FIXME: We can perform better here. If we have something like
+ // (shift (add A, imm), N), we can try to reassociate stuff and fold shift of
+ // imm into addressing mode.
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ int64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ int64_t Imm = 0;
+ bool Match = (is12Bit ?
+ isImmZExt12(AM.Disp + Val, Imm) :
+ isImmSExt20(AM.Disp + Val, Imm));
+ if (Match) {
+ AM.Disp = Imm;
+ return false;
+ }
+ break;
+ }
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase &&
+ AM.Base.Reg.getNode() == 0) {
+ AM.BaseType = SystemZRRIAddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address and the
+    // index field is unused, use -B as the index. This is a win if A has
+    // multiple parts that can be folded into the address. Also, this saves
+    // a mov if the base register has other uses, since it avoids a
+    // two-address sub instruction; however, it costs an additional mov if
+    // the index register has other uses.
+
+ // Test if the LHS of the sub can be folded.
+ SystemZRRIAddressMode Backup = AM;
+ if (MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRI) {
+ AM = Backup;
+ break;
+ }
+
+ // If the base is a register with multiple uses, this transformation may
+ // save a mov. Otherwise it's probably better not to do it.
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase &&
+ (!AM.Base.Reg.getNode() || AM.Base.Reg.getNode()->hasOneUse())) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue RHS = N.getNode()->getOperand(1);
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+
+ // Insert the new nodes into the topological ordering.
+ if (Zero.getNode()->getNodeId() == -1 ||
+ Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Zero.getNode());
+ Zero.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Neg.getNode()->getNodeId() == -1 ||
+ Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Neg.getNode());
+ Neg.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ return false;
+ }
+
+ case ISD::ADD: {
+ SystemZRRIAddressMode Backup = AM;
+ if (!MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(1), AM, is12Bit, Depth+1))
+ return false;
+ AM = Backup;
+ if (!MatchAddress(N.getNode()->getOperand(1), AM, is12Bit, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (!AM.isRI &&
+ AM.BaseType == SystemZRRIAddressMode::RegBase &&
+ !AM.Base.Reg.getNode() && !AM.IndexReg.getNode()) {
+ AM.Base.Reg = N.getNode()->getOperand(0);
+ AM.IndexReg = N.getNode()->getOperand(1);
+ return false;
+ }
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ SystemZRRIAddressMode Backup = AM;
+ int64_t Offset = CN->getSExtValue();
+ int64_t Imm = 0;
+ bool MatchOffset = (is12Bit ?
+ isImmZExt12(AM.Disp + Offset, Imm) :
+ isImmSExt20(AM.Disp + Offset, Imm));
+      // The resulting displacement must fit in 12 or 20 bits.
+ if (MatchOffset &&
+ // LHS should be an addr mode.
+ !MatchAddress(N.getOperand(0), AM, is12Bit, Depth+1) &&
+ // Check to see if the LHS & C is zero.
+ CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+ AM.Disp = Imm;
+ return false;
+ }
+ AM = Backup;
+ }
+ break;
+ }
+
+ return MatchAddressBase(N, AM);
+}
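+// A small worked example of the recursion above (hypothetical nodes):
+// matching (add (FrameIndex 1), (Constant 16)) with is12Bit set records the
+// frame index and folds the constant into AM.Disp via isImmZExt12, ending
+// with FrameIndexBase, Disp = 16 and no index register.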
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool SystemZDAGToDAGISel::MatchAddressBase(SDValue N,
+ SystemZRRIAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != SystemZRRIAddressMode::RegBase || AM.Base.Reg.getNode()) {
+ // If so, check to see if the index register is set.
+ if (AM.IndexReg.getNode() == 0 && !AM.isRI) {
+ AM.IndexReg = N;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = SystemZRRIAddressMode::RegBase;
+ AM.Base.Reg = N;
+ return false;
+}
+
+void SystemZDAGToDAGISel::getAddressOperandsRI(const SystemZRRIAddressMode &AM,
+ SDValue &Base, SDValue &Disp) {
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase)
+ Base = AM.Base.Reg;
+ else
+ Base = CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy());
+ Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i64);
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZRRIAddressMode &AM,
+ SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ getAddressOperandsRI(AM, Base, Disp);
+ Index = AM.IndexReg;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// an unsigned 12-bit displacement [r+imm].
+bool SystemZDAGToDAGISel::SelectAddrRI12Only(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp) {
+ return SelectAddrRI12(Op, Addr, Base, Disp, /*is12BitOnly*/true);
+}
+
+bool SystemZDAGToDAGISel::SelectAddrRI12(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp,
+ bool is12BitOnly) {
+ SystemZRRIAddressMode AM20(/*isRI*/true), AM12(/*isRI*/true);
+ bool Done = false;
+
+ if (!Addr.hasOneUse()) {
+ unsigned Opcode = Addr.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into the addressing mode, we'll allow it
+      // even if N has multiple uses. In general, an address computation is
+      // used as an address by all of its uses. But watch out for CopyToReg
+      // uses: that means the address computation is live-out. It will be
+      // computed by an LA, so we want to avoid computing the address twice.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(Addr, AM12);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!Done && MatchAddress(Addr, AM12, /* is12Bit */ true))
+ return false;
+
+  // Check whether we can also match using 20-bit displacements.
+ if (!Done && !is12BitOnly &&
+ !MatchAddress(Addr, AM20, /* is12Bit */ false))
+ if (AM12.Disp == 0 && AM20.Disp != 0)
+ return false;
+
+ DEBUG(errs() << "MatchAddress (final): "; AM12.dump());
+
+ EVT VT = Addr.getValueType();
+ if (AM12.BaseType == SystemZRRIAddressMode::RegBase) {
+ if (!AM12.Base.Reg.getNode())
+ AM12.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ assert(AM12.IndexReg.getNode() == 0 && "Invalid reg-imm address mode!");
+
+ getAddressOperandsRI(AM12, Base, Disp);
+
+ return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// a signed 20-bit displacement [r+imm].
+bool SystemZDAGToDAGISel::SelectAddrRI(SDValue Op, SDValue& Addr,
+ SDValue &Base, SDValue &Disp) {
+ SystemZRRIAddressMode AM(/*isRI*/true);
+ bool Done = false;
+
+ if (!Addr.hasOneUse()) {
+ unsigned Opcode = Addr.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into the addressing mode, we'll allow it
+      // even if N has multiple uses. In general, an address computation is
+      // used as an address by all of its uses. But watch out for CopyToReg
+      // uses: that means the address computation is live-out. It will be
+      // computed by an LA, so we want to avoid computing the address twice.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(Addr, AM);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!Done && MatchAddress(Addr, AM, /* is12Bit */ false))
+ return false;
+
+ DEBUG(errs() << "MatchAddress (final): "; AM.dump());
+
+ EVT VT = Addr.getValueType();
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase) {
+ if (!AM.Base.Reg.getNode())
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ assert(AM.IndexReg.getNode() == 0 && "Invalid reg-imm address mode!");
+
+ getAddressOperandsRI(AM, Base, Disp);
+
+ return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// index register plus an unsigned 12-bit displacement [base + idx + imm].
+bool SystemZDAGToDAGISel::SelectAddrRRI12(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index) {
+ SystemZRRIAddressMode AM20, AM12;
+ bool Done = false;
+
+ if (!Addr.hasOneUse()) {
+ unsigned Opcode = Addr.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into the addressing mode, we'll allow it
+      // even if N has multiple uses. In general, an address computation is
+      // used as an address by all of its uses. But watch out for CopyToReg
+      // uses: that means the address computation is live-out. It will be
+      // computed by an LA, so we want to avoid computing the address twice.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(Addr, AM12);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!Done && MatchAddress(Addr, AM12, /* is12Bit */ true))
+ return false;
+
+  // Check whether we can also match using 20-bit displacements.
+ if (!Done && !MatchAddress(Addr, AM20, /* is12Bit */ false))
+ if (AM12.Disp == 0 && AM20.Disp != 0)
+ return false;
+
+ DEBUG(errs() << "MatchAddress (final): "; AM12.dump());
+
+ EVT VT = Addr.getValueType();
+ if (AM12.BaseType == SystemZRRIAddressMode::RegBase) {
+ if (!AM12.Base.Reg.getNode())
+ AM12.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM12.IndexReg.getNode())
+ AM12.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM12, Base, Disp, Index);
+
+ return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// index register plus a signed 20-bit displacement [base + idx + imm].
+bool SystemZDAGToDAGISel::SelectAddrRRI20(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index) {
+ SystemZRRIAddressMode AM;
+ bool Done = false;
+
+ if (!Addr.hasOneUse()) {
+ unsigned Opcode = Addr.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into the addressing mode, we'll allow it
+      // even if N has multiple uses. In general, an address computation is
+      // used as an address by all of its uses. But watch out for CopyToReg
+      // uses: that means the address computation is live-out. It will be
+      // computed by an LA, so we want to avoid computing the address twice.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(Addr, AM);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!Done && MatchAddress(Addr, AM, /* is12Bit */ false))
+ return false;
+
+ DEBUG(errs() << "MatchAddress (final): "; AM.dump());
+
+ EVT VT = Addr.getValueType();
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase) {
+ if (!AM.Base.Reg.getNode())
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Disp, Index);
+
+ return true;
+}
+
+/// SelectLAAddr - Calls MatchAddress and determines whether the maximal
+/// addressing mode it matches can be cost-effectively emitted as an LA/LAY
+/// instruction.
+bool SystemZDAGToDAGISel::SelectLAAddr(SDValue Op, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index) {
+ SystemZRRIAddressMode AM;
+
+ if (MatchAddress(Addr, AM, false))
+ return false;
+
+ EVT VT = Addr.getValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == SystemZRRIAddressMode::RegBase)
+ if (AM.Base.Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == SystemZRRIAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity += 1;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity += 1;
+
+ if (Complexity > 2) {
+ getAddressOperands(AM, Base, Disp, Index);
+ return true;
+ }
+
+ return false;
+}
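+// The complexity heuristic above only selects LA/LAY when it does real work:
+// e.g. base + index + displacement scores 3 and is matched, while a lone base
+// register scores 1 and is left to ordinary addressing.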
+
+bool SystemZDAGToDAGISel::TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Disp, SDValue &Index) {
+ if (ISD::isNON_EXTLoad(N.getNode()) &&
+ N.hasOneUse() &&
+ IsLegalAndProfitableToFold(N.getNode(), P.getNode(), P.getNode()))
+ return SelectAddrRRI20(P, N.getOperand(1), Base, Disp, Index);
+ return false;
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void SystemZDAGToDAGISel::InstructionSelect() {
+ DEBUG(BB->dump());
+
+ // Codegen the basic block.
+ DEBUG(errs() << "===== Instruction selection begins:\n");
+ DEBUG(Indent = 0);
+ SelectRoot(*CurDAG);
+ DEBUG(errs() << "===== Instruction selection ends:\n");
+
+ CurDAG->RemoveDeadNodes();
+}
+
+SDNode *SystemZDAGToDAGISel::Select(SDValue Op) {
+ SDNode *Node = Op.getNode();
+ EVT NVT = Node->getValueType(0);
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Opcode = Node->getOpcode();
+
+ // Dump information about the Node being selected
+ DEBUG(errs().indent(Indent) << "Selecting: ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ DEBUG(Indent += 2);
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs().indent(Indent-2) << "== ";
+ Node->dump(CurDAG);
+ errs() << "\n");
+ DEBUG(Indent -= 2);
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::SDIVREM: {
+ unsigned Opc, MOpc;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ EVT ResVT;
+ bool is32Bit = false;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i32:
+ Opc = SystemZ::SDIVREM32r; MOpc = SystemZ::SDIVREM32m;
+ ResVT = MVT::v2i64;
+ is32Bit = true;
+ break;
+ case MVT::i64:
+ Opc = SystemZ::SDIVREM64r; MOpc = SystemZ::SDIVREM64m;
+ ResVT = MVT::v2i64;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2;
+ bool foldedLoad = TryFoldLoad(Op, N1, Tmp0, Tmp1, Tmp2);
+
+ // Prepare the dividend
+ SDNode *Dividend;
+ if (is32Bit)
+ Dividend = CurDAG->getMachineNode(SystemZ::MOVSX64rr32, dl, MVT::i64, N0);
+ else
+ Dividend = N0.getNode();
+
+ // Insert prepared dividend into suitable 'subreg'
+ SDNode *Tmp = CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF,
+ dl, ResVT);
+ Dividend =
+ CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl, ResVT,
+ SDValue(Tmp, 0), SDValue(Dividend, 0),
+ CurDAG->getTargetConstant(subreg_odd, MVT::i32));
+
+ SDNode *Result;
+ SDValue DivVal = SDValue(Dividend, 0);
+ if (foldedLoad) {
+ SDValue Ops[] = { DivVal, Tmp0, Tmp1, Tmp2, N1.getOperand(0) };
+ Result = CurDAG->getMachineNode(MOpc, dl, ResVT,
+ Ops, array_lengthof(Ops));
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(Result, 0));
+ } else {
+ Result = CurDAG->getMachineNode(Opc, dl, ResVT, SDValue(Dividend, 0), N1);
+ }
+
+ // Copy the division (odd subreg) result, if it is needed.
+ if (!Op.getValue(0).use_empty()) {
+ unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+ SDNode *Div = CurDAG->getMachineNode(TargetInstrInfo::EXTRACT_SUBREG,
+ dl, NVT,
+ SDValue(Result, 0),
+ CurDAG->getTargetConstant(SubRegIdx,
+ MVT::i32));
+
+ ReplaceUses(Op.getValue(0), SDValue(Div, 0));
+ DEBUG(errs().indent(Indent-2) << "=> ";
+ Result->dump(CurDAG);
+ errs() << "\n");
+ }
+
+ // Copy the remainder (even subreg) result, if it is needed.
+ if (!Op.getValue(1).use_empty()) {
+ unsigned SubRegIdx = (is32Bit ? subreg_even32 : subreg_even);
+ SDNode *Rem = CurDAG->getMachineNode(TargetInstrInfo::EXTRACT_SUBREG,
+ dl, NVT,
+ SDValue(Result, 0),
+ CurDAG->getTargetConstant(SubRegIdx,
+ MVT::i32));
+
+ ReplaceUses(Op.getValue(1), SDValue(Rem, 0));
+ DEBUG(errs().indent(Indent-2) << "=> ";
+ Result->dump(CurDAG);
+ errs() << "\n");
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+ case ISD::UDIVREM: {
+ unsigned Opc, MOpc, ClrOpc;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ EVT ResVT;
+
+ bool is32Bit = false;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i32:
+ Opc = SystemZ::UDIVREM32r; MOpc = SystemZ::UDIVREM32m;
+ ClrOpc = SystemZ::MOV64Pr0_even;
+ ResVT = MVT::v2i32;
+ is32Bit = true;
+ break;
+ case MVT::i64:
+ Opc = SystemZ::UDIVREM64r; MOpc = SystemZ::UDIVREM64m;
+ ClrOpc = SystemZ::MOV128r0_even;
+ ResVT = MVT::v2i64;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2;
+ bool foldedLoad = TryFoldLoad(Op, N1, Tmp0, Tmp1, Tmp2);
+
+ // Prepare the dividend
+ SDNode *Dividend = N0.getNode();
+
+ // Insert prepared dividend into suitable 'subreg'
+ SDNode *Tmp = CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF,
+ dl, ResVT);
+ {
+ unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+ Dividend =
+ CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl, ResVT,
+ SDValue(Tmp, 0), SDValue(Dividend, 0),
+ CurDAG->getTargetConstant(SubRegIdx, MVT::i32));
+ }
+
+ // Zero out even subreg
+ Dividend = CurDAG->getMachineNode(ClrOpc, dl, ResVT, SDValue(Dividend, 0));
+
+ SDValue DivVal = SDValue(Dividend, 0);
+ SDNode *Result;
+ if (foldedLoad) {
+ SDValue Ops[] = { DivVal, Tmp0, Tmp1, Tmp2, N1.getOperand(0) };
+      Result = CurDAG->getMachineNode(MOpc, dl, ResVT,
+ Ops, array_lengthof(Ops));
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(Result, 0));
+ } else {
+ Result = CurDAG->getMachineNode(Opc, dl, ResVT, DivVal, N1);
+ }
+
+ // Copy the division (odd subreg) result, if it is needed.
+ if (!Op.getValue(0).use_empty()) {
+ unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+ SDNode *Div = CurDAG->getMachineNode(TargetInstrInfo::EXTRACT_SUBREG,
+ dl, NVT,
+ SDValue(Result, 0),
+ CurDAG->getTargetConstant(SubRegIdx,
+ MVT::i32));
+ ReplaceUses(Op.getValue(0), SDValue(Div, 0));
+ DEBUG(errs().indent(Indent-2) << "=> ";
+ Result->dump(CurDAG);
+ errs() << "\n");
+ }
+
+ // Copy the remainder (even subreg) result, if it is needed.
+ if (!Op.getValue(1).use_empty()) {
+ unsigned SubRegIdx = (is32Bit ? subreg_even32 : subreg_even);
+ SDNode *Rem = CurDAG->getMachineNode(TargetInstrInfo::EXTRACT_SUBREG,
+ dl, NVT,
+ SDValue(Result, 0),
+ CurDAG->getTargetConstant(SubRegIdx,
+ MVT::i32));
+ ReplaceUses(Op.getValue(1), SDValue(Rem, 0));
+ DEBUG(errs().indent(Indent-2) << "=> ";
+ Result->dump(CurDAG);
+ errs() << "\n");
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+ }
+
+ // Select the default instruction
+ SDNode *ResNode = SelectCode(Op);
+
+ DEBUG(errs().indent(Indent-2) << "=> ";
+ if (ResNode == NULL || ResNode == Op.getNode())
+ Op.getNode()->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ errs() << "\n";
+ );
+ DEBUG(Indent -= 2);
+
+ return ResNode;
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 0000000..07e0d83
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,843 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG Lowering Implementation ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-lower"
+
+#include "SystemZISelLowering.h"
+#include "SystemZ.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/VectorExtras.h"
+using namespace llvm;
+
+SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
+ TargetLowering(tm, new TargetLoweringObjectFileELF()),
+ Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+
+ RegInfo = TM.getRegisterInfo();
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, SystemZ::GR32RegisterClass);
+ addRegisterClass(MVT::i64, SystemZ::GR64RegisterClass);
+  addRegisterClass(MVT::v2i32, SystemZ::GR64PRegisterClass);
+  addRegisterClass(MVT::v2i64, SystemZ::GR128RegisterClass);
+
+ if (!UseSoftFloat) {
+ addRegisterClass(MVT::f32, SystemZ::FP32RegisterClass);
+ addRegisterClass(MVT::f64, SystemZ::FP64RegisterClass);
+
+    addLegalFPImmediate(APFloat(+0.0));  // lzdr
+    addLegalFPImmediate(APFloat(+0.0f)); // lzer
+    addLegalFPImmediate(APFloat(-0.0));  // lzdr + lndr
+    addLegalFPImmediate(APFloat(-0.0f)); // lzer + lner
+ }
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties();
+
+ // Set shifts properties
+ setShiftAmountType(MVT::i64);
+
+ // Provide all sorts of operation actions
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::f64, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
+ setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+ setSchedulingPreference(SchedulingForLatency);
+ setBooleanContents(ZeroOrOneBooleanContent);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Expand);
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+
+ // FIXME: Can we lower these 2 efficiently?
+ setOperationAction(ISD::SETCC, MVT::i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::f32, Expand);
+ setOperationAction(ISD::SETCC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+ // FIXME: Can we support these natively?
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+ // Lower some FP stuff
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+ // We have only 64-bit bitconverts
+ setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+ setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+}
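+// Note: SDIV/UDIV and SREM/UREM are expanded above, so the legalizer emits
+// combined SDIVREM/UDIVREM nodes instead; those are matched by hand in
+// SystemZISelDAGToDAG.cpp's Select method.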
+
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ default:
+ llvm_unreachable("Should not custom lower this!");
+ return SDValue();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SystemZ Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ return C_RegisterClass;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SystemZTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ if (VT == MVT::i32)
+ return std::make_pair(0U, SystemZ::GR32RegisterClass);
+ else if (VT == MVT::i128)
+ return std::make_pair(0U, SystemZ::GR128RegisterClass);
+
+ return std::make_pair(0U, SystemZ::GR64RegisterClass);
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
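+// For example, an inline-asm operand constrained by "r" lands in GR32 for
+// i32 values, GR128 for i128 values, and GR64 otherwise.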
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+SDValue
+SystemZTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ switch (CallConv) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+ }
+}
+
+SDValue
+SystemZTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ switch (CallConv) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::Fast:
+ case CallingConv::C:
+ return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+ Outs, Ins, dl, DAG, InVals);
+ }
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers
+/// and generate load operations for arguments placed on the stack.
+// FIXME: struct return stuff
+// FIXME: varargs
+SDValue
+SystemZTargetLowering::LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+ if (isVarArg)
+ llvm_report_error("Varargs not supported yet");
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ SDValue ArgValue;
+ CCValAssign &VA = ArgLocs[i];
+ EVT LocVT = VA.getLocVT();
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ TargetRegisterClass *RC;
+ switch (LocVT.getSimpleVT().SimpleTy) {
+ default:
+#ifndef NDEBUG
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << LocVT.getSimpleVT().SimpleTy
+ << "\n";
+#endif
+ llvm_unreachable(0);
+ case MVT::i64:
+ RC = SystemZ::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ RC = SystemZ::FP32RegisterClass;
+ break;
+ case MVT::f64:
+ RC = SystemZ::FP64RegisterClass;
+ break;
+ }
+
+ unsigned VReg = RegInfo.createVirtualRegister(RC);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+ } else {
+ // Sanity check
+ assert(VA.isMemLoc());
+
+ // Create the nodes corresponding to a load from this parameter slot.
+ // Create the frame index object for this incoming parameter...
+ int FI = MFI->CreateFixedObject(LocVT.getSizeInBits()/8,
+ VA.getLocMemOffset());
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValue = DAG.getLoad(LocVT, dl, Chain, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0);
+ }
+
+ // If this is an 8/16/32-bit value, it is really passed promoted to 64
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ InVals.push_back(ArgValue);
+ }
+
+ return Chain;
+}
+
+/// LowerCCCCallTo - function arguments are copied from virtual regs to
+/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: sret.
+SDValue
+SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg>
+ &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Offset to first argument stack slot.
+ const unsigned FirstArgOffset = 160;
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+ getPointerTy(), true));
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ SDValue Arg = Outs[i].Val;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+    // Arguments that can be passed in registers must be kept in the
+    // RegsToPass vector.
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ if (StackPtr.getNode() == 0)
+ StackPtr =
+ DAG.getCopyFromReg(Chain, dl,
+ (RegInfo->hasFP(MF) ?
+ SystemZ::R11D : SystemZ::R15D),
+ getPointerTy());
+
+ unsigned Offset = FirstArgOffset + VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ StackPtr,
+ DAG.getIntPtrConstant(Offset));
+
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), Offset));
+ }
+ }
+
+ // Transform all store nodes into one single node because all store nodes are
+ // independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers. The InFlag is
+  // necessary since all emitted instructions must be stuck together.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy());
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(SystemZISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, getPointerTy(), true),
+ DAG.getConstant(0, getPointerTy(), true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+ DAG, InVals);
+}
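The FirstArgOffset constant above reflects the zSeries ELF ABI, which reserves a 160-byte register save area at the bottom of each stack frame; outgoing stack arguments begin just past it. A sketch of the store-address computation used in the loop, factored into a hypothetical helper (the name is illustrative, not from this patch):

static SDValue OutgoingArgAddr(SelectionDAG &DAG, DebugLoc dl, EVT PtrVT,
                               SDValue StackPtr, unsigned LocMemOffset) {
  // 160 = size of the zSeries ABI register save area skipped before the
  // first stack argument.
  const unsigned FirstArgOffset = 160;
  return DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                     DAG.getIntPtrConstant(FirstArgOffset + LocMemOffset));
}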
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+SystemZTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
+
+ CCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ VA.getLocVT(), InFlag).getValue(1);
+ SDValue RetValue = Chain.getValue(0);
+ InFlag = Chain.getValue(2);
+
+ // If this is an 8/16/32-bit value, it is really passed promoted to 64
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ RetValue = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), RetValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ RetValue = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), RetValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ RetValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), RetValue);
+
+ InVals.push_back(RetValue);
+ }
+
+ return Chain;
+}
+
+
+SDValue
+SystemZTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
+ // CCValAssign - represent the assignment of the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDValue Flag;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ SDValue ResValue = Outs[i].Val;
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // If this is an 8/16/32-bit value, it really should be passed promoted
+    // to 64 bits.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ResValue = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ResValue);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ResValue = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ResValue);
+ else if (VA.getLocInfo() == CCValAssign::AExt)
+ ResValue = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ResValue);
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ResValue, Flag);
+
+    // Guarantee that all emitted copies are stuck together by chaining them
+    // through the flag value, so the scheduler cannot reorder them.
+ Flag = Chain.getValue(1);
+ }
+
+ if (Flag.getNode())
+ return DAG.getNode(SystemZISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+
+ // Return Void
+ return DAG.getNode(SystemZISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+SDValue SystemZTargetLowering::EmitCmp(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue &SystemZCC,
+ SelectionDAG &DAG) {
+ // FIXME: Emit a test if RHS is zero
+
+ bool isUnsigned = false;
+ SystemZCC::CondCodes TCC;
+ switch (CC) {
+ default:
+ llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ TCC = SystemZCC::E;
+ break;
+ case ISD::SETUEQ:
+ TCC = SystemZCC::NLH;
+ break;
+ case ISD::SETNE:
+ case ISD::SETONE:
+ TCC = SystemZCC::NE;
+ break;
+ case ISD::SETUNE:
+ TCC = SystemZCC::LH;
+ break;
+ case ISD::SETO:
+ TCC = SystemZCC::O;
+ break;
+ case ISD::SETUO:
+ TCC = SystemZCC::NO;
+ break;
+ case ISD::SETULE:
+ if (LHS.getValueType().isFloatingPoint()) {
+ TCC = SystemZCC::NH;
+ break;
+ }
+ isUnsigned = true; // FALLTHROUGH
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ TCC = SystemZCC::LE;
+ break;
+ case ISD::SETUGE:
+ if (LHS.getValueType().isFloatingPoint()) {
+ TCC = SystemZCC::NL;
+ break;
+ }
+ isUnsigned = true; // FALLTHROUGH
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ TCC = SystemZCC::HE;
+ break;
+ case ISD::SETUGT:
+ if (LHS.getValueType().isFloatingPoint()) {
+ TCC = SystemZCC::NLE;
+ break;
+ }
+ isUnsigned = true; // FALLTHROUGH
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ TCC = SystemZCC::H;
+ break;
+ case ISD::SETULT:
+ if (LHS.getValueType().isFloatingPoint()) {
+ TCC = SystemZCC::NHE;
+ break;
+ }
+ isUnsigned = true; // FALLTHROUGH
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ TCC = SystemZCC::L;
+ break;
+ }
+
+ SystemZCC = DAG.getConstant(TCC, MVT::i32);
+
+ DebugLoc dl = LHS.getDebugLoc();
+ return DAG.getNode((isUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
+ dl, MVT::Flag, LHS, RHS);
+}
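A usage sketch of EmitCmp, assuming it is called from a lowering routine with i64 operands LHS and RHS in scope. It shows how the unsigned mapping above plays out; the snippet's surrounding context is assumed, not part of this patch:

SDValue CCOp;  // receives the SystemZCC::CondCodes constant
SDValue Flag = EmitCmp(LHS, RHS, ISD::SETULT, CCOp, DAG);
// Integer SETULT takes the isUnsigned path above: the emitted node is
// SystemZISD::UCMP and CCOp holds SystemZCC::L.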
+
+
+SDValue SystemZTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue SystemZCC;
+ SDValue Flag = EmitCmp(LHS, RHS, CC, SystemZCC, DAG);
+ return DAG.getNode(SystemZISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, SystemZCC, Flag);
+}
+
+SDValue SystemZTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TrueV = Op.getOperand(2);
+ SDValue FalseV = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue SystemZCC;
+ SDValue Flag = EmitCmp(LHS, RHS, CC, SystemZCC, DAG);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(TrueV);
+ Ops.push_back(FalseV);
+ Ops.push_back(SystemZCC);
+ Ops.push_back(Flag);
+
+ return DAG.getNode(SystemZISD::SELECT, dl, VTs, &Ops[0], Ops.size());
+}
+
+SDValue SystemZTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+ bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ bool ExtraLoadRequired =
+ Subtarget.GVRequiresExtraLoad(GV, getTargetMachine(), false);
+
+ SDValue Result;
+ if (!IsPic && !ExtraLoadRequired) {
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ Offset = 0;
+ } else {
+ unsigned char OpFlags = 0;
+ if (ExtraLoadRequired)
+ OpFlags = SystemZII::MO_GOTENT;
+
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+ }
+
+ Result = DAG.getNode(SystemZISD::PCRelativeWrapper, dl,
+ getPointerTy(), Result);
+
+ if (ExtraLoadRequired)
+ Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ PseudoSourceValue::getGOT(), 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+ DAG.getConstant(Offset, getPointerTy()));
+
+ return Result;
+}
+
+// FIXME: PIC here
+SDValue SystemZTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+
+ return DAG.getNode(SystemZISD::PCRelativeWrapper, dl, getPointerTy(), Result);
+}
+
+
+// FIXME: PIC here
+// FIXME: This is just a dirty hack. We need to lower cpool properly.
+SDValue SystemZTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment(),
+ CP->getOffset());
+
+ return DAG.getNode(SystemZISD::PCRelativeWrapper, dl, getPointerTy(), Result);
+}
+
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ case SystemZISD::RET_FLAG: return "SystemZISD::RET_FLAG";
+ case SystemZISD::CALL: return "SystemZISD::CALL";
+ case SystemZISD::BRCOND: return "SystemZISD::BRCOND";
+ case SystemZISD::CMP: return "SystemZISD::CMP";
+ case SystemZISD::UCMP: return "SystemZISD::UCMP";
+ case SystemZISD::SELECT: return "SystemZISD::SELECT";
+ case SystemZISD::PCRelativeWrapper: return "SystemZISD::PCRelativeWrapper";
+ default: return NULL;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock*
+SystemZTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+ const SystemZInstrInfo &TII = *TM.getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ assert((MI->getOpcode() == SystemZ::Select32 ||
+ MI->getOpcode() == SystemZ::SelectF32 ||
+ MI->getOpcode() == SystemZ::Select64 ||
+ MI->getOpcode() == SystemZ::SelectF64) &&
+ "Unexpected instr type to insert");
+
+ // To "insert" a SELECT instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator I = BB;
+ ++I;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // jCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ SystemZCC::CondCodes CC = (SystemZCC::CondCodes)MI->getOperand(3).getImm();
+ BuildMI(BB, dl, TII.getBrCond(CC)).addMBB(copy1MBB);
+ F->insert(I, copy0MBB);
+ F->insert(I, copy1MBB);
+ // Inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ EM->insert(std::make_pair(*SI, copy1MBB));
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ copy1MBB->transferSuccessors(BB);
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(copy1MBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to copy1MBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(copy1MBB);
+
+ // copy1MBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = copy1MBB;
+ BuildMI(BB, dl, TII.get(SystemZ::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 0000000..c2c24bc
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,141 @@
+//==-- SystemZISelLowering.h - SystemZ DAG Lowering Interface ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_ISELLOWERING_H
+#define LLVM_TARGET_SystemZ_ISELLOWERING_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace SystemZISD {
+ enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ /// CALL - These operations represent an abstract call
+ /// instruction, which includes a bunch of information.
+ CALL,
+
+ /// PCRelativeWrapper - PC relative address
+ PCRelativeWrapper,
+
+ /// CMP, UCMP - Compare instruction
+ CMP,
+ UCMP,
+
+    /// BRCOND - Conditional branch. Operand 0 is the chain operand, operand 1
+    /// is the block to branch to if the condition is true, operand 2 is the
+    /// condition code and operand 3 is the flag operand produced by a CMP
+    /// instruction.
+ BRCOND,
+
+ /// SELECT - Operands 0 and 1 are selection variables, operand 2 is
+ /// condition code and operand 3 is the flag operand.
+ SELECT
+ };
+ }
+
+ class SystemZSubtarget;
+ class SystemZTargetMachine;
+
+ class SystemZTargetLowering : public TargetLowering {
+ public:
+ explicit SystemZTargetLowering(SystemZTargetMachine &TM);
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ virtual unsigned getFunctionAlignment(const Function *F) const {
+ return 1;
+ }
+
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+ TargetLowering::ConstraintType
+ getConstraintType(const std::string &Constraint) const;
+
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+
+ SDValue EmitCmp(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue &SystemZCC,
+ SelectionDAG &DAG);
+
+
+ MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+ private:
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ SDValue LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
+ const SystemZSubtarget &Subtarget;
+ const SystemZTargetMachine &TM;
+ const SystemZRegisterInfo *RegInfo;
+ };
+} // namespace llvm
+
+#endif // LLVM_TARGET_SystemZ_ISELLOWERING_H
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 0000000..b69d2f6
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,128 @@
+//===- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Displacement, Index.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZINSTRBUILDER_H
+#define SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// SystemZAddressMode - This struct holds a generalized full SystemZ address
+/// mode. The base register can be a frame index, which will eventually be
+/// replaced with R15 or R11, with Disp adjusted accordingly.
+struct SystemZAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned IndexReg;
+ int32_t Disp;
+ GlobalValue *GV;
+
+ SystemZAddressMode() : BaseType(RegBase), IndexReg(0), Disp(0) {
+ Base.Reg = 0;
+ }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no index or displacement.
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with 3
+ // values, this adds: Reg, [0, NoReg] to the instruction.
+ return MIB.addReg(Reg).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no index register, but with a
+/// displacement. An example is: 10(%r15).
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &
+addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1, unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(0)
+ .addReg(Reg2, getKillRegState(isKill2));
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB, const SystemZAddressMode &AM) {
+ if (AM.BaseType == SystemZAddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else if (AM.BaseType == SystemZAddressMode::FrameIndexBase)
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ else
+ assert(0);
+
+ return MIB.addImm(AM.Disp).addReg(AM.IndexReg);
+}
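A usage sketch combining BuildMI with the helper above: storing a 64-bit register through a frame-index base with a 16-byte displacement. MBB, MI, DL, TII, FI and SrcReg are assumed to exist in the caller; the snippet is illustrative, not part of this patch:

SystemZAddressMode AM;
AM.BaseType = SystemZAddressMode::FrameIndexBase;
AM.Base.FrameIndex = FI;  // later resolved to R15/R11 plus an adjusted Disp
AM.Disp = 16;
addFullAddress(BuildMI(MBB, MI, DL, TII.get(SystemZ::MOV64mr)), AM)
    .addReg(SrcReg, getKillRegState(true));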
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference uses the frame index as its base register until it is resolved.
+/// A constant offset can be specified as well.
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned Flags = 0;
+ if (TID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (TID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ Flags, Offset,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
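For the common frame-slot case this reduces to a single call, which is how storeRegToStackSlot in SystemZInstrInfo.cpp (later in this patch) uses it. A sketch with assumed caller names:

// Spill SrcReg to frame slot FI; MBB, MI, DL and TII come from the caller.
addFrameReference(BuildMI(MBB, MI, DL, TII.get(SystemZ::MOV64mr)), FI)
    .addReg(SrcReg, getKillRegState(true));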
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 0000000..8a202d4
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,340 @@
+//===- SystemZInstrFP.td - SystemZ FP Instruction defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the SystemZ (binary) floating point instructions in
+// TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: multiclassify!
+
+//===----------------------------------------------------------------------===//
+// FP Pattern fragments
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+let usesCustomDAGSchedInserter = 1 in {
+ def SelectF32 : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, i8imm:$cc),
+ "# SelectF32 PSEUDO",
+ [(set FP32:$dst,
+ (SystemZselect FP32:$src1, FP32:$src2, imm:$cc))]>;
+ def SelectF64 : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, i8imm:$cc),
+ "# SelectF64 PSEUDO",
+ [(set FP64:$dst,
+ (SystemZselect FP64:$src1, FP64:$src2, imm:$cc))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// Floating point constant loads.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def LD_Fp032 : Pseudo<(outs FP32:$dst), (ins),
+ "lzer\t{$dst}",
+ [(set FP32:$dst, fpimm0)]>;
+def LD_Fp064 : Pseudo<(outs FP64:$dst), (ins),
+ "lzdr\t{$dst}",
+ [(set FP64:$dst, fpimm0)]>;
+}
+
+let neverHasSideEffects = 1 in {
+def FMOV32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+ "ler\t{$dst, $src}",
+ []>;
+def FMOV64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+ "ldr\t{$dst, $src}",
+ []>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def FMOV32rm : Pseudo<(outs FP32:$dst), (ins rriaddr12:$src),
+ "le\t{$dst, $src}",
+ [(set FP32:$dst, (load rriaddr12:$src))]>;
+def FMOV32rmy : Pseudo<(outs FP32:$dst), (ins rriaddr:$src),
+ "ley\t{$dst, $src}",
+ [(set FP32:$dst, (load rriaddr:$src))]>;
+def FMOV64rm : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+ "ld\t{$dst, $src}",
+ [(set FP64:$dst, (load rriaddr12:$src))]>;
+def FMOV64rmy : Pseudo<(outs FP64:$dst), (ins rriaddr:$src),
+ "ldy\t{$dst, $src}",
+ [(set FP64:$dst, (load rriaddr:$src))]>;
+}
+
+def FMOV32mr : Pseudo<(outs), (ins rriaddr12:$dst, FP32:$src),
+ "ste\t{$src, $dst}",
+ [(store FP32:$src, rriaddr12:$dst)]>;
+def FMOV32mry : Pseudo<(outs), (ins rriaddr:$dst, FP32:$src),
+ "stey\t{$src, $dst}",
+ [(store FP32:$src, rriaddr:$dst)]>;
+def FMOV64mr : Pseudo<(outs), (ins rriaddr12:$dst, FP64:$src),
+ "std\t{$src, $dst}",
+ [(store FP64:$src, rriaddr12:$dst)]>;
+def FMOV64mry : Pseudo<(outs), (ins rriaddr:$dst, FP64:$src),
+ "stdy\t{$src, $dst}",
+ [(store FP64:$src, rriaddr:$dst)]>;
+
+def FCOPYSIGN32 : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+ "cpsdr\t{$dst, $src2, $src1}",
+ [(set FP32:$dst, (fcopysign FP32:$src1, FP32:$src2))]>;
+def FCOPYSIGN64 : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+ "cpsdr\t{$dst, $src2, $src1}",
+ [(set FP64:$dst, (fcopysign FP64:$src1, FP64:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+
+let Defs = [PSW] in {
+def FNEG32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+ "lcebr\t{$dst, $src}",
+ [(set FP32:$dst, (fneg FP32:$src)),
+ (implicit PSW)]>;
+def FNEG64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+ "lcdbr\t{$dst, $src}",
+ [(set FP64:$dst, (fneg FP64:$src)),
+ (implicit PSW)]>;
+
+def FABS32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+ "lpebr\t{$dst, $src}",
+ [(set FP32:$dst, (fabs FP32:$src)),
+ (implicit PSW)]>;
+def FABS64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+ "lpdbr\t{$dst, $src}",
+ [(set FP64:$dst, (fabs FP64:$src)),
+ (implicit PSW)]>;
+
+def FNABS32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+ "lnebr\t{$dst, $src}",
+ [(set FP32:$dst, (fneg(fabs FP32:$src))),
+ (implicit PSW)]>;
+def FNABS64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+ "lndbr\t{$dst, $src}",
+ [(set FP64:$dst, (fneg(fabs FP64:$src))),
+ (implicit PSW)]>;
+}
+
+let isTwoAddress = 1 in {
+let Defs = [PSW] in {
+let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y
+def FADD32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+ "aebr\t{$dst, $src2}",
+ [(set FP32:$dst, (fadd FP32:$src1, FP32:$src2)),
+ (implicit PSW)]>;
+def FADD64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+ "adbr\t{$dst, $src2}",
+ [(set FP64:$dst, (fadd FP64:$src1, FP64:$src2)),
+ (implicit PSW)]>;
+}
+
+def FADD32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+ "aeb\t{$dst, $src2}",
+ [(set FP32:$dst, (fadd FP32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def FADD64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+ "adb\t{$dst, $src2}",
+ [(set FP64:$dst, (fadd FP64:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+
+def FSUB32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+ "sebr\t{$dst, $src2}",
+ [(set FP32:$dst, (fsub FP32:$src1, FP32:$src2)),
+ (implicit PSW)]>;
+def FSUB64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+ "sdbr\t{$dst, $src2}",
+ [(set FP64:$dst, (fsub FP64:$src1, FP64:$src2)),
+ (implicit PSW)]>;
+
+def FSUB32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+ "seb\t{$dst, $src2}",
+ [(set FP32:$dst, (fsub FP32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def FSUB64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+ "sdb\t{$dst, $src2}",
+ [(set FP64:$dst, (fsub FP64:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+} // Defs = [PSW]
+
+let isCommutable = 1 in { // X = MUL Y, Z == X = MUL Z, Y
+def FMUL32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+ "meebr\t{$dst, $src2}",
+ [(set FP32:$dst, (fmul FP32:$src1, FP32:$src2))]>;
+def FMUL64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+ "mdbr\t{$dst, $src2}",
+ [(set FP64:$dst, (fmul FP64:$src1, FP64:$src2))]>;
+}
+
+def FMUL32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+ "meeb\t{$dst, $src2}",
+ [(set FP32:$dst, (fmul FP32:$src1, (load rriaddr12:$src2)))]>;
+def FMUL64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+ "mdb\t{$dst, $src2}",
+ [(set FP64:$dst, (fmul FP64:$src1, (load rriaddr12:$src2)))]>;
+
+def FMADD32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, FP32:$src3),
+ "maebr\t{$dst, $src3, $src2}",
+ [(set FP32:$dst, (fadd (fmul FP32:$src2, FP32:$src3),
+ FP32:$src1))]>;
+def FMADD32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2, FP32:$src3),
+ "maeb\t{$dst, $src3, $src2}",
+ [(set FP32:$dst, (fadd (fmul (load rriaddr12:$src2),
+ FP32:$src3),
+ FP32:$src1))]>;
+
+def FMADD64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, FP64:$src3),
+ "madbr\t{$dst, $src3, $src2}",
+ [(set FP64:$dst, (fadd (fmul FP64:$src2, FP64:$src3),
+ FP64:$src1))]>;
+def FMADD64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2, FP64:$src3),
+ "madb\t{$dst, $src3, $src2}",
+ [(set FP64:$dst, (fadd (fmul (load rriaddr12:$src2),
+ FP64:$src3),
+ FP64:$src1))]>;
+
+def FMSUB32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, FP32:$src3),
+ "msebr\t{$dst, $src3, $src2}",
+ [(set FP32:$dst, (fsub (fmul FP32:$src2, FP32:$src3),
+ FP32:$src1))]>;
+def FMSUB32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2, FP32:$src3),
+ "mseb\t{$dst, $src3, $src2}",
+ [(set FP32:$dst, (fsub (fmul (load rriaddr12:$src2),
+ FP32:$src3),
+ FP32:$src1))]>;
+
+def FMSUB64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, FP64:$src3),
+ "msdbr\t{$dst, $src3, $src2}",
+ [(set FP64:$dst, (fsub (fmul FP64:$src2, FP64:$src3),
+ FP64:$src1))]>;
+def FMSUB64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2, FP64:$src3),
+ "msdb\t{$dst, $src3, $src2}",
+ [(set FP64:$dst, (fsub (fmul (load rriaddr12:$src2),
+ FP64:$src3),
+ FP64:$src1))]>;
+
+def FDIV32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+ "debr\t{$dst, $src2}",
+ [(set FP32:$dst, (fdiv FP32:$src1, FP32:$src2))]>;
+def FDIV64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+ "ddbr\t{$dst, $src2}",
+ [(set FP64:$dst, (fdiv FP64:$src1, FP64:$src2))]>;
+
+def FDIV32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+ "deb\t{$dst, $src2}",
+ [(set FP32:$dst, (fdiv FP32:$src1, (load rriaddr12:$src2)))]>;
+def FDIV64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+ "ddb\t{$dst, $src2}",
+ [(set FP64:$dst, (fdiv FP64:$src1, (load rriaddr12:$src2)))]>;
+
+} // isTwoAddress = 1
+
+def FSQRT32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+ "sqebr\t{$dst, $src}",
+ [(set FP32:$dst, (fsqrt FP32:$src))]>;
+def FSQRT64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+ "sqdbr\t{$dst, $src}",
+ [(set FP64:$dst, (fsqrt FP64:$src))]>;
+
+def FSQRT32rm : Pseudo<(outs FP32:$dst), (ins rriaddr12:$src),
+ "sqeb\t{$dst, $src}",
+ [(set FP32:$dst, (fsqrt (load rriaddr12:$src)))]>;
+def FSQRT64rm : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+ "sqdb\t{$dst, $src}",
+ [(set FP64:$dst, (fsqrt (load rriaddr12:$src)))]>;
+
+def FROUND64r32 : Pseudo<(outs FP32:$dst), (ins FP64:$src),
+ "ledbr\t{$dst, $src}",
+ [(set FP32:$dst, (fround FP64:$src))]>;
+
+def FEXT32r64 : Pseudo<(outs FP64:$dst), (ins FP32:$src),
+ "ldebr\t{$dst, $src}",
+ [(set FP64:$dst, (fextend FP32:$src))]>;
+def FEXT32m64 : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+ "ldeb\t{$dst, $src}",
+ [(set FP64:$dst, (fextend (load rriaddr12:$src)))]>;
+
+let Defs = [PSW] in {
+def FCONVFP32 : Pseudo<(outs FP32:$dst), (ins GR32:$src),
+ "cefbr\t{$dst, $src}",
+ [(set FP32:$dst, (sint_to_fp GR32:$src)),
+ (implicit PSW)]>;
+def FCONVFP32r64: Pseudo<(outs FP32:$dst), (ins GR64:$src),
+ "cegbr\t{$dst, $src}",
+ [(set FP32:$dst, (sint_to_fp GR64:$src)),
+ (implicit PSW)]>;
+
+def FCONVFP64r32: Pseudo<(outs FP64:$dst), (ins GR32:$src),
+ "cdfbr\t{$dst, $src}",
+ [(set FP64:$dst, (sint_to_fp GR32:$src)),
+ (implicit PSW)]>;
+def FCONVFP64 : Pseudo<(outs FP64:$dst), (ins GR64:$src),
+ "cdgbr\t{$dst, $src}",
+ [(set FP64:$dst, (sint_to_fp GR64:$src)),
+ (implicit PSW)]>;
+
+def FCONVGR32 : Pseudo<(outs GR32:$dst), (ins FP32:$src),
+ "cfebr\t{$dst, 5, $src}",
+ [(set GR32:$dst, (fp_to_sint FP32:$src)),
+ (implicit PSW)]>;
+def FCONVGR32r64: Pseudo<(outs GR32:$dst), (ins FP64:$src),
+ "cfdbr\t{$dst, 5, $src}",
+ [(set GR32:$dst, (fp_to_sint FP64:$src)),
+ (implicit PSW)]>;
+
+def FCONVGR64r32: Pseudo<(outs GR64:$dst), (ins FP32:$src),
+ "cgebr\t{$dst, 5, $src}",
+ [(set GR64:$dst, (fp_to_sint FP32:$src)),
+ (implicit PSW)]>;
+def FCONVGR64 : Pseudo<(outs GR64:$dst), (ins FP64:$src),
+ "cgdbr\t{$dst, 5, $src}",
+ [(set GR64:$dst, (fp_to_sint FP64:$src)),
+ (implicit PSW)]>;
+} // Defs = [PSW]
+
+def FBCONVG64 : Pseudo<(outs GR64:$dst), (ins FP64:$src),
+ "lgdr\t{$dst, $src}",
+ [(set GR64:$dst, (bitconvert FP64:$src))]>;
+def FBCONVF64 : Pseudo<(outs FP64:$dst), (ins GR64:$src),
+ "ldgr\t{$dst, $src}",
+ [(set FP64:$dst, (bitconvert GR64:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Compare instructions (set PSW, but do not produce any other result)
+
+// Floating point comparisons
+let Defs = [PSW] in {
+def FCMP32rr : Pseudo<(outs), (ins FP32:$src1, FP32:$src2),
+ "cebr\t$src1, $src2",
+ [(SystemZcmp FP32:$src1, FP32:$src2), (implicit PSW)]>;
+def FCMP64rr : Pseudo<(outs), (ins FP64:$src1, FP64:$src2),
+ "cdbr\t$src1, $src2",
+ [(SystemZcmp FP64:$src1, FP64:$src2), (implicit PSW)]>;
+
+def FCMP32rm : Pseudo<(outs), (ins FP32:$src1, rriaddr12:$src2),
+ "ceb\t$src1, $src2",
+ [(SystemZcmp FP32:$src1, (load rriaddr12:$src2)),
+ (implicit PSW)]>;
+def FCMP64rm : Pseudo<(outs), (ins FP64:$src1, rriaddr12:$src2),
+ "cdb\t$src1, $src2",
+ [(SystemZcmp FP64:$src1, (load rriaddr12:$src2)),
+ (implicit PSW)]>;
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Floating point constant -0.0
+def : Pat<(f32 fpimmneg0), (FNEG32rr (LD_Fp032))>;
+def : Pat<(f64 fpimmneg0), (FNEG64rr (LD_Fp064))>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 0000000..b4a8993
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,133 @@
+//===- SystemZInstrFormats.td - SystemZ Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<5> val> {
+ bits<5> Value = val;
+}
+
+def Pseudo : Format<0>;
+def EForm : Format<1>;
+def IForm : Format<2>;
+def RIForm : Format<3>;
+def RIEForm : Format<4>;
+def RILForm : Format<5>;
+def RISForm : Format<6>;
+def RRForm : Format<7>;
+def RREForm : Format<8>;
+def RRFForm : Format<9>;
+def RRRForm : Format<10>;
+def RRSForm : Format<11>;
+def RSForm : Format<12>;
+def RSIForm : Format<13>;
+def RSILForm : Format<14>;
+def RSYForm : Format<15>;
+def RXForm : Format<16>;
+def RXEForm : Format<17>;
+def RXFForm : Format<18>;
+def RXYForm : Format<19>;
+def SForm : Format<20>;
+def SIForm : Format<21>;
+def SILForm : Format<22>;
+def SIYForm : Format<23>;
+def SSForm : Format<24>;
+def SSEForm : Format<25>;
+def SSFForm : Format<26>;
+
+class InstSystemZ<bits<16> op, Format f, dag outs, dag ins> : Instruction {
+ let Namespace = "SystemZ";
+
+ bits<16> Opcode = op;
+
+ Format Form = f;
+ bits<5> FormBits = Form.Value;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+}
+
+class I8<bits<8> op, Format f, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstSystemZ<0, f, outs, ins> {
+ let Opcode{0-7} = op;
+ let Opcode{8-15} = 0;
+
+ let Pattern = pattern;
+ let AsmString = asmstr;
+}
+
+class I12<bits<12> op, Format f, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstSystemZ<0, f, outs, ins> {
+ let Opcode{0-11} = op;
+ let Opcode{12-15} = 0;
+
+ let Pattern = pattern;
+ let AsmString = asmstr;
+}
+
+class I16<bits<16> op, Format f, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : InstSystemZ<op, f, outs, ins> {
+ let Pattern = pattern;
+ let AsmString = asmstr;
+}
+
+class RRI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I8<op, RRForm, outs, ins, asmstr, pattern>;
+
+class RII<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I12<op, RIForm, outs, ins, asmstr, pattern>;
+
+class RILI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I12<op, RILForm, outs, ins, asmstr, pattern>;
+
+class RREI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I16<op, RREForm, outs, ins, asmstr, pattern>;
+
+class RXI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I8<op, RXForm, outs, ins, asmstr, pattern> {
+ let AddedComplexity = 1;
+}
+
+class RXYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I16<op, RXYForm, outs, ins, asmstr, pattern>;
+
+class RSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I8<op, RSForm, outs, ins, asmstr, pattern> {
+ let AddedComplexity = 1;
+}
+
+class RSYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I16<op, RSYForm, outs, ins, asmstr, pattern>;
+
+class SII<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I8<op, SIForm, outs, ins, asmstr, pattern> {
+ let AddedComplexity = 1;
+}
+
+class SIYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I16<op, SIYForm, outs, ins, asmstr, pattern>;
+
+class SILI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : I16<op, SILForm, outs, ins, asmstr, pattern>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<0, Pseudo, outs, ins> {
+
+ let Pattern = pattern;
+ let AsmString = asmstr;
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 0000000..236711c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,648 @@
+//===- SystemZInstrInfo.cpp - SystemZ Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZGenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+using namespace llvm;
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
+ : TargetInstrInfoImpl(SystemZInsts, array_lengthof(SystemZInsts)),
+ RI(tm, *this), TM(tm) {
+ // Fill the spill offsets map
+ static const unsigned SpillOffsTab[][2] = {
+ { SystemZ::R2D, 0x10 },
+ { SystemZ::R3D, 0x18 },
+ { SystemZ::R4D, 0x20 },
+ { SystemZ::R5D, 0x28 },
+ { SystemZ::R6D, 0x30 },
+ { SystemZ::R7D, 0x38 },
+ { SystemZ::R8D, 0x40 },
+ { SystemZ::R9D, 0x48 },
+ { SystemZ::R10D, 0x50 },
+ { SystemZ::R11D, 0x58 },
+ { SystemZ::R12D, 0x60 },
+ { SystemZ::R13D, 0x68 },
+ { SystemZ::R14D, 0x70 },
+ { SystemZ::R15D, 0x78 }
+ };
+
+ RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+
+ for (unsigned i = 0, e = array_lengthof(SpillOffsTab); i != e; ++i)
+ RegSpillOffsets[SpillOffsTab[i][0]] = SpillOffsTab[i][1];
+}
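A quick illustration of the table just built, under the zSeries ABI layout encoded above: each callee-saved GPR has a fixed save slot relative to the entry stack pointer. The lookup below is a sketch assuming method context, not code from this patch:

// E.g. %r6 spills at offset 0x30; spillCalleeSavedRegisters below uses
// these offsets to pick the bounds of a single store/load-multiple.
unsigned Offs = RegSpillOffsets[SystemZ::R6D];  // == 0x30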
+
+/// isGVStub - Return true if the GV requires an extra load to get the
+/// real address.
+static inline bool isGVStub(GlobalValue *GV, SystemZTargetMachine &TM) {
+ return TM.getSubtarget<SystemZSubtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+void SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ unsigned Opc = 0;
+ if (RC == &SystemZ::GR32RegClass ||
+ RC == &SystemZ::ADDR32RegClass)
+ Opc = SystemZ::MOV32mr;
+ else if (RC == &SystemZ::GR64RegClass ||
+ RC == &SystemZ::ADDR64RegClass) {
+ Opc = SystemZ::MOV64mr;
+ } else if (RC == &SystemZ::FP32RegClass) {
+ Opc = SystemZ::FMOV32mr;
+ } else if (RC == &SystemZ::FP64RegClass) {
+ Opc = SystemZ::FMOV64mr;
+ } else if (RC == &SystemZ::GR64PRegClass) {
+ Opc = SystemZ::MOV64Pmr;
+ } else if (RC == &SystemZ::GR128RegClass) {
+ Opc = SystemZ::MOV128mr;
+ } else
+ llvm_unreachable("Unsupported regclass to store");
+
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ unsigned Opc = 0;
+ if (RC == &SystemZ::GR32RegClass ||
+ RC == &SystemZ::ADDR32RegClass)
+ Opc = SystemZ::MOV32rm;
+ else if (RC == &SystemZ::GR64RegClass ||
+ RC == &SystemZ::ADDR64RegClass) {
+ Opc = SystemZ::MOV64rm;
+ } else if (RC == &SystemZ::FP32RegClass) {
+ Opc = SystemZ::FMOV32rm;
+ } else if (RC == &SystemZ::FP64RegClass) {
+ Opc = SystemZ::FMOV64rm;
+ } else if (RC == &SystemZ::GR64PRegClass) {
+ Opc = SystemZ::MOV64Prm;
+ } else if (RC == &SystemZ::GR128RegClass) {
+ Opc = SystemZ::MOV128rm;
+ } else
+ llvm_unreachable("Unsupported regclass to load");
+
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+bool SystemZInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // Determine if DestRC and SrcRC have a common superclass.
+ const TargetRegisterClass *CommonRC = DestRC;
+ if (DestRC == SrcRC)
+ /* Same regclass for source and dest */;
+ else if (CommonRC->hasSuperClass(SrcRC))
+ CommonRC = SrcRC;
+ else if (!CommonRC->hasSubClass(SrcRC))
+ CommonRC = 0;
+
+ if (CommonRC) {
+ if (CommonRC == &SystemZ::GR64RegClass ||
+ CommonRC == &SystemZ::ADDR64RegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg);
+ } else if (CommonRC == &SystemZ::GR32RegClass ||
+ CommonRC == &SystemZ::ADDR32RegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg);
+ } else if (CommonRC == &SystemZ::GR64PRegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV64rrP), DestReg).addReg(SrcReg);
+ } else if (CommonRC == &SystemZ::GR128RegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV128rr), DestReg).addReg(SrcReg);
+ } else if (CommonRC == &SystemZ::FP32RegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::FMOV32rr), DestReg).addReg(SrcReg);
+ } else if (CommonRC == &SystemZ::FP64RegClass) {
+ BuildMI(MBB, I, DL, get(SystemZ::FMOV64rr), DestReg).addReg(SrcReg);
+ } else {
+ return false;
+ }
+
+ return true;
+ }
+
+ if ((SrcRC == &SystemZ::GR64RegClass &&
+ DestRC == &SystemZ::ADDR64RegClass) ||
+ (DestRC == &SystemZ::GR64RegClass &&
+ SrcRC == &SystemZ::ADDR64RegClass)) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg);
+ return true;
+ } else if ((SrcRC == &SystemZ::GR32RegClass &&
+ DestRC == &SystemZ::ADDR32RegClass) ||
+ (DestRC == &SystemZ::GR32RegClass &&
+ SrcRC == &SystemZ::ADDR32RegClass)) {
+ BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ return false;
+}
+
+bool
+SystemZInstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case SystemZ::MOV32rr:
+ case SystemZ::MOV64rr:
+ case SystemZ::MOV64rrP:
+ case SystemZ::MOV128rr:
+ case SystemZ::FMOV32rr:
+ case SystemZ::FMOV64rr:
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid register-register move instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcSubIdx = MI.getOperand(1).getSubReg();
+ DstSubIdx = MI.getOperand(0).getSubReg();
+ return true;
+ }
+}
+
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case SystemZ::MOV32rm:
+ case SystemZ::MOV32rmy:
+ case SystemZ::MOV64rm:
+ case SystemZ::MOVSX32rm8:
+ case SystemZ::MOVSX32rm16y:
+ case SystemZ::MOVSX64rm8:
+ case SystemZ::MOVSX64rm16:
+ case SystemZ::MOVSX64rm32:
+ case SystemZ::MOVZX32rm8:
+ case SystemZ::MOVZX32rm16:
+ case SystemZ::MOVZX64rm8:
+ case SystemZ::MOVZX64rm16:
+ case SystemZ::MOVZX64rm32:
+ case SystemZ::FMOV32rm:
+ case SystemZ::FMOV32rmy:
+ case SystemZ::FMOV64rm:
+ case SystemZ::FMOV64rmy:
+ case SystemZ::MOV64Prm:
+ case SystemZ::MOV64Prmy:
+ case SystemZ::MOV128rm:
+ if (MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() && MI->getOperand(3).isReg() &&
+ MI->getOperand(2).getImm() == 0 && MI->getOperand(3).getReg() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case SystemZ::MOV32mr:
+ case SystemZ::MOV32mry:
+ case SystemZ::MOV64mr:
+ case SystemZ::MOV32m8r:
+ case SystemZ::MOV32m8ry:
+ case SystemZ::MOV32m16r:
+ case SystemZ::MOV32m16ry:
+ case SystemZ::MOV64m8r:
+ case SystemZ::MOV64m8ry:
+ case SystemZ::MOV64m16r:
+ case SystemZ::MOV64m16ry:
+ case SystemZ::MOV64m32r:
+ case SystemZ::MOV64m32ry:
+ case SystemZ::FMOV32mr:
+ case SystemZ::FMOV32mry:
+ case SystemZ::FMOV64mr:
+ case SystemZ::FMOV64mry:
+ case SystemZ::MOV64Pmr:
+ case SystemZ::MOV64Pmry:
+ case SystemZ::MOV128mr:
+ if (MI->getOperand(0).isFI() &&
+ MI->getOperand(1).isImm() && MI->getOperand(2).isReg() &&
+ MI->getOperand(1).getImm() == 0 && MI->getOperand(2).getReg() == 0) {
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(3).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+bool
+SystemZInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ MachineFunction &MF = *MBB.getParent();
+ SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ unsigned CalleeFrameSize = 0;
+
+  // Scan the callee-saved registers and find the bounds of the register
+  // spill area.
+ unsigned LowReg = 0, HighReg = 0, StartOffset = -1U, EndOffset = 0;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+ if (RegClass != &SystemZ::FP64RegClass) {
+ unsigned Offset = RegSpillOffsets[Reg];
+ CalleeFrameSize += 8;
+ if (StartOffset > Offset) {
+ LowReg = Reg; StartOffset = Offset;
+ }
+ if (EndOffset < Offset) {
+ HighReg = Reg; EndOffset = RegSpillOffsets[Reg];
+ }
+ }
+ }
+
+ // Save information for epilogue inserter.
+ MFI->setCalleeSavedFrameSize(CalleeFrameSize);
+ MFI->setLowReg(LowReg); MFI->setHighReg(HighReg);
+
+ // Save GPRs
+ if (StartOffset) {
+    // Build a store instruction. Use the STORE MULTIPLE instruction if there
+    // is more than one register to store, otherwise just a single STORE.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get((LowReg == HighReg ?
+ SystemZ::MOV64mr : SystemZ::MOV64mrm)));
+
+ // Add store operands.
+ MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+ if (LowReg == HighReg)
+ MIB.addReg(0);
+ MIB.addReg(LowReg, RegState::Kill);
+ if (LowReg != HighReg)
+ MIB.addReg(HighReg, RegState::Kill);
+
+    // Do a second scan adding regs as being killed by the instruction
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ if (Reg != LowReg && Reg != HighReg)
+ MIB.addReg(Reg, RegState::ImplicitKill);
+ }
+ }
+
+ // Save FPRs
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+ if (RegClass == &SystemZ::FP64RegClass) {
+ MBB.addLiveIn(Reg);
+ storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RegClass);
+ }
+ }
+
+ return true;
+}
+
+bool
+SystemZInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ MachineFunction &MF = *MBB.getParent();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+ SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+ // Restore FP registers
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+ if (RegClass == &SystemZ::FP64RegClass)
+ loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass);
+ }
+
+ // Restore GP registers
+ unsigned LowReg = MFI->getLowReg(), HighReg = MFI->getHighReg();
+ unsigned StartOffset = RegSpillOffsets[LowReg];
+
+ if (StartOffset) {
+    // Build a load instruction. Use the LOAD MULTIPLE instruction if there
+    // is more than one register to load, otherwise just a single LOAD.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get((LowReg == HighReg ?
+ SystemZ::MOV64rm : SystemZ::MOV64rmm)));
+    // Add load operands.
+ MIB.addReg(LowReg, RegState::Define);
+ if (LowReg != HighReg)
+ MIB.addReg(HighReg, RegState::Define);
+
+ MIB.addReg((RegInfo->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D));
+ MIB.addImm(StartOffset);
+ if (LowReg == HighReg)
+ MIB.addReg(0);
+
+    // Do a second scan adding regs as being defined by the instruction
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (Reg != LowReg && Reg != HighReg)
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ }
+ }
+
+ return true;
+}
+
+bool SystemZInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+ SystemZCC::CondCodes CC = static_cast<SystemZCC::CondCodes>(Cond[0].getImm());
+ Cond[0].setImm(getOppositeCondition(CC));
+ return false;
+}
+
+bool SystemZInstrInfo::
+BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case SystemZ::RET: // Return.
+ case SystemZ::JMP: // Uncond branch.
+ case SystemZ::JMPr: // Indirect branch.
+ return true;
+ default: return false;
+ }
+}
+
+bool SystemZInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (TID.isBranch() && !TID.isBarrier())
+ return true;
+ if (!TID.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isUnpredicatedTerminator(I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!I->getDesc().isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == SystemZ::JMP) {
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (next(I) != MBB.end())
+ next(I)->eraseFromParent();
+ Cond.clear();
+ FBB = 0;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ continue;
+ }
+
+      // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ SystemZCC::CondCodes BranchCode = getCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == SystemZCC::INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+
+ SystemZCC::CondCodes OldBranchCode = (SystemZCC::CondCodes)Cond[0].getImm();
+ // If the conditions are the same, we can leave them alone.
+ if (OldBranchCode == BranchCode)
+ continue;
+
+ return true;
+ }
+
+ return false;
+}
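A usage sketch of the contract above, for a block whose terminators are "JE TrueBB; JMP FalseBB". TII and MBB are assumed from the caller; the snippet is illustrative, not part of this patch:

MachineBasicBlock *TBB = 0, *FBB = 0;
SmallVector<MachineOperand, 1> Cond;
if (!TII.AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
  // Analysis succeeded: TBB == TrueBB, FBB == FalseBB, and
  // Cond[0].getImm() == SystemZCC::E.
  // ReverseBranchCondition(Cond) would rewrite Cond[0] to SystemZCC::NE.
}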
+
+unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->getOpcode() != SystemZ::JMP &&
+ getCondFromBranchOpc(I->getOpcode()) == SystemZCC::INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME: this should probably have a DebugLoc operand
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "SystemZ branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(SystemZ::JMP)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ SystemZCC::CondCodes CC = (SystemZCC::CondCodes)Cond[0].getImm();
+ BuildMI(&MBB, dl, getBrCond(CC)).addMBB(TBB);
+ ++Count;
+
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, dl, get(SystemZ::JMP)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+const TargetInstrDesc&
+SystemZInstrInfo::getBrCond(SystemZCC::CondCodes CC) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case SystemZCC::O: return get(SystemZ::JO);
+ case SystemZCC::H: return get(SystemZ::JH);
+ case SystemZCC::NLE: return get(SystemZ::JNLE);
+ case SystemZCC::L: return get(SystemZ::JL);
+ case SystemZCC::NHE: return get(SystemZ::JNHE);
+ case SystemZCC::LH: return get(SystemZ::JLH);
+ case SystemZCC::NE: return get(SystemZ::JNE);
+ case SystemZCC::E: return get(SystemZ::JE);
+ case SystemZCC::NLH: return get(SystemZ::JNLH);
+ case SystemZCC::HE: return get(SystemZ::JHE);
+ case SystemZCC::NL: return get(SystemZ::JNL);
+ case SystemZCC::LE: return get(SystemZ::JLE);
+ case SystemZCC::NH: return get(SystemZ::JNH);
+ case SystemZCC::NO: return get(SystemZ::JNO);
+ }
+}
+
+SystemZCC::CondCodes
+SystemZInstrInfo::getCondFromBranchOpc(unsigned Opc) const {
+ switch (Opc) {
+ default: return SystemZCC::INVALID;
+ case SystemZ::JO: return SystemZCC::O;
+ case SystemZ::JH: return SystemZCC::H;
+ case SystemZ::JNLE: return SystemZCC::NLE;
+ case SystemZ::JL: return SystemZCC::L;
+ case SystemZ::JNHE: return SystemZCC::NHE;
+ case SystemZ::JLH: return SystemZCC::LH;
+ case SystemZ::JNE: return SystemZCC::NE;
+ case SystemZ::JE: return SystemZCC::E;
+ case SystemZ::JNLH: return SystemZCC::NLH;
+ case SystemZ::JHE: return SystemZCC::HE;
+ case SystemZ::JNL: return SystemZCC::NL;
+ case SystemZ::JLE: return SystemZCC::LE;
+ case SystemZ::JNH: return SystemZCC::NH;
+ case SystemZ::JNO: return SystemZCC::NO;
+ }
+}
+
+SystemZCC::CondCodes
+SystemZInstrInfo::getOppositeCondition(SystemZCC::CondCodes CC) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Invalid condition!");
+ case SystemZCC::O: return SystemZCC::NO;
+ case SystemZCC::H: return SystemZCC::NH;
+ case SystemZCC::NLE: return SystemZCC::LE;
+ case SystemZCC::L: return SystemZCC::NL;
+ case SystemZCC::NHE: return SystemZCC::HE;
+ case SystemZCC::LH: return SystemZCC::NLH;
+ case SystemZCC::NE: return SystemZCC::E;
+ case SystemZCC::E: return SystemZCC::NE;
+ case SystemZCC::NLH: return SystemZCC::LH;
+ case SystemZCC::HE: return SystemZCC::NHE;
+ case SystemZCC::NL: return SystemZCC::L;
+ case SystemZCC::LE: return SystemZCC::NLE;
+ case SystemZCC::NH: return SystemZCC::H;
+ case SystemZCC::NO: return SystemZCC::O;
+ }
+}
+
+const TargetInstrDesc&
+SystemZInstrInfo::getLongDispOpc(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Don't have long disp version of this instruction");
+ case SystemZ::MOV32mr: return get(SystemZ::MOV32mry);
+ case SystemZ::MOV32rm: return get(SystemZ::MOV32rmy);
+ case SystemZ::MOVSX32rm16: return get(SystemZ::MOVSX32rm16y);
+ case SystemZ::MOV32m8r: return get(SystemZ::MOV32m8ry);
+ case SystemZ::MOV32m16r: return get(SystemZ::MOV32m16ry);
+ case SystemZ::MOV64m8r: return get(SystemZ::MOV64m8ry);
+ case SystemZ::MOV64m16r: return get(SystemZ::MOV64m16ry);
+ case SystemZ::MOV64m32r: return get(SystemZ::MOV64m32ry);
+ case SystemZ::MOV8mi: return get(SystemZ::MOV8miy);
+ case SystemZ::MUL32rm: return get(SystemZ::MUL32rmy);
+ case SystemZ::CMP32rm: return get(SystemZ::CMP32rmy);
+ case SystemZ::UCMP32rm: return get(SystemZ::UCMP32rmy);
+ case SystemZ::FMOV32mr: return get(SystemZ::FMOV32mry);
+ case SystemZ::FMOV64mr: return get(SystemZ::FMOV64mry);
+ case SystemZ::FMOV32rm: return get(SystemZ::FMOV32rmy);
+ case SystemZ::FMOV64rm: return get(SystemZ::FMOV64rmy);
+ case SystemZ::MOV64Pmr: return get(SystemZ::MOV64Pmry);
+ case SystemZ::MOV64Prm: return get(SystemZ::MOV64Prmy);
+ }
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 0000000..e16d704
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,119 @@
+//===- SystemZInstrInfo.h - SystemZ Instruction Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SYSTEMZINSTRINFO_H
+#define LLVM_TARGET_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+/// SystemZII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SystemZII {
+ enum {
+ //===------------------------------------------------------------------===//
+ // SystemZ Specific MachineOperand flags.
+
+ MO_NO_FLAG = 0,
+
+ /// MO_GOTENT - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// SYMBOL_LABEL @GOTENT
+ MO_GOTENT = 1,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is the
+ /// offset to the PLT entry of the symbol name from the current code location.
+ ///
+ /// SYMBOL_LABEL @PLT
+ MO_PLT = 2
+ };
+}
+
+class SystemZInstrInfo : public TargetInstrInfoImpl {
+ const SystemZRegisterInfo RI;
+ SystemZTargetMachine &TM;
+ IndexedMap<unsigned> RegSpillOffsets;
+public:
+ explicit SystemZInstrInfo(SystemZTargetMachine &TM);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+ bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+
+ bool isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC) const;
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+ SystemZCC::CondCodes getOppositeCondition(SystemZCC::CondCodes CC) const;
+ SystemZCC::CondCodes getCondFromBranchOpc(unsigned Opc) const;
+ const TargetInstrDesc& getBrCond(SystemZCC::CondCodes CC) const;
+ const TargetInstrDesc& getLongDispOpc(unsigned Opc) const;
+
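+ /// getMemoryInstr - Return the descriptor for the variant of Opc that can
+ /// encode the given displacement: the base forms take an unsigned 12-bit
+ /// displacement (0..4095), while anything outside that range requires the
+ /// long-displacement form with its signed 20-bit field.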
+ const TargetInstrDesc& getMemoryInstr(unsigned Opc, int64_t Offset = 0) const {
+ if (Offset < 0 || Offset >= 4096)
+ return getLongDispOpc(Opc);
+ else
+ return get(Opc);
+ }
+};
+
+}
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 0000000..56d75dd
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1155 @@
+//===- SystemZInstrInfo.td - SystemZ Instruction defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the SystemZ instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SystemZ Instruction Predicate Definitions.
+def IsZ10 : Predicate<"Subtarget.isZ10()">;
+
+include "SystemZInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+class SDTCisI32<int OpNum> : SDTCisVT<OpNum, i32>;
+class SDTCisI64<int OpNum> : SDTCisVT<OpNum, i64>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_SystemZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_SystemZCallSeqStart : SDCallSeqStart<[SDTCisI64<0>]>;
+def SDT_SystemZCallSeqEnd : SDCallSeqEnd<[SDTCisI64<0>, SDTCisI64<1>]>;
+def SDT_CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_BrCond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisI8<1>]>;
+def SDT_SelectCC : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisI8<3>]>;
+def SDT_Address : SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+//===----------------------------------------------------------------------===//
+// SystemZ Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def SystemZretflag : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInFlag]>;
+def SystemZcall : SDNode<"SystemZISD::CALL", SDT_SystemZCall,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+def SystemZcallseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_SystemZCallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def SystemZcallseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_SystemZCallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def SystemZcmp : SDNode<"SystemZISD::CMP", SDT_CmpTest, [SDNPOutFlag]>;
+def SystemZucmp : SDNode<"SystemZISD::UCMP", SDT_CmpTest, [SDNPOutFlag]>;
+def SystemZbrcond : SDNode<"SystemZISD::BRCOND", SDT_BrCond,
+ [SDNPHasChain, SDNPInFlag]>;
+def SystemZselect : SDNode<"SystemZISD::SELECT", SDT_SelectCC, [SDNPInFlag]>;
+def SystemZpcrelwrapper : SDNode<"SystemZISD::PCRelativeWrapper", SDT_Address, []>;
+
+
+include "SystemZOperands.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(SystemZcallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(SystemZcallseq_end timm:$amt1, timm:$amt2)]>;
+
+let usesCustomDAGSchedInserter = 1 in {
+ def Select32 : Pseudo<(outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cc),
+ "# Select32 PSEUDO",
+ [(set GR32:$dst,
+ (SystemZselect GR32:$src1, GR32:$src2, imm:$cc))]>;
+ def Select64 : Pseudo<(outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$cc),
+ "# Select64 PSEUDO",
+ [(set GR64:$dst,
+ (SystemZselect GR64:$src1, GR64:$src2, imm:$cc))]>;
+}
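+
+// A sketch of the expected lowering: the custom inserter replaces each Select
+// pseudo with a conditional-branch diamond that leaves either $src1 or $src2
+// in $dst, since this architecture level has no native select instruction.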
+
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+ def RET : Pseudo<(outs), (ins), "br\t%r14", [(SystemZretflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+ let isBarrier = 1 in {
+ def JMP : Pseudo<(outs), (ins brtarget:$dst), "j\t{$dst}", [(br bb:$dst)]>;
+
+ let isIndirectBranch = 1 in
+ def JMPr : Pseudo<(outs), (ins GR64:$dst), "br\t{$dst}", [(brind GR64:$dst)]>;
+ }
+
+ let Uses = [PSW] in {
+ def JO : Pseudo<(outs), (ins brtarget:$dst),
+ "jo\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_O)]>;
+ def JH : Pseudo<(outs), (ins brtarget:$dst),
+ "jh\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_H)]>;
+ def JNLE: Pseudo<(outs), (ins brtarget:$dst),
+ "jnle\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NLE)]>;
+ def JL : Pseudo<(outs), (ins brtarget:$dst),
+ "jl\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_L)]>;
+ def JNHE: Pseudo<(outs), (ins brtarget:$dst),
+ "jnhe\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NHE)]>;
+ def JLH : Pseudo<(outs), (ins brtarget:$dst),
+ "jlh\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_LH)]>;
+ def JNE : Pseudo<(outs), (ins brtarget:$dst),
+ "jne\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NE)]>;
+ def JE : Pseudo<(outs), (ins brtarget:$dst),
+ "je\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_E)]>;
+ def JNLH: Pseudo<(outs), (ins brtarget:$dst),
+ "jnlh\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NLH)]>;
+ def JHE : Pseudo<(outs), (ins brtarget:$dst),
+ "jhe\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_HE)]>;
+ def JNL : Pseudo<(outs), (ins brtarget:$dst),
+ "jnl\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NL)]>;
+ def JLE : Pseudo<(outs), (ins brtarget:$dst),
+ "jle\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_LE)]>;
+ def JNH : Pseudo<(outs), (ins brtarget:$dst),
+ "jnh\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NH)]>;
+ def JNO : Pseudo<(outs), (ins brtarget:$dst),
+ "jno\t$dst",
+ [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NO)]>;
+ } // Uses = [PSW]
+} // isBranch = 1
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. Uses for argument
+ // registers are added manually.
+ let Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
+ F0L, F1L, F2L, F3L, F4L, F5L, F6L, F7L] in {
+ def CALLi : Pseudo<(outs), (ins imm_pcrel:$dst, variable_ops),
+ "brasl\t%r14, $dst", [(SystemZcall imm:$dst)]>;
+ def CALLr : Pseudo<(outs), (ins ADDR64:$dst, variable_ops),
+ "basr\t%r14, $dst", [(SystemZcall ADDR64:$dst)]>;
+ }
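+
+// Note: this clobber list mirrors the volatile registers of the s390x ELF
+// ABI: r0-r5 plus r14 (overwritten with the return address by brasl/basr)
+// and f0-f7; r6-r13, r15 and f8-f15 are callee-saved.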
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+let isReMaterializable = 1 in
+// FIXME: Provide imm12 variant
+// FIXME: Address should be halfword aligned...
+def LA64r : RXI<0x47,
+ (outs GR64:$dst), (ins laaddr:$src),
+ "lay\t{$dst, $src}",
+ [(set GR64:$dst, laaddr:$src)]>;
+def LA64rm : RXYI<0x71E3,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "larl\t{$dst, $src}",
+ [(set GR64:$dst,
+ (SystemZpcrelwrapper tglobaladdr:$src))]>;
+
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "# no-op", []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+let neverHasSideEffects = 1 in {
+def MOV32rr : RRI<0x18,
+ (outs GR32:$dst), (ins GR32:$src),
+ "lr\t{$dst, $src}",
+ []>;
+def MOV64rr : RREI<0xB904,
+ (outs GR64:$dst), (ins GR64:$src),
+ "lgr\t{$dst, $src}",
+ []>;
+def MOV128rr : Pseudo<(outs GR128:$dst), (ins GR128:$src),
+ "# MOV128 PSEUDO!\n"
+ "\tlgr\t${dst:subreg_odd}, ${src:subreg_odd}\n"
+ "\tlgr\t${dst:subreg_even}, ${src:subreg_even}",
+ []>;
+def MOV64rrP : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
+ "# MOV64P PSEUDO!\n"
+ "\tlr\t${dst:subreg_odd}, ${src:subreg_odd}\n"
+ "\tlr\t${dst:subreg_even}, ${src:subreg_even}",
+ []>;
+}
+
+def MOVSX64rr32 : RREI<0xB914,
+ (outs GR64:$dst), (ins GR32:$src),
+ "lgfr\t{$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVZX64rr32 : RREI<0xB916,
+ (outs GR64:$dst), (ins GR32:$src),
+ "llgfr\t{$dst, $src}",
+ [(set GR64:$dst, (zext GR32:$src))]>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV32ri16 : RII<0x8A7,
+ (outs GR32:$dst), (ins s16imm:$src),
+ "lhi\t{$dst, $src}",
+ [(set GR32:$dst, immSExt16:$src)]>;
+def MOV64ri16 : RII<0x9A7,
+ (outs GR64:$dst), (ins s16imm64:$src),
+ "lghi\t{$dst, $src}",
+ [(set GR64:$dst, immSExt16:$src)]>;
+
+def MOV64rill16 : RII<0xFA5,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "llill\t{$dst, $src}",
+ [(set GR64:$dst, i64ll16:$src)]>;
+def MOV64rilh16 : RII<0xEA5,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "llilh\t{$dst, $src}",
+ [(set GR64:$dst, i64lh16:$src)]>;
+def MOV64rihl16 : RII<0xDA5,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "llihl\t{$dst, $src}",
+ [(set GR64:$dst, i64hl16:$src)]>;
+def MOV64rihh16 : RII<0xCA5,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "llihh\t{$dst, $src}",
+ [(set GR64:$dst, i64hh16:$src)]>;
+
+def MOV64ri32 : RILI<0x1C0,
+ (outs GR64:$dst), (ins s32imm64:$src),
+ "lgfi\t{$dst, $src}",
+ [(set GR64:$dst, immSExt32:$src)]>;
+def MOV64rilo32 : RILI<0xFC0,
+ (outs GR64:$dst), (ins i64imm:$src),
+ "llilf\t{$dst, $src}",
+ [(set GR64:$dst, i64lo32:$src)]>;
+def MOV64rihi32 : RILI<0xEC0, (outs GR64:$dst), (ins i64imm:$src),
+ "llihf\t{$dst, $src}",
+ [(set GR64:$dst, i64hi32:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV32rm : RXI<0x58,
+ (outs GR32:$dst), (ins rriaddr12:$src),
+ "l\t{$dst, $src}",
+ [(set GR32:$dst, (load rriaddr12:$src))]>;
+def MOV32rmy : RXYI<0x58E3,
+ (outs GR32:$dst), (ins rriaddr:$src),
+ "ly\t{$dst, $src}",
+ [(set GR32:$dst, (load rriaddr:$src))]>;
+def MOV64rm : RXYI<0x04E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "lg\t{$dst, $src}",
+ [(set GR64:$dst, (load rriaddr:$src))]>;
+def MOV64Prm : Pseudo<(outs GR64P:$dst), (ins rriaddr12:$src),
+ "# MOV64P PSEUDO!\n"
+ "\tl\t${dst:subreg_odd}, $src\n"
+ "\tl\t${dst:subreg_even}, 4+$src",
+ [(set GR64P:$dst, (load rriaddr12:$src))]>;
+def MOV64Prmy : Pseudo<(outs GR64P:$dst), (ins rriaddr:$src),
+ "# MOV64P PSEUDO!\n"
+ "\tly\t${dst:subreg_odd}, $src\n"
+ "\tly\t${dst:subreg_even}, 4+$src",
+ [(set GR64P:$dst, (load rriaddr:$src))]>;
+def MOV128rm : Pseudo<(outs GR128:$dst), (ins rriaddr:$src),
+ "# MOV128 PSEUDO!\n"
+ "\tlg\t${dst:subreg_odd}, $src\n"
+ "\tlg\t${dst:subreg_even}, 8+$src",
+ [(set GR128:$dst, (load rriaddr:$src))]>;
+}
+
+def MOV32mr : RXI<0x50,
+ (outs), (ins rriaddr12:$dst, GR32:$src),
+ "st\t{$src, $dst}",
+ [(store GR32:$src, rriaddr12:$dst)]>;
+def MOV32mry : RXYI<0x50E3,
+ (outs), (ins rriaddr:$dst, GR32:$src),
+ "sty\t{$src, $dst}",
+ [(store GR32:$src, rriaddr:$dst)]>;
+def MOV64mr : RXYI<0x24E3,
+ (outs), (ins rriaddr:$dst, GR64:$src),
+ "stg\t{$src, $dst}",
+ [(store GR64:$src, rriaddr:$dst)]>;
+def MOV64Pmr : Pseudo<(outs), (ins rriaddr12:$dst, GR64P:$src),
+ "# MOV64P PSEUDO!\n"
+ "\tst\t${src:subreg_odd}, $dst\n"
+ "\tst\t${src:subreg_even}, 4+$dst",
+ [(store GR64P:$src, rriaddr12:$dst)]>;
+def MOV64Pmry : Pseudo<(outs), (ins rriaddr:$dst, GR64P:$src),
+ "# MOV64P PSEUDO!\n"
+ "\tsty\t${src:subreg_odd}, $dst\n"
+ "\tsty\t${src:subreg_even}, 4+$dst",
+ [(store GR64P:$src, rriaddr:$dst)]>;
+def MOV128mr : Pseudo<(outs), (ins rriaddr:$dst, GR128:$src),
+ "# MOV128 PSEUDO!\n"
+ "\tstg\t${src:subreg_odd}, $dst\n"
+ "\tstg\t${src:subreg_even}, 8+$dst",
+ [(store GR128:$src, rriaddr:$dst)]>;
+
+def MOV8mi : SII<0x92,
+ (outs), (ins riaddr12:$dst, i32i8imm:$src),
+ "mvi\t{$dst, $src}",
+ [(truncstorei8 (i32 i32immSExt8:$src), riaddr12:$dst)]>;
+def MOV8miy : SIYI<0x52EB,
+ (outs), (ins riaddr:$dst, i32i8imm:$src),
+ "mviy\t{$dst, $src}",
+ [(truncstorei8 (i32 i32immSExt8:$src), riaddr:$dst)]>;
+
+let AddedComplexity = 2 in {
+def MOV16mi : SILI<0xE544,
+ (outs), (ins riaddr12:$dst, s16imm:$src),
+ "mvhhi\t{$dst, $src}",
+ [(truncstorei16 (i32 i32immSExt16:$src), riaddr12:$dst)]>,
+ Requires<[IsZ10]>;
+def MOV32mi16 : SILI<0xE54C,
+ (outs), (ins riaddr12:$dst, s32imm:$src),
+ "mvhi\t{$dst, $src}",
+ [(store (i32 immSExt16:$src), riaddr12:$dst)]>,
+ Requires<[IsZ10]>;
+def MOV64mi16 : SILI<0xE548,
+ (outs), (ins riaddr12:$dst, s32imm64:$src),
+ "mvghi\t{$dst, $src}",
+ [(store (i64 immSExt16:$src), riaddr12:$dst)]>,
+ Requires<[IsZ10]>;
+}
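+
+// The AddedComplexity above biases instruction selection toward these
+// single-instruction immediate stores whenever the z10 predicate holds.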
+
+// sexts
+def MOVSX32rr8 : RREI<0xB926,
+ (outs GR32:$dst), (ins GR32:$src),
+ "lbr\t{$dst, $src}",
+ [(set GR32:$dst, (sext_inreg GR32:$src, i8))]>;
+def MOVSX64rr8 : RREI<0xB906,
+ (outs GR64:$dst), (ins GR64:$src),
+ "lgbr\t{$dst, $src}",
+ [(set GR64:$dst, (sext_inreg GR64:$src, i8))]>;
+def MOVSX32rr16 : RREI<0xB927,
+ (outs GR32:$dst), (ins GR32:$src),
+ "lhr\t{$dst, $src}",
+ [(set GR32:$dst, (sext_inreg GR32:$src, i16))]>;
+def MOVSX64rr16 : RREI<0xB907,
+ (outs GR64:$dst), (ins GR64:$src),
+ "lghr\t{$dst, $src}",
+ [(set GR64:$dst, (sext_inreg GR64:$src, i16))]>;
+
+// extloads
+def MOVSX32rm8 : RXYI<0x76E3,
+ (outs GR32:$dst), (ins rriaddr:$src),
+ "lb\t{$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 rriaddr:$src))]>;
+def MOVSX32rm16 : RXI<0x48,
+ (outs GR32:$dst), (ins rriaddr12:$src),
+ "lh\t{$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 rriaddr12:$src))]>;
+def MOVSX32rm16y : RXYI<0x78E3,
+ (outs GR32:$dst), (ins rriaddr:$src),
+ "lhy\t{$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 rriaddr:$src))]>;
+def MOVSX64rm8 : RXYI<0x77E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "lgb\t{$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 rriaddr:$src))]>;
+def MOVSX64rm16 : RXYI<0x15E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "lgh\t{$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 rriaddr:$src))]>;
+def MOVSX64rm32 : RXYI<0x14E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "lgf\t{$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 rriaddr:$src))]>;
+
+def MOVZX32rm8 : RXYI<0x94E3,
+ (outs GR32:$dst), (ins rriaddr:$src),
+ "llc\t{$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 rriaddr:$src))]>;
+def MOVZX32rm16 : RXYI<0x95E3,
+ (outs GR32:$dst), (ins rriaddr:$src),
+ "llh\t{$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 rriaddr:$src))]>;
+def MOVZX64rm8 : RXYI<0x90E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "llgc\t{$dst, $src}",
+ [(set GR64:$dst, (zextloadi64i8 rriaddr:$src))]>;
+def MOVZX64rm16 : RXYI<0x91E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "llgh\t{$dst, $src}",
+ [(set GR64:$dst, (zextloadi64i16 rriaddr:$src))]>;
+def MOVZX64rm32 : RXYI<0x16E3,
+ (outs GR64:$dst), (ins rriaddr:$src),
+ "llgf\t{$dst, $src}",
+ [(set GR64:$dst, (zextloadi64i32 rriaddr:$src))]>;
+
+// truncstores
+def MOV32m8r : RXI<0x42,
+ (outs), (ins rriaddr12:$dst, GR32:$src),
+ "stc\t{$src, $dst}",
+ [(truncstorei8 GR32:$src, rriaddr12:$dst)]>;
+
+def MOV32m8ry : RXYI<0x72E3,
+ (outs), (ins rriaddr:$dst, GR32:$src),
+ "stcy\t{$src, $dst}",
+ [(truncstorei8 GR32:$src, rriaddr:$dst)]>;
+
+def MOV32m16r : RXI<0x40,
+ (outs), (ins rriaddr12:$dst, GR32:$src),
+ "sth\t{$src, $dst}",
+ [(truncstorei16 GR32:$src, rriaddr12:$dst)]>;
+
+def MOV32m16ry : RXYI<0x70E3,
+ (outs), (ins rriaddr:$dst, GR32:$src),
+ "sthy\t{$src, $dst}",
+ [(truncstorei16 GR32:$src, rriaddr:$dst)]>;
+
+def MOV64m8r : RXI<0x42,
+ (outs), (ins rriaddr12:$dst, GR64:$src),
+ "stc\t{$src, $dst}",
+ [(truncstorei8 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m8ry : RXYI<0x72E3,
+ (outs), (ins rriaddr:$dst, GR64:$src),
+ "stcy\t{$src, $dst}",
+ [(truncstorei8 GR64:$src, rriaddr:$dst)]>;
+
+def MOV64m16r : RXI<0x40,
+ (outs), (ins rriaddr12:$dst, GR64:$src),
+ "sth\t{$src, $dst}",
+ [(truncstorei16 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m16ry : RXYI<0x70E3,
+ (outs), (ins rriaddr:$dst, GR64:$src),
+ "sthy\t{$src, $dst}",
+ [(truncstorei16 GR64:$src, rriaddr:$dst)]>;
+
+def MOV64m32r : RXI<0x50,
+ (outs), (ins rriaddr12:$dst, GR64:$src),
+ "st\t{$src, $dst}",
+ [(truncstorei32 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m32ry : RXYI<0x50E3,
+ (outs), (ins rriaddr:$dst, GR64:$src),
+ "sty\t{$src, $dst}",
+ [(truncstorei32 GR64:$src, rriaddr:$dst)]>;
+
+// Multiple register moves.
+// FIXME: should we use multiple arg nodes?
+def MOV32mrm : RSYI<0x90EB,
+ (outs), (ins riaddr:$dst, GR32:$from, GR32:$to),
+ "stmy\t{$from, $to, $dst}",
+ []>;
+def MOV64mrm : RSYI<0x24EB,
+ (outs), (ins riaddr:$dst, GR64:$from, GR64:$to),
+ "stmg\t{$from, $to, $dst}",
+ []>;
+def MOV32rmm : RSYI<0x90EB,
+ (outs GR32:$from, GR32:$to), (ins riaddr:$dst),
+ "lmy\t{$from, $to, $dst}",
+ []>;
+def MOV64rmm : RSYI<0x04EB,
+ (outs GR64:$from, GR64:$to), (ins riaddr:$dst),
+ "lmg\t{$from, $to, $dst}",
+ []>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isTwoAddress = 1 in {
+def MOV64Pr0_even : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
+ "lhi\t${dst:subreg_even}, 0",
+ []>;
+def MOV128r0_even : Pseudo<(outs GR128:$dst), (ins GR128:$src),
+ "lghi\t${dst:subreg_even}, 0",
+ []>;
+}
+
+// Byte swaps
+def BSWAP32rr : RREI<0xB91F,
+ (outs GR32:$dst), (ins GR32:$src),
+ "lrvr\t{$dst, $src}",
+ [(set GR32:$dst, (bswap GR32:$src))]>;
+def BSWAP64rr : RREI<0xB90F,
+ (outs GR64:$dst), (ins GR64:$src),
+ "lrvgr\t{$dst, $src}",
+ [(set GR64:$dst, (bswap GR64:$src))]>;
+
+// FIXME: this pattern is invalid for big-endian targets
+//def BSWAP16rm : RXYI<0x1FE3, (outs GR32:$dst), (ins rriaddr:$src),
+// "lrvh\t{$dst, $src}",
+// [(set GR32:$dst, (bswap (extloadi32i16 rriaddr:$src)))]>;
+def BSWAP32rm : RXYI<0x1EE3, (outs GR32:$dst), (ins rriaddr:$src),
+ "lrv\t{$dst, $src}",
+ [(set GR32:$dst, (bswap (load rriaddr:$src)))]>;
+def BSWAP64rm : RXYI<0x0FE3, (outs GR64:$dst), (ins rriaddr:$src),
+ "lrvg\t{$dst, $src}",
+ [(set GR64:$dst, (bswap (load rriaddr:$src)))]>;
+
+//def BSWAP16mr : RXYI<0xE33F, (outs), (ins rriaddr:$dst, GR32:$src),
+// "strvh\t{$src, $dst}",
+// [(truncstorei16 (bswap GR32:$src), rriaddr:$dst)]>;
+def BSWAP32mr : RXYI<0xE33E, (outs), (ins rriaddr:$dst, GR32:$src),
+ "strv\t{$src, $dst}",
+ [(store (bswap GR32:$src), rriaddr:$dst)]>;
+def BSWAP64mr : RXYI<0xE32F, (outs), (ins rriaddr:$dst, GR64:$src),
+ "strvg\t{$src, $dst}",
+ [(store (bswap GR64:$src), rriaddr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let Defs = [PSW] in {
+def NEG32rr : RRI<0x13,
+ (outs GR32:$dst), (ins GR32:$src),
+ "lcr\t{$dst, $src}",
+ [(set GR32:$dst, (ineg GR32:$src)),
+ (implicit PSW)]>;
+def NEG64rr : RREI<0xB903, (outs GR64:$dst), (ins GR64:$src),
+ "lcgr\t{$dst, $src}",
+ [(set GR64:$dst, (ineg GR64:$src)),
+ (implicit PSW)]>;
+def NEG64rr32 : RREI<0xB913, (outs GR64:$dst), (ins GR32:$src),
+ "lcgfr\t{$dst, $src}",
+ [(set GR64:$dst, (ineg (sext GR32:$src))),
+ (implicit PSW)]>;
+}
+
+let isTwoAddress = 1 in {
+
+let Defs = [PSW] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y
+def ADD32rr : RRI<0x1A, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "ar\t{$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+ (implicit PSW)]>;
+def ADD64rr : RREI<0xB908, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "agr\t{$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, GR64:$src2)),
+ (implicit PSW)]>;
+}
+
+def ADD32rm : RXI<0x5A, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "a\t{$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def ADD32rmy : RXYI<0xE35A, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "ay\t{$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+def ADD64rm : RXYI<0xE308, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "ag\t{$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+
+
+def ADD32ri16 : RII<0xA7A,
+ (outs GR32:$dst), (ins GR32:$src1, s16imm:$src2),
+ "ahi\t{$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, immSExt16:$src2)),
+ (implicit PSW)]>;
+def ADD32ri : RILI<0xC29,
+ (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+ "afi\t{$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, imm:$src2)),
+ (implicit PSW)]>;
+def ADD64ri16 : RILI<0xA7B,
+ (outs GR64:$dst), (ins GR64:$src1, s16imm64:$src2),
+ "aghi\t{$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, immSExt16:$src2)),
+ (implicit PSW)]>;
+def ADD64ri32 : RILI<0xC28,
+ (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+ "agfi\t{$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, immSExt32:$src2)),
+ (implicit PSW)]>;
+
+let isCommutable = 1 in { // X = ADC Y, Z == X = ADC Z, Y
+def ADC32rr : RRI<0x1E, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "alr\t{$dst, $src2}",
+ [(set GR32:$dst, (addc GR32:$src1, GR32:$src2))]>;
+def ADC64rr : RREI<0xB90A, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "algr\t{$dst, $src2}",
+ [(set GR64:$dst, (addc GR64:$src1, GR64:$src2))]>;
+}
+
+def ADC32ri : RILI<0xC2B,
+ (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+ "alfi\t{$dst, $src2}",
+ [(set GR32:$dst, (addc GR32:$src1, imm:$src2))]>;
+def ADC64ri32 : RILI<0xC2A,
+ (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+ "algfi\t{$dst, $src2}",
+ [(set GR64:$dst, (addc GR64:$src1, immSExt32:$src2))]>;
+
+let Uses = [PSW] in {
+def ADDE32rr : RREI<0xB998, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "alcr\t{$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, GR32:$src2)),
+ (implicit PSW)]>;
+def ADDE64rr : RREI<0xB988, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "alcgr\t{$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, GR64:$src2)),
+ (implicit PSW)]>;
+}
+
+let isCommutable = 1 in { // X = AND Y, Z == X = AND Z, Y
+def AND32rr : RRI<0x14,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "nr\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+def AND64rr : RREI<0xB980,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "ngr\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>;
+}
+
+def AND32rm : RXI<0x54, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "n\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def AND32rmy : RXYI<0xE354, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "ny\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+def AND64rm : RXYI<0xE360, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "ng\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+
+def AND32rill16 : RII<0xA57,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "nill\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, i32ll16c:$src2))]>;
+def AND64rill16 : RII<0xA57,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nill\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64ll16c:$src2))]>;
+
+def AND32rilh16 : RII<0xA56,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "nilh\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, i32lh16c:$src2))]>;
+def AND64rilh16 : RII<0xA56,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nilh\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64lh16c:$src2))]>;
+
+def AND64rihl16 : RII<0xA55,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nihl\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64hl16c:$src2))]>;
+def AND64rihh16 : RII<0xA54,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nihh\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64hh16c:$src2))]>;
+
+def AND32ri : RILI<0xC0B,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "nilf\t{$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
+def AND64rilo32 : RILI<0xC0B,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nilf\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64lo32c:$src2))]>;
+def AND64rihi32 : RILI<0xC0A,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "nihf\t{$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64hi32c:$src2))]>;
+
+let isCommutable = 1 in { // X = OR Y, Z == X = OR Z, Y
+def OR32rr : RRI<0x16,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "or\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+def OR64rr : RREI<0xB981,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "ogr\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>;
+}
+
+def OR32rm : RXI<0x56, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "o\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def OR32rmy : RXYI<0xE356, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "oy\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+def OR64rm : RXYI<0xE381, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "og\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+
+ // FIXME: Provide proper encoding!
+def OR32ri16 : RII<0xA5B,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "oill\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, i32ll16:$src2))]>;
+def OR32ri16h : RII<0xA5A,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "oilh\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, i32lh16:$src2))]>;
+def OR32ri : RILI<0xC0D,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "oilf\t{$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR64rill16 : RII<0xA5B,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oill\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64ll16:$src2))]>;
+def OR64rilh16 : RII<0xA5A,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oilh\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64lh16:$src2))]>;
+def OR64rihl16 : RII<0xA59,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oihl\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64hl16:$src2))]>;
+def OR64rihh16 : RII<0xA58,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oihh\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64hh16:$src2))]>;
+
+def OR64rilo32 : RILI<0xC0D,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oilf\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64lo32:$src2))]>;
+def OR64rihi32 : RILI<0xC0C,
+ (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+ "oihf\t{$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64hi32:$src2))]>;
+
+def SUB32rr : RRI<0x1B,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "sr\t{$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>;
+def SUB64rr : RREI<0xB909,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "sgr\t{$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB32rm : RXI<0x5B, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "s\t{$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def SUB32rmy : RXYI<0xE35B, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "sy\t{$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+def SUB64rm : RXYI<0xE309, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "sg\t{$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+
+def SBC32rr : RRI<0x1F,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "slr\t{$dst, $src2}",
+ [(set GR32:$dst, (subc GR32:$src1, GR32:$src2))]>;
+def SBC64rr : RREI<0xB90B,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "slgr\t{$dst, $src2}",
+ [(set GR64:$dst, (subc GR64:$src1, GR64:$src2))]>;
+
+def SBC32ri : RILI<0xC25,
+ (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+ "sllfi\t{$dst, $src2}",
+ [(set GR32:$dst, (subc GR32:$src1, imm:$src2))]>;
+def SBC64ri32 : RILI<0xC24,
+ (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+ "slgfi\t{$dst, $src2}",
+ [(set GR64:$dst, (subc GR64:$src1, immSExt32:$src2))]>;
+
+let Uses = [PSW] in {
+def SUBE32rr : RREI<0xB999, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "slbr\t{$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, GR32:$src2)),
+ (implicit PSW)]>;
+def SUBE64rr : RREI<0xB989, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "slbgr\t{$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, GR64:$src2)),
+ (implicit PSW)]>;
+}
+
+let isCommutable = 1 in { // X = XOR Y, Z == X = XOR Z, Y
+def XOR32rr : RRI<0x17,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "xr\t{$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+def XOR64rr : RREI<0xB982,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "xgr\t{$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
+}
+
+def XOR32rm : RXI<0x57,(outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "x\t{$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, (load rriaddr12:$src2))),
+ (implicit PSW)]>;
+def XOR32rmy : RXYI<0xE357, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "xy\t{$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+def XOR64rm : RXYI<0xE382, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "xg\t{$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, (load rriaddr:$src2))),
+ (implicit PSW)]>;
+
+def XOR32ri : RILI<0xC07,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "xilf\t{$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+
+} // Defs = [PSW]
+
+let isCommutable = 1 in { // X = MUL Y, Z == X = MUL Z, Y
+def MUL32rr : RREI<0xB252,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "msr\t{$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>;
+def MUL64rr : RREI<0xB90C,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "msgr\t{$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>;
+}
+
+def MUL64rrP : RRI<0x1C,
+ (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+ "mr\t{$dst, $src2}",
+ []>;
+def UMUL64rrP : RREI<0xB996,
+ (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+ "mlr\t{$dst, $src2}",
+ []>;
+def UMUL128rrP : RREI<0xB986,
+ (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+ "mlgr\t{$dst, $src2}",
+ []>;
+
+def MUL32ri16 : RII<0xA7C,
+ (outs GR32:$dst), (ins GR32:$src1, s16imm:$src2),
+ "mhi\t{$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, i32immSExt16:$src2))]>;
+def MUL64ri16 : RII<0xA7D,
+ (outs GR64:$dst), (ins GR64:$src1, s16imm64:$src2),
+ "mghi\t{$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, immSExt16:$src2))]>;
+
+let AddedComplexity = 2 in {
+def MUL32ri : RILI<0xC21,
+ (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+ "msfi\t{$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>,
+ Requires<[IsZ10]>;
+def MUL64ri32 : RILI<0xC20,
+ (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+ "msgfi\t{$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>,
+ Requires<[IsZ10]>;
+}
+
+def MUL32rm : RXI<0x71,
+ (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+ "ms\t{$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, (load rriaddr12:$src2)))]>;
+def MUL32rmy : RXYI<0xE351,
+ (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+ "msy\t{$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, (load rriaddr:$src2)))]>;
+def MUL64rm : RXYI<0xE30C,
+ (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+ "msg\t{$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, (load rriaddr:$src2)))]>;
+
+def MULSX64rr32 : RREI<0xB91C,
+ (outs GR64:$dst), (ins GR64:$src1, GR32:$src2),
+ "msgfr\t{$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, (sext GR32:$src2)))]>;
+
+def SDIVREM32r : RREI<0xB91D,
+ (outs GR128:$dst), (ins GR128:$src1, GR32:$src2),
+ "dsgfr\t{$dst, $src2}",
+ []>;
+def SDIVREM64r : RREI<0xB90D,
+ (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+ "dsgr\t{$dst, $src2}",
+ []>;
+
+def UDIVREM32r : RREI<0xB997,
+ (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+ "dlr\t{$dst, $src2}",
+ []>;
+def UDIVREM64r : RREI<0xB987,
+ (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+ "dlgr\t{$dst, $src2}",
+ []>;
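+
+// The divide instructions operate on an even/odd register pair: the dividend
+// comes from the pair (just the odd half for the 64-bit-dividend forms) and
+// the result comes back with the remainder in the even register and the
+// quotient in the odd one.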
+let mayLoad = 1 in {
+def SDIVREM32m : RXYI<0xE31D,
+ (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+ "dsgf\t{$dst, $src2}",
+ []>;
+def SDIVREM64m : RXYI<0xE30D,
+ (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+ "dsg\t{$dst, $src2}",
+ []>;
+
+def UDIVREM32m : RXYI<0xE397, (outs GR64P:$dst), (ins GR64P:$src1, rriaddr:$src2),
+ "dl\t{$dst, $src2}",
+ []>;
+def UDIVREM64m : RXYI<0xE387, (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+ "dlg\t{$dst, $src2}",
+ []>;
+} // mayLoad
+} // isTwoAddress = 1
+
+//===----------------------------------------------------------------------===//
+// Shifts
+
+let isTwoAddress = 1 in
+def SRL32rri : RSI<0x88,
+ (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+ "srl\t{$src, $amt}",
+ [(set GR32:$dst, (srl GR32:$src, riaddr32:$amt))]>;
+def SRL64rri : RSYI<0xEB0C,
+ (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+ "srlg\t{$dst, $src, $amt}",
+ [(set GR64:$dst, (srl GR64:$src, riaddr:$amt))]>;
+
+let isTwoAddress = 1 in
+def SHL32rri : RSI<0x89,
+ (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+ "sll\t{$src, $amt}",
+ [(set GR32:$dst, (shl GR32:$src, riaddr32:$amt))]>;
+def SHL64rri : RSYI<0xEB0D,
+ (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+ "sllg\t{$dst, $src, $amt}",
+ [(set GR64:$dst, (shl GR64:$src, riaddr:$amt))]>;
+
+let Defs = [PSW] in {
+let isTwoAddress = 1 in
+def SRA32rri : RSI<0x8A,
+ (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+ "sra\t{$src, $amt}",
+ [(set GR32:$dst, (sra GR32:$src, riaddr32:$amt)),
+ (implicit PSW)]>;
+
+def SRA64rri : RSYI<0xEB0A,
+ (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+ "srag\t{$dst, $src, $amt}",
+ [(set GR64:$dst, (sra GR64:$src, riaddr:$amt)),
+ (implicit PSW)]>;
+} // Defs = [PSW]
+
+def ROTL32rri : RSYI<0xEB1D,
+ (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+ "rll\t{$dst, $src, $amt}",
+ [(set GR32:$dst, (rotl GR32:$src, riaddr32:$amt))]>;
+def ROTL64rri : RSYI<0xEB1C,
+ (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+ "rllg\t{$dst, $src, $amt}",
+ [(set GR64:$dst, (rotl GR64:$src, riaddr:$amt))]>;
+
+//===----------------------------------------------------------------------===//
+// Test instructions (like AND but do not produce any result)
+
+// Integer comparisons
+let Defs = [PSW] in {
+def CMP32rr : RRI<0x19,
+ (outs), (ins GR32:$src1, GR32:$src2),
+ "cr\t$src1, $src2",
+ [(SystemZcmp GR32:$src1, GR32:$src2),
+ (implicit PSW)]>;
+def CMP64rr : RREI<0xB920,
+ (outs), (ins GR64:$src1, GR64:$src2),
+ "cgr\t$src1, $src2",
+ [(SystemZcmp GR64:$src1, GR64:$src2),
+ (implicit PSW)]>;
+
+def CMP32ri : RILI<0xC2D,
+ (outs), (ins GR32:$src1, s32imm:$src2),
+ "cfi\t$src1, $src2",
+ [(SystemZcmp GR32:$src1, imm:$src2),
+ (implicit PSW)]>;
+def CMP64ri32 : RILI<0xC2C,
+ (outs), (ins GR64:$src1, s32imm64:$src2),
+ "cgfi\t$src1, $src2",
+ [(SystemZcmp GR64:$src1, i64immSExt32:$src2),
+ (implicit PSW)]>;
+
+def CMP32rm : RXI<0x59,
+ (outs), (ins GR32:$src1, rriaddr12:$src2),
+ "c\t$src1, $src2",
+ [(SystemZcmp GR32:$src1, (load rriaddr12:$src2)),
+ (implicit PSW)]>;
+def CMP32rmy : RXYI<0xE359,
+ (outs), (ins GR32:$src1, rriaddr:$src2),
+ "cy\t$src1, $src2",
+ [(SystemZcmp GR32:$src1, (load rriaddr:$src2)),
+ (implicit PSW)]>;
+def CMP64rm : RXYI<0xE320,
+ (outs), (ins GR64:$src1, rriaddr:$src2),
+ "cg\t$src1, $src2",
+ [(SystemZcmp GR64:$src1, (load rriaddr:$src2)),
+ (implicit PSW)]>;
+
+def UCMP32rr : RRI<0x15,
+ (outs), (ins GR32:$src1, GR32:$src2),
+ "clr\t$src1, $src2",
+ [(SystemZucmp GR32:$src1, GR32:$src2),
+ (implicit PSW)]>;
+def UCMP64rr : RREI<0xB921,
+ (outs), (ins GR64:$src1, GR64:$src2),
+ "clgr\t$src1, $src2",
+ [(SystemZucmp GR64:$src1, GR64:$src2),
+ (implicit PSW)]>;
+
+def UCMP32ri : RILI<0xC2F,
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "clfi\t$src1, $src2",
+ [(SystemZucmp GR32:$src1, imm:$src2),
+ (implicit PSW)]>;
+def UCMP64ri32 : RILI<0xC2E,
+ (outs), (ins GR64:$src1, i64i32imm:$src2),
+ "clgfi\t$src1, $src2",
+ [(SystemZucmp GR64:$src1, i64immZExt32:$src2),
+ (implicit PSW)]>;
+
+def UCMP32rm : RXI<0x55,
+ (outs), (ins GR32:$src1, rriaddr12:$src2),
+ "cl\t$src1, $src2",
+ [(SystemZucmp GR32:$src1, (load rriaddr12:$src2)),
+ (implicit PSW)]>;
+def UCMP32rmy : RXYI<0xE355,
+ (outs), (ins GR32:$src1, rriaddr:$src2),
+ "cly\t$src1, $src2",
+ [(SystemZucmp GR32:$src1, (load rriaddr:$src2)),
+ (implicit PSW)]>;
+def UCMP64rm : RXYI<0xE321,
+ (outs), (ins GR64:$src1, rriaddr:$src2),
+ "clg\t$src1, $src2",
+ [(SystemZucmp GR64:$src1, (load rriaddr:$src2)),
+ (implicit PSW)]>;
+
+def CMPSX64rr32 : RREI<0xB930,
+ (outs), (ins GR64:$src1, GR32:$src2),
+ "cgfr\t$src1, $src2",
+ [(SystemZcmp GR64:$src1, (sext GR32:$src2)),
+ (implicit PSW)]>;
+def UCMPZX64rr32 : RREI<0xB931,
+ (outs), (ins GR64:$src1, GR32:$src2),
+ "clgfr\t$src1, $src2",
+ [(SystemZucmp GR64:$src1, (zext GR32:$src2)),
+ (implicit PSW)]>;
+
+def CMPSX64rm32 : RXYI<0xE330,
+ (outs), (ins GR64:$src1, rriaddr:$src2),
+ "cgf\t$src1, $src2",
+ [(SystemZcmp GR64:$src1, (sextloadi64i32 rriaddr:$src2)),
+ (implicit PSW)]>;
+def UCMPZX64rm32 : RXYI<0xE331,
+ (outs), (ins GR64:$src1, rriaddr:$src2),
+ "clgf\t$src1, $src2",
+ [(SystemZucmp GR64:$src1, (zextloadi64i32 rriaddr:$src2)),
+ (implicit PSW)]>;
+
+// FIXME: Add other crazy ucmp forms
+
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Other crazy stuff
+let Defs = [PSW] in {
+def FLOGR64 : RREI<0xB983,
+ (outs GR128:$dst), (ins GR64:$src),
+ "flogr\t{$dst, $src}",
+ []>;
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns.
+//===----------------------------------------------------------------------===//
+
+// ConstPools, JumpTables
+def : Pat<(SystemZpcrelwrapper tjumptable:$src), (LA64rm tjumptable:$src)>;
+def : Pat<(SystemZpcrelwrapper tconstpool:$src), (LA64rm tconstpool:$src)>;
+
+// anyext
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
+
+// calls
+def : Pat<(SystemZcall (i64 tglobaladdr:$dst)), (CALLi tglobaladdr:$dst)>;
+def : Pat<(SystemZcall (i64 texternalsym:$dst)), (CALLi texternalsym:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// FIXME: use add/sub tricks with 32768/-32768
+
+// Arbitrary immediate support.
+def : Pat<(i32 imm:$src),
+ (EXTRACT_SUBREG (MOV64ri32 (i64 imm:$src)), subreg_32bit)>;
+
+// Implement in terms of LLIHF/OILF.
+def : Pat<(i64 imm:$imm),
+ (OR64rilo32 (MOV64rihi32 (HI32 imm:$imm)), (LO32 imm:$imm))>;
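+
+// As an illustrative expansion (register chosen arbitrarily), materializing
+// the 64-bit constant 0x00000001DEADBEEF becomes:
+// llihf %r1, 1 # set bits 32-63, zero bits 0-31
+// oilf %r1, 0xdeadbeef # OR in bits 0-31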
+
+// trunc patterns
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+
+// extload patterns
+def : Pat<(extloadi32i8 rriaddr:$src), (MOVZX32rm8 rriaddr:$src)>;
+def : Pat<(extloadi32i16 rriaddr:$src), (MOVZX32rm16 rriaddr:$src)>;
+def : Pat<(extloadi64i8 rriaddr:$src), (MOVZX64rm8 rriaddr:$src)>;
+def : Pat<(extloadi64i16 rriaddr:$src), (MOVZX64rm16 rriaddr:$src)>;
+def : Pat<(extloadi64i32 rriaddr:$src), (MOVZX64rm32 rriaddr:$src)>;
+
+// muls
+def : Pat<(mulhs GR32:$src1, GR32:$src2),
+ (EXTRACT_SUBREG (MUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ GR32:$src1, subreg_odd32),
+ GR32:$src2),
+ subreg_even32)>;
+
+def : Pat<(mulhu GR32:$src1, GR32:$src2),
+ (EXTRACT_SUBREG (UMUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ GR32:$src1, subreg_odd32),
+ GR32:$src2),
+ subreg_even32)>;
+def : Pat<(mulhu GR64:$src1, GR64:$src2),
+ (EXTRACT_SUBREG (UMUL128rrP (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ GR64:$src1, subreg_odd),
+ GR64:$src2),
+ subreg_even)>;
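+
+// The widening multiplies write their product to an even/odd register pair,
+// high half in the even register and low half in the odd one; hence the
+// patterns above insert the first operand into the odd subregister and
+// extract the high result from the even subregister.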
+
+def : Pat<(ctlz GR64:$src),
+ (EXTRACT_SUBREG (FLOGR64 GR64:$src), subreg_even)>;
diff --git a/lib/Target/SystemZ/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/SystemZMCAsmInfo.cpp
new file mode 100644
index 0000000..8ea11c9
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCAsmInfo.cpp
@@ -0,0 +1,26 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SystemZMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+using namespace llvm;
+
+SystemZMCAsmInfo::SystemZMCAsmInfo(const Target &T, const StringRef &TT) {
+ AlignmentIsInBytes = true;
+
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ PCSymbol = ".";
+
+ NonexecutableStackDirective = "\t.section\t.note.GNU-stack,\"\",@progbits";
+}
diff --git a/lib/Target/SystemZ/SystemZMCAsmInfo.h b/lib/Target/SystemZ/SystemZMCAsmInfo.h
new file mode 100644
index 0000000..3bebcb7
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- SystemZMCAsmInfo.h - SystemZ asm properties ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SystemZMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZTARGETASMINFO_H
+#define SystemZTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+
+ struct SystemZMCAsmInfo : public MCAsmInfo {
+ explicit SystemZMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 0000000..e47d419
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,50 @@
+//==- SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares SystemZ-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMACHINEFUNCTIONINFO_H
+#define SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// SystemZMachineFunctionInfo - This class is derived from MachineFunctionInfo
+/// and contains private SystemZ target-specific information for each
+/// MachineFunction.
+class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+ /// LowReg - Low register of range of callee-saved registers to store.
+ unsigned LowReg;
+
+ /// HighReg - High register of range of callee-saved registers to store.
+ unsigned HighReg;
+public:
+ SystemZMachineFunctionInfo()
+ : CalleeSavedFrameSize(0), LowReg(0), HighReg(0) {}
+
+ SystemZMachineFunctionInfo(MachineFunction &MF)
+ : CalleeSavedFrameSize(0), LowReg(0), HighReg(0) {}
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getLowReg() const { return LowReg; }
+ void setLowReg(unsigned Reg) { LowReg = Reg; }
+
+ unsigned getHighReg() const { return HighReg; }
+ void setHighReg(unsigned Reg) { HighReg = Reg; }
+};
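+
+// A minimal usage sketch (the register choices are illustrative only):
+// SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
+// FI->setLowReg(SystemZ::R6D);
+// FI->setHighReg(SystemZ::R15D);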
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 0000000..156cace
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,306 @@
+//===- SystemZOperands.td - SystemZ Operands defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various SystemZ instruction operands.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff.
+//===----------------------------------------------------------------------===//
+
+// SystemZ-specific condition codes. These correspond to SystemZCC::CondCodes
+// in SystemZ.h and must be kept in sync.
+def SYSTEMZ_COND_O : PatLeaf<(i8 0)>;
+def SYSTEMZ_COND_H : PatLeaf<(i8 1)>;
+def SYSTEMZ_COND_NLE : PatLeaf<(i8 2)>;
+def SYSTEMZ_COND_L : PatLeaf<(i8 3)>;
+def SYSTEMZ_COND_NHE : PatLeaf<(i8 4)>;
+def SYSTEMZ_COND_LH : PatLeaf<(i8 5)>;
+def SYSTEMZ_COND_NE : PatLeaf<(i8 6)>;
+def SYSTEMZ_COND_E : PatLeaf<(i8 7)>;
+def SYSTEMZ_COND_NLH : PatLeaf<(i8 8)>;
+def SYSTEMZ_COND_HE : PatLeaf<(i8 9)>;
+def SYSTEMZ_COND_NL : PatLeaf<(i8 10)>;
+def SYSTEMZ_COND_LE : PatLeaf<(i8 11)>;
+def SYSTEMZ_COND_NH : PatLeaf<(i8 12)>;
+def SYSTEMZ_COND_NO : PatLeaf<(i8 13)>;
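+
+// Note that each condition and its negation sum to 13 in this encoding
+// (O/NO, H/NH, ..., E/NE), matching SystemZInstrInfo::getOppositeCondition.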
+
+def LO8 : SDNodeXForm<imm, [{
+ // Transformation function: return low 8 bits.
+ return getI8Imm(N->getZExtValue() & 0x00000000000000FFULL);
+}]>;
+
+def LL16 : SDNodeXForm<imm, [{
+ // Transformation function: return low 16 bits.
+ return getI16Imm(N->getZExtValue() & 0x000000000000FFFFULL);
+}]>;
+
+def LH16 : SDNodeXForm<imm, [{
+ // Transformation function: return bits 16-31.
+ return getI16Imm((N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16);
+}]>;
+
+def HL16 : SDNodeXForm<imm, [{
+ // Transformation function: return bits 32-47.
+ return getI16Imm((N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32);
+}]>;
+
+def HH16 : SDNodeXForm<imm, [{
+ // Transformation function: return bits 48-63.
+ return getI16Imm((N->getZExtValue() & 0xFFFF000000000000ULL) >> 48);
+}]>;
+
+def LO32 : SDNodeXForm<imm, [{
+ // Transformation function: return low 32 bits.
+ return getI32Imm(N->getZExtValue() & 0x00000000FFFFFFFFULL);
+}]>;
+
+def HI32 : SDNodeXForm<imm, [{
+ // Transformation function: return bits 32-63.
+ return getI32Imm(N->getZExtValue() >> 32);
+}]>;
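+
+// As a worked example, the 64-bit constant 0x1122334455667788 yields
+// HH16 = 0x1122, HL16 = 0x3344, LH16 = 0x5566, LL16 = 0x7788,
+// HI32 = 0x11223344 and LO32 = 0x55667788.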
+
+def i32ll16 : PatLeaf<(i32 imm), [{
+ // i32ll16 predicate - true if the 32-bit immediate has only rightmost 16
+ // bits set.
+ return ((N->getZExtValue() & 0x000000000000FFFFULL) == N->getZExtValue());
+}], LL16>;
+
+def i32lh16 : PatLeaf<(i32 imm), [{
+ // i32lh16 predicate - true if the 32-bit immediate has only bits 16-31 set.
+ return ((N->getZExtValue() & 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LH16>;
+
+def i32ll16c : PatLeaf<(i32 imm), [{
+ // i32ll16c predicate - true if the 32-bit immediate has all bits 16-31 set.
+ return ((N->getZExtValue() | 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LL16>;
+
+def i32lh16c : PatLeaf<(i32 imm), [{
+ // i32lh16c predicate - true if the 32-bit immediate has all rightmost 16
+ // bits set.
+ return ((N->getZExtValue() | 0x000000000000FFFFULL) == N->getZExtValue());
+}], LH16>;
+
+def i64ll16 : PatLeaf<(i64 imm), [{
+ // i64ll16 predicate - true if the 64-bit immediate has only rightmost 16
+ // bits set.
+ return ((N->getZExtValue() & 0x000000000000FFFFULL) == N->getZExtValue());
+}], LL16>;
+
+def i64lh16 : PatLeaf<(i64 imm), [{
+ // i64lh16 predicate - true if the 64-bit immediate has only bits 16-31 set.
+ return ((N->getZExtValue() & 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LH16>;
+
+def i64hl16 : PatLeaf<(i64 imm), [{
+ // i64hl16 predicate - true if the 64-bit immediate has only bits 32-47 set.
+ return ((N->getZExtValue() & 0x0000FFFF00000000ULL) == N->getZExtValue());
+}], HL16>;
+
+def i64hh16 : PatLeaf<(i64 imm), [{
+ // i64hh16 predicate - true if the 64-bit immediate has only bits 48-63 set.
+ return ((N->getZExtValue() & 0xFFFF000000000000ULL) == N->getZExtValue());
+}], HH16>;
+
+def i64ll16c : PatLeaf<(i64 imm), [{
+  // i64ll16c predicate - true if the 64-bit immediate has all bits other
+  // than the rightmost 16 set.
+ return ((N->getZExtValue() | 0xFFFFFFFFFFFF0000ULL) == N->getZExtValue());
+}], LL16>;
+
+def i64lh16c : PatLeaf<(i64 imm), [{
+  // i64lh16c predicate - true if the 64-bit immediate has all bits other
+  // than bits 16-31 set.
+ return ((N->getZExtValue() | 0xFFFFFFFF0000FFFFULL) == N->getZExtValue());
+}], LH16>;
+
+def i64hl16c : PatLeaf<(i64 imm), [{
+  // i64hl16c predicate - true if the 64-bit immediate has all bits other
+  // than bits 32-47 set.
+ return ((N->getZExtValue() | 0xFFFF0000FFFFFFFFULL) == N->getZExtValue());
+}], HL16>;
+
+def i64hh16c : PatLeaf<(i64 imm), [{
+  // i64hh16c predicate - true if the 64-bit immediate has all bits other
+  // than bits 48-63 set.
+ return ((N->getZExtValue() | 0x0000FFFFFFFFFFFFULL) == N->getZExtValue());
+}], HH16>;
+
+def immSExt16 : PatLeaf<(imm), [{
+ // immSExt16 predicate - true if the immediate fits in a 16-bit sign extended
+ // field.
+ if (N->getValueType(0) == MVT::i64) {
+ uint64_t val = N->getZExtValue();
+ return ((int64_t)val == (int16_t)val);
+ } else if (N->getValueType(0) == MVT::i32) {
+ uint32_t val = N->getZExtValue();
+ return ((int32_t)val == (int16_t)val);
+ }
+
+ return false;
+}], LL16>;
+
+def immSExt32 : PatLeaf<(i64 imm), [{
+ // immSExt32 predicate - true if the immediate fits in a 32-bit sign extended
+ // field.
+ uint64_t val = N->getZExtValue();
+ return ((int64_t)val == (int32_t)val);
+}], LO32>;
+
+def i64lo32 : PatLeaf<(i64 imm), [{
+ // i64lo32 predicate - true if the 64-bit immediate has only rightmost 32
+ // bits set.
+ return ((N->getZExtValue() & 0x00000000FFFFFFFFULL) == N->getZExtValue());
+}], LO32>;
+
+def i64hi32 : PatLeaf<(i64 imm), [{
+ // i64hi32 predicate - true if the 64-bit immediate has only bits 32-63 set.
+ return ((N->getZExtValue() & 0xFFFFFFFF00000000ULL) == N->getZExtValue());
+}], HI32>;
+
+def i64lo32c : PatLeaf<(i64 imm), [{
+  // i64lo32c predicate - true if the 64-bit immediate has all bits 32-63 set.
+ return ((N->getZExtValue() | 0xFFFFFFFF00000000ULL) == N->getZExtValue());
+}], LO32>;
+
+def i64hi32c : PatLeaf<(i64 imm), [{
+  // i64hi32c predicate - true if the 64-bit immediate has all bits 0-31 set.
+ return ((N->getZExtValue() | 0x00000000FFFFFFFFULL) == N->getZExtValue());
+}], HI32>;
+
+def i32immSExt8 : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+ // sign extended field.
+ return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}], LO8>;
+
+def i32immSExt16 : PatLeaf<(i32 imm), [{
+ // i32immSExt16 predicate - True if the 32-bit immediate fits in a 16-bit
+ // sign extended field.
+ return (int32_t)N->getZExtValue() == (int16_t)N->getZExtValue();
+}], LL16>;
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}], LO32>;
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // zero extended field.
+ return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}], LO32>;
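Stated as ranges: immSExt16 accepts [-32768, 32767], i32immSExt8 accepts [-128, 127], immSExt32 and i64immSExt32 accept [-2^31, 2^31 - 1], and i64immZExt32 accepts [0, 2^32 - 1]; the attached SDNodeXForm then truncates the matched value to the instruction's immediate field width.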
+
+// extloads
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+// Some more descriptive operand definitions.
+// 32 bits, but only the low 8 bits are significant.
+def i32i8imm  : Operand<i32>;
+// 32 bits, but only the low 16 bits are significant.
+def i32i16imm : Operand<i32>;
+// 64 bits, but only the low 32 bits are significant.
+def i64i32imm : Operand<i64>;
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+// Unsigned i12
+def u12imm : Operand<i32> {
+ let PrintMethod = "printU12ImmOperand";
+}
+def u12imm64 : Operand<i64> {
+ let PrintMethod = "printU12ImmOperand";
+}
+
+// Signed i16
+def s16imm : Operand<i32> {
+ let PrintMethod = "printS16ImmOperand";
+}
+def s16imm64 : Operand<i64> {
+ let PrintMethod = "printS16ImmOperand";
+}
+// Signed i20
+def s20imm : Operand<i32> {
+ let PrintMethod = "printS20ImmOperand";
+}
+def s20imm64 : Operand<i64> {
+ let PrintMethod = "printS20ImmOperand";
+}
+// Signed i32
+def s32imm : Operand<i32> {
+ let PrintMethod = "printS32ImmOperand";
+}
+def s32imm64 : Operand<i64> {
+ let PrintMethod = "printS32ImmOperand";
+}
+
+def imm_pcrel : Operand<i64> {
+ let PrintMethod = "printPCRelImmOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// SystemZ Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+
+// riaddr := reg + imm
+def riaddr32 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrRI12Only", []> {
+ let PrintMethod = "printRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, u12imm:$disp);
+}
+
+def riaddr12 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrRI12", []> {
+ let PrintMethod = "printRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, u12imm64:$disp);
+}
+
+def riaddr : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrRI", []> {
+ let PrintMethod = "printRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp);
+}
+
+//===----------------------------------------------------------------------===//
+
+// rriaddr := reg + reg + imm
+def rriaddr12 : Operand<i64>,
+ ComplexPattern<i64, 3, "SelectAddrRRI12", [], []> {
+ let PrintMethod = "printRRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, u12imm64:$disp, ADDR64:$index);
+}
+def rriaddr : Operand<i64>,
+ ComplexPattern<i64, 3, "SelectAddrRRI20", [], []> {
+ let PrintMethod = "printRRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp, ADDR64:$index);
+}
+def laaddr : Operand<i64>,
+ ComplexPattern<i64, 3, "SelectLAAddr", [add, sub, or, frameindex], []> {
+ let PrintMethod = "printRRIAddrOperand";
+ let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp, ADDR64:$index);
+}
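These operand shapes mirror SystemZ's D(B) and D(X,B) address forms; for instance, an rriaddr with base %r11, displacement 8 and index %r3 would be printed by printRRIAddrOperand as 8(%r3,%r11) in standard s390x assembler syntax (the exact rendering is up to the AsmPrinter, so treat this spelling as illustrative).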
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 0000000..38460a6
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,343 @@
+//===- SystemZRegisterInfo.cpp - SystemZ Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+using namespace llvm;
+
+SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm,
+ const SystemZInstrInfo &tii)
+ : SystemZGenRegisterInfo(SystemZ::ADJCALLSTACKUP, SystemZ::ADJCALLSTACKDOWN),
+ TM(tm), TII(tii) {
+}
+
+const unsigned*
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const unsigned CalleeSavedRegs[] = {
+ SystemZ::R6D, SystemZ::R7D, SystemZ::R8D, SystemZ::R9D,
+ SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D,
+ SystemZ::R14D, SystemZ::R15D,
+ SystemZ::F8L, SystemZ::F9L, SystemZ::F10L, SystemZ::F11L,
+ SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L,
+ 0
+ };
+
+ return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+SystemZRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+ &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+ &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+ &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+ &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+ &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+ &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+ &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+ &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+ &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, 0
+ };
+ return CalleeSavedRegClasses;
+}
+
+BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ if (hasFP(MF))
+ Reserved.set(SystemZ::R11D);
+ Reserved.set(SystemZ::R14D);
+ Reserved.set(SystemZ::R15D);
+ return Reserved;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated
+/// frame pointer register. This is true if the function has variable sized
+/// allocas or if frame pointer elimination is disabled.
+bool SystemZRegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+void SystemZRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ MBB.erase(I);
+}
+
+int SystemZRegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SystemZMachineFunctionInfo *SystemZMFI =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ int Offset = MFI->getObjectOffset(FI) + MFI->getOffsetAdjustment();
+ uint64_t StackSize = MFI->getStackSize();
+
+ // Fixed objects are really located in the "previous" frame.
+ if (FI < 0)
+ StackSize -= SystemZMFI->getCalleeSavedFrameSize();
+
+ Offset += StackSize - TFI.getOffsetOfLocalArea();
+
+ // Skip the register save area if we generated the stack frame.
+ if (StackSize || MFI->hasCalls())
+ Offset -= TFI.getOffsetOfLocalArea();
+
+ return Offset;
+}
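A note on the constants above: getOffsetOfLocalArea() returns -160 for this target (the FrameInfo is constructed as TargetFrameInfo(StackGrowsDown, 8, -160) in SystemZTargetMachine.cpp later in this patch), which corresponds to the 160-byte register save area the s390x ELF ABI reserves at the low end of every stack frame; that is why the value shows up twice in the offset computation.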
+
+unsigned
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected SPAdj value");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ unsigned BasePtr = (hasFP(MF) ? SystemZ::R11D : SystemZ::R15D);
+
+  // This must be part of an rri or ri operand memory reference. Replace the
+  // FrameIndex with the base register BasePtr and fold the frame offset into
+  // the displacement field.
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+  // The offset is either a 12-bit unsigned or a 20-bit signed integer.
+ // FIXME: handle "too long" displacements.
+ int Offset = getFrameIndexOffset(MF, FrameIndex) + MI.getOperand(i+1).getImm();
+
+  // Pick the instruction form based on whether the displacement fits into a
+  // 12-bit zero-extended field.
+ MI.setDesc(TII.getMemoryInstr(MI.getOpcode(), Offset));
+
+ MI.getOperand(i+1).ChangeToImmediate(Offset);
+ return 0;
+}
+
+void
+SystemZRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+  // Determine whether R15/R14 will ever be clobbered inside the function and,
+  // if so, mark them as callee-saved.
+ MachineFrameInfo *FFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Check whether the high FPRs are ever used; if so, R15 needs to be saved
+  // as well.
+ static const unsigned HighFPRs[] = {
+ SystemZ::F8L, SystemZ::F9L, SystemZ::F10L, SystemZ::F11L,
+ SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S,
+ };
+
+ bool HighFPRsUsed = false;
+ for (unsigned i = 0, e = array_lengthof(HighFPRs); i != e; ++i)
+ HighFPRsUsed |= MRI.isPhysRegUsed(HighFPRs[i]);
+
+ if (FFI->hasCalls())
+ /* FIXME: function is varargs */
+ /* FIXME: function grabs RA */
+ /* FIXME: function calls eh_return */
+ MRI.setPhysRegUsed(SystemZ::R14D);
+
+ if (HighFPRsUsed ||
+ FFI->hasCalls() ||
+ FFI->getObjectIndexEnd() != 0 || // Contains automatic variables
+      FFI->hasVarSizedObjects()        // Function calls dynamic allocas
+ /* FIXME: function is varargs */)
+ MRI.setPhysRegUsed(SystemZ::R15D);
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ int64_t NumBytes, const TargetInstrInfo &TII) {
+ unsigned Opc; uint64_t Chunk;
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+ if (Offset >= (1LL << 15) - 1) {
+ Opc = SystemZ::ADD64ri32;
+ Chunk = (1LL << 31) - 1;
+ } else {
+ Opc = SystemZ::ADD64ri16;
+ Chunk = (1LL << 15) - 1;
+ }
+
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), SystemZ::R15D)
+ .addReg(SystemZ::R15D).addImm((isSub ? -(int64_t)ThisVal : ThisVal));
+ // The PSW implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ Offset -= ThisVal;
+ }
+}
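For example, allocating a 40000-byte frame reaches this helper with NumBytes = -40000: since 40000 exceeds the 16-bit signed range, the ADD64ri32 form is selected and a single add of -40000 to R15 is emitted (presumably assembling to something like agfi %r15, -40000); only offsets larger than the selected Chunk require multiple adds.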
+
+void SystemZRegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SystemZMachineFunctionInfo *SystemZMFI =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ // Get the number of bytes to allocate from the FrameInfo.
+  // Note that the area for the callee-saved registers is already allocated,
+  // so we need to 'undo' that part of the stack adjustment.
+ uint64_t StackSize = MFI->getStackSize();
+ StackSize -= SystemZMFI->getCalleeSavedFrameSize();
+
+ uint64_t NumBytes = StackSize - TFI.getOffsetOfLocalArea();
+
+ // Skip the callee-saved push instructions.
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == SystemZ::MOV64mr ||
+ MBBI->getOpcode() == SystemZ::MOV64mrm))
+ ++MBBI;
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+  // Adjust the stack pointer: R15 -= NumBytes.
+ if (StackSize || MFI->hasCalls()) {
+ assert(MF.getRegInfo().isPhysRegUsed(SystemZ::R15D) &&
+ "Invalid stack frame calculation!");
+ emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, TII);
+ }
+
+ if (hasFP(MF)) {
+ // Update R11 with the new base value...
+ BuildMI(MBB, MBBI, DL, TII.get(SystemZ::MOV64rr), SystemZ::R11D)
+ .addReg(SystemZ::R15D);
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(SystemZ::R11D);
+
+ }
+}
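Putting emitPrologue together, a function with locals and a frame pointer would typically get a sequence along these lines (a sketch; the callee-saved stores come from separate spill code, and the save offset assumes the standard s390x register save area):

  stmg  %r11, %r15, 88(%r15)   # callee-saved stores (MOV64mrm), skipped above
  aghi  %r15, -N               # emitSPUpdate: allocate N bytes
  lgr   %r11, %r15             # MOV64rr: establish the frame pointer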
+
+void SystemZRegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ SystemZMachineFunctionInfo *SystemZMFI =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ unsigned RetOpcode = MBBI->getOpcode();
+
+ switch (RetOpcode) {
+ case SystemZ::RET: break; // These are ok
+ default:
+ assert(0 && "Can only insert epilog into returning blocks");
+ }
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  // Note that the area for the callee-saved registers is already allocated,
+  // so we need to 'undo' that part of the stack adjustment.
+ uint64_t StackSize =
+ MFI->getStackSize() - SystemZMFI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = StackSize - TFI.getOffsetOfLocalArea();
+
+  // Skip over the terminator instructions at the end of the block.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ --MBBI;
+ if (!PI->getDesc().isTerminator())
+ break;
+ }
+
+ // During callee-saved restores emission stack frame was not yet finialized
+ // (and thus - the stack size was unknown). Tune the offset having full stack
+ // size in hands.
+ if (StackSize || MFI->hasCalls()) {
+ assert((MBBI->getOpcode() == SystemZ::MOV64rmm ||
+ MBBI->getOpcode() == SystemZ::MOV64rm) &&
+ "Expected to see callee-save register restore code");
+ assert(MF.getRegInfo().isPhysRegUsed(SystemZ::R15D) &&
+ "Invalid stack frame calculation!");
+
+ unsigned i = 0;
+ MachineInstr &MI = *MBBI;
+ while (!MI.getOperand(i).isImm()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Unexpected restore code!");
+ }
+
+ uint64_t Offset = NumBytes + MI.getOperand(i).getImm();
+    // If Offset does not fit into the 20-bit signed displacement field, we
+    // need to emit some additional code...
+ if (Offset > 524287) {
+      // Fold as much of the displacement as possible into the load
+      // instruction.
+ NumBytes = Offset - 524287;
+ Offset = 524287;
+ emitSPUpdate(MBB, MBBI, NumBytes, TII);
+ }
+
+ MI.getOperand(i).ChangeToImmediate(Offset);
+ }
+}
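Worked numbers for the overflow path: 524287 is the largest 20-bit signed displacement (2^19 - 1). If Offset came out as 600000, the code would first advance R15 by 600000 - 524287 = 75713 via emitSPUpdate and then fold the remaining 524287 into the restore's displacement, so the restore still addresses the original slot.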
+
+unsigned SystemZRegisterInfo::getRARegister() const {
+ assert(0 && "What is the return address register");
+ return 0;
+}
+
+unsigned SystemZRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ assert(0 && "What is the frame register");
+ return 0;
+}
+
+unsigned SystemZRegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned SystemZRegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+ assert(0 && "What is the dwarf register number");
+ return -1;
+}
+
+#include "SystemZGenRegisterInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 0000000..b22b05d
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,82 @@
+//===- SystemZRegisterInfo.h - SystemZ Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZREGISTERINFO_H
+#define SystemZREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "SystemZGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+ /// SubregIndex - The index of various sized subregister classes. Note that
+ /// these indices must be kept in sync with the class indices in the
+ /// SystemZRegisterInfo.td file.
+ enum SubregIndex {
+ SUBREG_32BIT = 1, SUBREG_EVEN = 1, SUBREG_ODD = 2
+ };
+}
+
+class SystemZSubtarget;
+class SystemZInstrInfo;
+class Type;
+
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+ SystemZTargetMachine &TM;
+ const SystemZInstrInfo &TII;
+
+ SystemZRegisterInfo(SystemZTargetMachine &tm, const SystemZInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(MachineFunction &MF) const { return true; }
+ bool hasFP(const MachineFunction &MF) const;
+
+ int getFrameIndexOffset(MachineFunction &MF, int FI) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
+
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 0000000..8795847
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,490 @@
+//===- SystemZRegisterInfo.td - The SystemZ Register File ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+class SystemZReg<string n> : Register<n> {
+ let Namespace = "SystemZ";
+}
+
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ let Namespace = "SystemZ";
+}
+
+// We identify all our registers with a 4-bit ID, for consistency's sake.
+
+// GPR32 - Lower 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<4> num, string n> : SystemZReg<n> {
+ field bits<4> Num = num;
+}
+
+// GPR64 - One of the 16 64-bit general-purpose registers
+class GPR64<bits<4> num, string n, list<Register> subregs,
+ list<Register> aliases = []>
+ : SystemZRegWithSubregs<n, subregs> {
+ field bits<4> Num = num;
+ let Aliases = aliases;
+}
+
+// GPR128 - One of the 8 even-odd 128-bit general-purpose register pairs
+class GPR128<bits<4> num, string n, list<Register> subregs,
+ list<Register> aliases = []>
+ : SystemZRegWithSubregs<n, subregs> {
+ field bits<4> Num = num;
+ let Aliases = aliases;
+}
+
+// FPRS - Lower 32 bits of one of the 16 64-bit floating-point registers
+class FPRS<bits<4> num, string n> : SystemZReg<n> {
+ field bits<4> Num = num;
+}
+
+// FPRL - One of the 16 64-bit floating-point registers
+class FPRL<bits<4> num, string n, list<Register> subregs>
+ : SystemZRegWithSubregs<n, subregs> {
+ field bits<4> Num = num;
+}
+
+// General-purpose registers
+def R0W : GPR32< 0, "r0">, DwarfRegNum<[0]>;
+def R1W : GPR32< 1, "r1">, DwarfRegNum<[1]>;
+def R2W : GPR32< 2, "r2">, DwarfRegNum<[2]>;
+def R3W : GPR32< 3, "r3">, DwarfRegNum<[3]>;
+def R4W : GPR32< 4, "r4">, DwarfRegNum<[4]>;
+def R5W : GPR32< 5, "r5">, DwarfRegNum<[5]>;
+def R6W : GPR32< 6, "r6">, DwarfRegNum<[6]>;
+def R7W : GPR32< 7, "r7">, DwarfRegNum<[7]>;
+def R8W : GPR32< 8, "r8">, DwarfRegNum<[8]>;
+def R9W : GPR32< 9, "r9">, DwarfRegNum<[9]>;
+def R10W : GPR32<10, "r10">, DwarfRegNum<[10]>;
+def R11W : GPR32<11, "r11">, DwarfRegNum<[11]>;
+def R12W : GPR32<12, "r12">, DwarfRegNum<[12]>;
+def R13W : GPR32<13, "r13">, DwarfRegNum<[13]>;
+def R14W : GPR32<14, "r14">, DwarfRegNum<[14]>;
+def R15W : GPR32<15, "r15">, DwarfRegNum<[15]>;
+
+def R0D : GPR64< 0, "r0", [R0W]>, DwarfRegNum<[0]>;
+def R1D : GPR64< 1, "r1", [R1W]>, DwarfRegNum<[1]>;
+def R2D : GPR64< 2, "r2", [R2W]>, DwarfRegNum<[2]>;
+def R3D : GPR64< 3, "r3", [R3W]>, DwarfRegNum<[3]>;
+def R4D : GPR64< 4, "r4", [R4W]>, DwarfRegNum<[4]>;
+def R5D : GPR64< 5, "r5", [R5W]>, DwarfRegNum<[5]>;
+def R6D : GPR64< 6, "r6", [R6W]>, DwarfRegNum<[6]>;
+def R7D : GPR64< 7, "r7", [R7W]>, DwarfRegNum<[7]>;
+def R8D : GPR64< 8, "r8", [R8W]>, DwarfRegNum<[8]>;
+def R9D : GPR64< 9, "r9", [R9W]>, DwarfRegNum<[9]>;
+def R10D : GPR64<10, "r10", [R10W]>, DwarfRegNum<[10]>;
+def R11D : GPR64<11, "r11", [R11W]>, DwarfRegNum<[11]>;
+def R12D : GPR64<12, "r12", [R12W]>, DwarfRegNum<[12]>;
+def R13D : GPR64<13, "r13", [R13W]>, DwarfRegNum<[13]>;
+def R14D : GPR64<14, "r14", [R14W]>, DwarfRegNum<[14]>;
+def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>;
+
+// Register pairs
+def R0P : GPR64< 0, "r0", [R0W, R1W], [R0D, R1D]>, DwarfRegNum<[0]>;
+def R2P : GPR64< 2, "r2", [R2W, R3W], [R2D, R3D]>, DwarfRegNum<[2]>;
+def R4P : GPR64< 4, "r4", [R4W, R5W], [R4D, R5D]>, DwarfRegNum<[4]>;
+def R6P : GPR64< 6, "r6", [R6W, R7W], [R6D, R7D]>, DwarfRegNum<[6]>;
+def R8P : GPR64< 8, "r8", [R8W, R9W], [R8D, R9D]>, DwarfRegNum<[8]>;
+def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>, DwarfRegNum<[10]>;
+def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>, DwarfRegNum<[12]>;
+def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>;
+
+def R0Q : GPR128< 0, "r0", [R0D, R1D], [R0P]>, DwarfRegNum<[0]>;
+def R2Q : GPR128< 2, "r2", [R2D, R3D], [R2P]>, DwarfRegNum<[2]>;
+def R4Q : GPR128< 4, "r4", [R4D, R5D], [R4P]>, DwarfRegNum<[4]>;
+def R6Q : GPR128< 6, "r6", [R6D, R7D], [R6P]>, DwarfRegNum<[6]>;
+def R8Q : GPR128< 8, "r8", [R8D, R9D], [R8P]>, DwarfRegNum<[8]>;
+def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>, DwarfRegNum<[10]>;
+def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>, DwarfRegNum<[12]>;
+def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>, DwarfRegNum<[14]>;
+
+// Floating-point registers
+def F0S : FPRS< 0, "f0">, DwarfRegNum<[16]>;
+def F1S : FPRS< 1, "f1">, DwarfRegNum<[17]>;
+def F2S : FPRS< 2, "f2">, DwarfRegNum<[18]>;
+def F3S : FPRS< 3, "f3">, DwarfRegNum<[19]>;
+def F4S : FPRS< 4, "f4">, DwarfRegNum<[20]>;
+def F5S : FPRS< 5, "f5">, DwarfRegNum<[21]>;
+def F6S : FPRS< 6, "f6">, DwarfRegNum<[22]>;
+def F7S : FPRS< 7, "f7">, DwarfRegNum<[23]>;
+def F8S : FPRS< 8, "f8">, DwarfRegNum<[24]>;
+def F9S : FPRS< 9, "f9">, DwarfRegNum<[25]>;
+def F10S : FPRS<10, "f10">, DwarfRegNum<[26]>;
+def F11S : FPRS<11, "f11">, DwarfRegNum<[27]>;
+def F12S : FPRS<12, "f12">, DwarfRegNum<[28]>;
+def F13S : FPRS<13, "f13">, DwarfRegNum<[29]>;
+def F14S : FPRS<14, "f14">, DwarfRegNum<[30]>;
+def F15S : FPRS<15, "f15">, DwarfRegNum<[31]>;
+
+def F0L : FPRL< 0, "f0", [F0S]>, DwarfRegNum<[16]>;
+def F1L : FPRL< 1, "f1", [F1S]>, DwarfRegNum<[17]>;
+def F2L : FPRL< 2, "f2", [F2S]>, DwarfRegNum<[18]>;
+def F3L : FPRL< 3, "f3", [F3S]>, DwarfRegNum<[19]>;
+def F4L : FPRL< 4, "f4", [F4S]>, DwarfRegNum<[20]>;
+def F5L : FPRL< 5, "f5", [F5S]>, DwarfRegNum<[21]>;
+def F6L : FPRL< 6, "f6", [F6S]>, DwarfRegNum<[22]>;
+def F7L : FPRL< 7, "f7", [F7S]>, DwarfRegNum<[23]>;
+def F8L : FPRL< 8, "f8", [F8S]>, DwarfRegNum<[24]>;
+def F9L : FPRL< 9, "f9", [F9S]>, DwarfRegNum<[25]>;
+def F10L : FPRL<10, "f10", [F10S]>, DwarfRegNum<[26]>;
+def F11L : FPRL<11, "f11", [F11S]>, DwarfRegNum<[27]>;
+def F12L : FPRL<12, "f12", [F12S]>, DwarfRegNum<[28]>;
+def F13L : FPRL<13, "f13", [F13S]>, DwarfRegNum<[29]>;
+def F14L : FPRL<14, "f14", [F14S]>, DwarfRegNum<[30]>;
+def F15L : FPRL<15, "f15", [F15S]>, DwarfRegNum<[31]>;
+
+// Status register
+def PSW : SystemZReg<"psw">;
+
+def subreg_32bit : PatLeaf<(i32 1)>;
+def subreg_even32 : PatLeaf<(i32 1)>;
+def subreg_odd32 : PatLeaf<(i32 2)>;
+def subreg_even : PatLeaf<(i32 3)>;
+def subreg_odd : PatLeaf<(i32 4)>;
+
+def : SubRegSet<1, [R0D, R1D, R2D, R3D, R4D, R5D, R6D, R7D,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+ [R0W, R1W, R2W, R3W, R4W, R5W, R6W, R7W,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<3, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+ [R0D, R2D, R4D, R6D, R8D, R10D, R12D, R14D]>;
+
+def : SubRegSet<4, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+ [R1D, R3D, R5D, R7D, R9D, R11D, R13D, R15D]>;
+
+def : SubRegSet<1, [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P],
+ [R0W, R2W, R4W, R6W, R8W, R10W, R12W, R14W]>;
+
+def : SubRegSet<2, [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P],
+ [R1W, R3W, R5W, R7W, R9W, R11W, R13W, R15W]>;
+
+def : SubRegSet<1, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+ [R0W, R2W, R4W, R6W, R8W, R10W, R12W, R14W]>;
+
+def : SubRegSet<2, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+ [R1W, R3W, R5W, R7W, R9W, R11W, R13W, R15W]>;
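Taken together with the PatLeaf values above, these tables say: index 1 maps each 64-bit register to its low 32-bit word (R0D to R0W); indices 3 and 4 (subreg_even/subreg_odd) map a 128-bit pair to its even and odd 64-bit halves (R0Q to R0D and R1D); and indices 1 and 2 (subreg_even32/subreg_odd32) map both the 64-bit pairs and the 128-bit quads to their even and odd 32-bit words (R0P or R0Q to R0W and R1W).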
+
+/// Register classes
+def GR32 : RegisterClass<"SystemZ", [i32], 32,
+ // Volatile registers
+ [R0W, R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
+ // Frame pointer, sometimes allocable
+ R11W,
+ // Volatile, but not allocable
+ R14W, R15W]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REG32[] = {
+ SystemZ::R1W, SystemZ::R2W, SystemZ::R3W, SystemZ::R4W,
+ SystemZ::R5W, SystemZ::R0W, SystemZ::R12W, SystemZ::R11W,
+ SystemZ::R10W, SystemZ::R9W, SystemZ::R8W, SystemZ::R7W,
+ SystemZ::R6W, SystemZ::R14W, SystemZ::R13W
+ };
+ static const unsigned SystemZ_REG32_nofp[] = {
+ SystemZ::R1W, SystemZ::R2W, SystemZ::R3W, SystemZ::R4W,
+ SystemZ::R5W, SystemZ::R0W, SystemZ::R12W, /* No R11W */
+ SystemZ::R10W, SystemZ::R9W, SystemZ::R8W, SystemZ::R7W,
+ SystemZ::R6W, SystemZ::R14W, SystemZ::R13W
+ };
+ GR32Class::iterator
+ GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG32_nofp;
+ else
+ return SystemZ_REG32;
+ }
+ GR32Class::iterator
+ GR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG32_nofp + (sizeof(SystemZ_REG32_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_REG32 + (sizeof(SystemZ_REG32) / sizeof(unsigned));
+ }
+ }];
+}
+
+/// Registers used for address generation: everything except R0.
+def ADDR32 : RegisterClass<"SystemZ", [i32], 32,
+ // Volatile registers
+ [R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
+ // Frame pointer, sometimes allocable
+ R11W,
+ // Volatile, but not allocable
+ R14W, R15W]>
+{
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_ADDR32[] = {
+ SystemZ::R1W, SystemZ::R2W, SystemZ::R3W, SystemZ::R4W,
+ SystemZ::R5W, /* No R0W */ SystemZ::R12W, SystemZ::R11W,
+ SystemZ::R10W, SystemZ::R9W, SystemZ::R8W, SystemZ::R7W,
+ SystemZ::R6W, SystemZ::R14W, SystemZ::R13W
+ };
+ static const unsigned SystemZ_ADDR32_nofp[] = {
+ SystemZ::R1W, SystemZ::R2W, SystemZ::R3W, SystemZ::R4W,
+ SystemZ::R5W, /* No R0W */ SystemZ::R12W, /* No R11W */
+ SystemZ::R10W, SystemZ::R9W, SystemZ::R8W, SystemZ::R7W,
+ SystemZ::R6W, SystemZ::R14W, SystemZ::R13W
+ };
+ ADDR32Class::iterator
+ ADDR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_ADDR32_nofp;
+ else
+ return SystemZ_ADDR32;
+ }
+ ADDR32Class::iterator
+ ADDR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_ADDR32_nofp + (sizeof(SystemZ_ADDR32_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_ADDR32 + (sizeof(SystemZ_ADDR32) / sizeof(unsigned));
+ }
+ }];
+}
+
+def GR64 : RegisterClass<"SystemZ", [i64], 64,
+ // Volatile registers
+ [R0D, R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
+ // Frame pointer, sometimes allocable
+ R11D,
+ // Volatile, but not allocable
+ R14D, R15D]>
+{
+ let SubRegClassList = [GR32];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REG64[] = {
+ SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, SystemZ::R4D,
+ SystemZ::R5D, SystemZ::R0D, SystemZ::R12D, SystemZ::R11D,
+ SystemZ::R10D, SystemZ::R9D, SystemZ::R8D, SystemZ::R7D,
+ SystemZ::R6D, SystemZ::R14D, SystemZ::R13D
+ };
+ static const unsigned SystemZ_REG64_nofp[] = {
+ SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, SystemZ::R4D,
+ SystemZ::R5D, SystemZ::R0D, SystemZ::R12D, /* No R11D */
+ SystemZ::R10D, SystemZ::R9D, SystemZ::R8D, SystemZ::R7D,
+ SystemZ::R6D, SystemZ::R14D, SystemZ::R13D
+ };
+ GR64Class::iterator
+ GR64Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG64_nofp;
+ else
+ return SystemZ_REG64;
+ }
+ GR64Class::iterator
+ GR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG64_nofp + (sizeof(SystemZ_REG64_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_REG64 + (sizeof(SystemZ_REG64) / sizeof(unsigned));
+ }
+ }];
+}
+
+def ADDR64 : RegisterClass<"SystemZ", [i64], 64,
+ // Volatile registers
+ [R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
+ // Frame pointer, sometimes allocable
+ R11D,
+ // Volatile, but not allocable
+ R14D, R15D]>
+{
+ let SubRegClassList = [ADDR32];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_ADDR64[] = {
+ SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, SystemZ::R4D,
+ SystemZ::R5D, /* No R0D */ SystemZ::R12D, SystemZ::R11D,
+ SystemZ::R10D, SystemZ::R9D, SystemZ::R8D, SystemZ::R7D,
+ SystemZ::R6D, SystemZ::R14D, SystemZ::R13D
+ };
+ static const unsigned SystemZ_ADDR64_nofp[] = {
+ SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, SystemZ::R4D,
+ SystemZ::R5D, /* No R0D */ SystemZ::R12D, /* No R11D */
+ SystemZ::R10D, SystemZ::R9D, SystemZ::R8D, SystemZ::R7D,
+ SystemZ::R6D, SystemZ::R14D, SystemZ::R13D
+ };
+ ADDR64Class::iterator
+ ADDR64Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_ADDR64_nofp;
+ else
+ return SystemZ_ADDR64;
+ }
+ ADDR64Class::iterator
+ ADDR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_ADDR64_nofp + (sizeof(SystemZ_ADDR64_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_ADDR64 + (sizeof(SystemZ_ADDR64) / sizeof(unsigned));
+ }
+ }];
+}
+
+// Even-odd register pairs
+def GR64P : RegisterClass<"SystemZ", [v2i32], 64,
+ [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P]>
+{
+ let SubRegClassList = [GR32, GR32];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REG64P[] = {
+ SystemZ::R0P, SystemZ::R2P, SystemZ::R4P, SystemZ::R10P,
+ SystemZ::R8P, SystemZ::R6P };
+ static const unsigned SystemZ_REG64P_nofp[] = {
+ SystemZ::R0P, SystemZ::R2P, SystemZ::R4P, /* NO R10P */
+ SystemZ::R8P, SystemZ::R6P };
+ GR64PClass::iterator
+ GR64PClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG64P_nofp;
+ else
+ return SystemZ_REG64P;
+ }
+ GR64PClass::iterator
+ GR64PClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG64P_nofp + (sizeof(SystemZ_REG64P_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_REG64P + (sizeof(SystemZ_REG64P) / sizeof(unsigned));
+ }
+ }];
+}
+
+def GR128 : RegisterClass<"SystemZ", [v2i64], 128,
+ [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q]>
+{
+ let SubRegClassList = [GR32, GR32, GR64, GR64];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REG128[] = {
+ SystemZ::R0Q, SystemZ::R2Q, SystemZ::R4Q, SystemZ::R10Q,
+ SystemZ::R8Q, SystemZ::R6Q };
+ static const unsigned SystemZ_REG128_nofp[] = {
+ SystemZ::R0Q, SystemZ::R2Q, SystemZ::R4Q, /* NO R10Q */
+ SystemZ::R8Q, SystemZ::R6Q };
+ GR128Class::iterator
+ GR128Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG128_nofp;
+ else
+ return SystemZ_REG128;
+ }
+ GR128Class::iterator
+ GR128Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return SystemZ_REG128_nofp + (sizeof(SystemZ_REG128_nofp) / sizeof(unsigned));
+ else
+ return SystemZ_REG128 + (sizeof(SystemZ_REG128) / sizeof(unsigned));
+ }
+ }];
+}
+
+def FP32 : RegisterClass<"SystemZ", [f32], 32,
+ [F0S, F1S, F2S, F3S, F4S, F5S, F6S, F7S,
+ F8S, F9S, F10S, F11S, F12S, F13S, F14S, F15S]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REGFP32[] = {
+ SystemZ::F0S, SystemZ::F2S, SystemZ::F4S, SystemZ::F6S,
+ SystemZ::F1S, SystemZ::F3S, SystemZ::F5S, SystemZ::F7S,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S };
+ FP32Class::iterator
+ FP32Class::allocation_order_begin(const MachineFunction &MF) const {
+ return SystemZ_REGFP32;
+ }
+ FP32Class::iterator
+ FP32Class::allocation_order_end(const MachineFunction &MF) const {
+ return SystemZ_REGFP32 + (sizeof(SystemZ_REGFP32) / sizeof(unsigned));
+ }
+ }];
+}
+
+def FP64 : RegisterClass<"SystemZ", [f64], 64,
+ [F0L, F1L, F2L, F3L, F4L, F5L, F6L, F7L,
+ F8L, F9L, F10L, F11L, F12L, F13L, F14L, F15L]> {
+ let SubRegClassList = [FP32];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned SystemZ_REGFP64[] = {
+ SystemZ::F0L, SystemZ::F2L, SystemZ::F4L, SystemZ::F6L,
+ SystemZ::F1L, SystemZ::F3L, SystemZ::F5L, SystemZ::F7L,
+ SystemZ::F8L, SystemZ::F9L, SystemZ::F10L, SystemZ::F11L,
+ SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L };
+ FP64Class::iterator
+ FP64Class::allocation_order_begin(const MachineFunction &MF) const {
+ return SystemZ_REGFP64;
+ }
+ FP64Class::iterator
+ FP64Class::allocation_order_end(const MachineFunction &MF) const {
+ return SystemZ_REGFP64 + (sizeof(SystemZ_REGFP64) / sizeof(unsigned));
+ }
+ }];
+}
+
+// Status flags register.
+def CCR : RegisterClass<"SystemZ", [i64], 64, [PSW]> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 0000000..a8b5e1f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,47 @@
+//===- SystemZSubtarget.cpp - SystemZ Subtarget Information -------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZ specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "SystemZ.h"
+#include "SystemZGenSubtarget.inc"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+SystemZSubtarget::SystemZSubtarget(const std::string &TT,
+ const std::string &FS):
+ HasZ10Insts(false) {
+ std::string CPU = "z9";
+
+ // Parse features string.
+ ParseSubtargetFeatures(FS, CPU);
+}
+
+/// True if accessing the GV requires an extra load.
+bool SystemZSubtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const {
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+    // An extra load is needed for all externally visible globals.
+ if (isDirectCall)
+ return false;
+
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return false;
+
+ return true;
+ }
+
+ return false;
+}
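In other words: in the static relocation model no extra load is ever needed; under PIC, direct calls and local or hidden globals are also resolved directly, and only a preemptible, externally visible global must have its address loaded first (typically from the GOT).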
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 0000000..405d6e9
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,45 @@
+//==-- SystemZSubtarget.h - Define Subtarget for the SystemZ ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_SUBTARGET_H
+#define LLVM_TARGET_SystemZ_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class GlobalValue;
+class TargetMachine;
+
+class SystemZSubtarget : public TargetSubtarget {
+ bool HasZ10Insts;
+public:
+  /// This constructor initializes the data members to match those
+  /// of the specified triple.
+ ///
+ SystemZSubtarget(const std::string &TT, const std::string &FS);
+
+  /// ParseSubtargetFeatures - Parses the features string, setting the
+  /// specified subtarget options. The definition of this function is
+  /// auto-generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ bool isZ10() const { return HasZ10Insts; }
+
+ bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+};
+} // End llvm namespace
+
+#endif // LLVM_TARGET_SystemZ_SUBTARGET_H
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 0000000..990e003
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,44 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZ.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+ // Register the target.
+ RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
+ RegisterAsmInfo<SystemZMCAsmInfo> Y(TheSystemZTarget);
+}
+
+/// SystemZTargetMachine ctor - Create an LP64 architecture model.
+///
+SystemZTargetMachine::SystemZTargetMachine(const Target &T,
+ const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS),
+ DataLayout("E-p:64:64:64-i8:8:16-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+ "-f64:64:64-f128:128:128-a0:16:16"),
+ InstrInfo(*this), TLInfo(*this),
+ FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -160) {
+
+ if (getRelocationModel() == Reloc::Default)
+ setRelocationModel(Reloc::Static);
+}
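Decoding the layout string: 'E' selects big-endian; 'p:64:64:64' gives 64-bit pointers with 64-bit ABI and preferred alignment; the integer and float entries pin each type to its natural alignment, except that i8 prefers 16-bit alignment ('i8:8:16'); and 'a0:16:16' gives aggregates a 16-bit default alignment.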
+
+bool SystemZTargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createSystemZISelDag(*this, OptLevel));
+ return false;
+}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 0000000..551aeb5
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,61 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
+#define LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
+
+#include "SystemZInstrInfo.h"
+#include "SystemZISelLowering.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// SystemZTargetMachine
+///
+class SystemZTargetMachine : public LLVMTargetMachine {
+ SystemZSubtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ SystemZInstrInfo InstrInfo;
+ SystemZTargetLowering TLInfo;
+
+  // SystemZ does not need any special call frame handling, so there is no
+  // SystemZ-specific FrameInfo class.
+ TargetFrameInfo FrameInfo;
+public:
+ SystemZTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
+
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+ virtual const SystemZRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ virtual SystemZTargetLowering *getTargetLowering() const {
+ return const_cast<SystemZTargetLowering*>(&TLInfo);
+ }
+
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+}; // SystemZTargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..743d8d3
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZInfo
+ SystemZTargetInfo.cpp
+ )
+
+add_dependencies(LLVMSystemZInfo SystemZCodeGenTable_gen)
diff --git a/lib/Target/SystemZ/TargetInfo/Makefile b/lib/Target/SystemZ/TargetInfo/Makefile
new file mode 100644
index 0000000..0be80eb
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/SystemZ/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 0000000..8272b11
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- SystemZTargetInfo.cpp - SystemZ Target Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSystemZTarget;
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+ RegisterTarget<Triple::systemz> X(TheSystemZTarget, "systemz", "SystemZ");
+}
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index ed544b7..cc6be9f 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -41,7 +41,7 @@ unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
}
LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
- return wrap(unwrap(TD)->getIntPtrType());
+ return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
}
unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index 7b843df..5bcd658 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/System/Mutex.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
@@ -155,13 +156,13 @@ const TargetAlignElem TargetData::InvalidAlignmentElem =
<br><br>
<i>@verbatim<type><size>:<abi_align>:<pref_align>@endverbatim</i>: Numeric type
alignment. Type is
- one of <i>i|f|v|a</i>, corresponding to integer, floating point, vector (aka
- packed) or aggregate. Size indicates the size, e.g., 32 or 64 bits.
+ one of <i>i|f|v|a</i>, corresponding to integer, floating point, vector, or
+ aggregate. Size indicates the size, e.g., 32 or 64 bits.
\p
- The default string, fully specified is:
+ The default string, fully specified, is:
<br><br>
- "E-p:64:64:64-a0:0:0-f32:32:32-f64:0:64"
- "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:0:64"
+ "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64"
"-v64:64:64-v128:128:128"
<br><br>
Note that in the case of aggregates, 0 is the default ABI and preferred
@@ -171,6 +172,7 @@ const TargetAlignElem TargetData::InvalidAlignmentElem =
void TargetData::init(const std::string &TargetDescription) {
std::string temp = TargetDescription;
+ LayoutMap = 0;
LittleEndian = false;
PointerMemSize = 8;
PointerABIAlign = 8;
@@ -184,9 +186,9 @@ void TargetData::init(const std::string &TargetDescription) {
setAlignment(INTEGER_ALIGN, 4, 8, 64); // i64
setAlignment(FLOAT_ALIGN, 4, 4, 32); // float
setAlignment(FLOAT_ALIGN, 8, 8, 64); // double
- setAlignment(VECTOR_ALIGN, 8, 8, 64); // v2i32
+ setAlignment(VECTOR_ALIGN, 8, 8, 64); // v2i32, v1i64, ...
setAlignment(VECTOR_ALIGN, 16, 16, 128); // v16i8, v8i16, v4i32, ...
- setAlignment(AGGREGATE_ALIGN, 0, 8, 0); // struct, union, class, ...
+ setAlignment(AGGREGATE_ALIGN, 0, 8, 0); // struct
while (!temp.empty()) {
std::string token = getToken(temp, "-");
@@ -316,61 +318,30 @@ unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
: Alignments[BestMatchIdx].PrefAlign;
}
-namespace {
-
-/// LayoutInfo - The lazy cache of structure layout information maintained by
-/// TargetData. Note that the struct types must have been free'd before
-/// llvm_shutdown is called (and thus this is deallocated) because all the
-/// targets with cached elements should have been destroyed.
-///
-typedef std::pair<const TargetData*,const StructType*> LayoutKey;
-
-struct DenseMapLayoutKeyInfo {
- static inline LayoutKey getEmptyKey() { return LayoutKey(0, 0); }
- static inline LayoutKey getTombstoneKey() {
- return LayoutKey((TargetData*)(intptr_t)-1, 0);
- }
- static unsigned getHashValue(const LayoutKey &Val) {
- return DenseMapInfo<void*>::getHashValue(Val.first) ^
- DenseMapInfo<void*>::getHashValue(Val.second);
- }
- static bool isEqual(const LayoutKey &LHS, const LayoutKey &RHS) {
- return LHS == RHS;
- }
-
- static bool isPod() { return true; }
-};
-
-typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy;
-
-}
-
-static ManagedStatic<LayoutInfoTy> LayoutInfo;
-static ManagedStatic<sys::SmartMutex<true> > LayoutLock;
typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
TargetData::~TargetData() {
- if (!LayoutInfo.isConstructed())
+ if (!LayoutMap)
return;
- sys::SmartScopedLock<true> Lock(&*LayoutLock);
// Remove any layouts for this TD.
- LayoutInfoTy &TheMap = *LayoutInfo;
+ LayoutInfoTy &TheMap = *static_cast<LayoutInfoTy*>(LayoutMap);
for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end(); I != E; ) {
- if (I->first.first == this) {
- I->second->~StructLayout();
- free(I->second);
- TheMap.erase(I++);
- } else {
- ++I;
- }
+ I->second->~StructLayout();
+ free(I->second);
+ TheMap.erase(I++);
}
+
+ delete static_cast<LayoutInfoTy*>(LayoutMap);
}
const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
- LayoutInfoTy &TheMap = *LayoutInfo;
+ if (!LayoutMap)
+ LayoutMap = static_cast<void*>(new LayoutInfoTy());
+
+ LayoutInfoTy &TheMap = *static_cast<LayoutInfoTy*>(LayoutMap);
- sys::SmartScopedLock<true> Lock(&*LayoutLock);
- StructLayout *&SL = TheMap[LayoutKey(this, Ty)];
+ StructLayout *&SL = TheMap[Ty];
if (SL) return SL;
// Otherwise, create the struct layout. Because it is variable length, we
@@ -392,10 +363,10 @@ const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
/// removed, this method must be called whenever a StructType is removed to
/// avoid a dangling pointer in this cache.
void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
- if (!LayoutInfo.isConstructed()) return; // No cache.
+ if (!LayoutMap) return; // No cache.
- sys::SmartScopedLock<true> Lock(&*LayoutLock);
- LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty));
+ LayoutInfoTy* LayoutInfo = static_cast<LayoutInfoTy*>(LayoutMap);
+ LayoutInfoTy::iterator I = LayoutInfo->find(Ty);
if (I == LayoutInfo->end()) return;
I->second->~StructLayout();
@@ -453,7 +424,7 @@ uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
case Type::VectorTyID:
return cast<VectorType>(Ty)->getBitWidth();
default:
- assert(0 && "TargetData::getTypeSizeInBits(): Unsupported type");
+ llvm_unreachable("TargetData::getTypeSizeInBits(): Unsupported type");
break;
}
return 0;
@@ -508,7 +479,7 @@ unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
AlignType = VECTOR_ALIGN;
break;
default:
- assert(0 && "Bad type for getAlignment!!!");
+ llvm_unreachable("Bad type for getAlignment!!!");
break;
}
@@ -540,8 +511,8 @@ unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
/// getIntPtrType - Return an unsigned integer type that is the same size or
/// greater to the host pointer size.
-const IntegerType *TargetData::getIntPtrType() const {
- return IntegerType::get(getPointerSizeInBits());
+const IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
+ return IntegerType::get(C, getPointerSizeInBits());
}
@@ -555,7 +526,8 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
if (const StructType *STy = dyn_cast<StructType>(*TI)) {
- assert(Indices[CurIDX]->getType() == Type::Int32Ty &&
+ assert(Indices[CurIDX]->getType() ==
+ Type::getInt32Ty(ptrTy->getContext()) &&
"Illegal struct idx");
unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
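
(For reference: the net effect of the TargetData hunks above is to replace a process-wide, lock-guarded layout cache keyed on a (TargetData*, StructType*) pair with a per-instance map that is allocated lazily and torn down in the destructor. A minimal standalone sketch of that pattern, with std::map standing in for LLVM's DenseMap and a placeholder layout computation:

    // layout_cache_sketch.cpp -- per-instance lazy cache, not the LLVM code.
    #include <cstdio>
    #include <map>

    struct StructLayout { unsigned Size; };
    struct StructType {};

    class TargetData {
      // Each TargetData owns its cache, so no global lock or composite key
      // is needed, and the destructor can free every entry unconditionally.
      mutable std::map<const StructType *, StructLayout *> *LayoutMap = nullptr;

    public:
      const StructLayout *getStructLayout(const StructType *Ty) const {
        if (!LayoutMap)                     // Allocate the cache lazily.
          LayoutMap = new std::map<const StructType *, StructLayout *>();
        StructLayout *&SL = (*LayoutMap)[Ty];
        if (!SL)
          SL = new StructLayout{8};         // Placeholder layout computation.
        return SL;
      }

      ~TargetData() {
        if (!LayoutMap)
          return;
        for (auto &Entry : *LayoutMap)      // Every entry belongs to this TD.
          delete Entry.second;
        delete LayoutMap;
      }
    };

    int main() {
      TargetData TD;
      StructType Ty;
      std::printf("size = %u\n", TD.getStructLayout(&Ty)->Size);
    }

The same query on the same instance now hits the cached entry without any synchronization.)
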
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
index ceaea0c..094a57e 100644
--- a/lib/Target/TargetInstrInfo.cpp
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -12,11 +12,29 @@
//===----------------------------------------------------------------------===//
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Constant.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
+//===----------------------------------------------------------------------===//
+// TargetOperandInfo
+//===----------------------------------------------------------------------===//
+
+/// getRegClass - Get the register class for the operand, handling resolution
+/// of "symbolic" pointer register classes etc. If this is not a register
+/// operand, this returns null.
+const TargetRegisterClass *
+TargetOperandInfo::getRegClass(const TargetRegisterInfo *TRI) const {
+ if (isLookupPtrRegClass())
+ return TRI->getPointerRegClass(RegClass);
+ return TRI->getRegClass(RegClass);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo
+//===----------------------------------------------------------------------===//
+
TargetInstrInfo::TargetInstrInfo(const TargetInstrDesc* Desc,
unsigned numOpcodes)
: Descriptors(Desc), NumOpcodes(numOpcodes) {
@@ -25,6 +43,14 @@ TargetInstrInfo::TargetInstrInfo(const TargetInstrDesc* Desc,
TargetInstrInfo::~TargetInstrInfo() {
}
+/// insertNoop - Insert a noop into the instruction stream at the specified
+/// point.
+void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ llvm_unreachable("Target didn't implement insertNoop!");
+}
+
+
bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
const TargetInstrDesc &TID = MI->getDesc();
if (!TID.isTerminator()) return false;
@@ -37,14 +63,33 @@ bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
return !isPredicated(MI);
}
-/// getInstrOperandRegClass - Return register class of the operand of an
-/// instruction of the specified TargetInstrDesc.
-const TargetRegisterClass*
-llvm::getInstrOperandRegClass(const TargetRegisterInfo *TRI,
- const TargetInstrDesc &II, unsigned Op) {
- if (Op >= II.getNumOperands())
- return NULL;
- if (II.OpInfo[Op].isLookupPtrRegClass())
- return TRI->getPointerRegClass();
- return TRI->getRegClass(II.OpInfo[Op].RegClass);
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorChar or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorChar or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overridden by the target to handle them.
+unsigned TargetInstrInfo::getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const {
+
+ // Count the number of instructions in the asm.
+ bool atInsnStart = true;
+ unsigned Length = 0;
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || *Str == MAI.getSeparatorChar())
+ atInsnStart = true;
+ // Check for a comment first; text that starts a comment does not
+ // count as an instruction.
+ if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+ strlen(MAI.getCommentString())) == 0)
+ atInsnStart = false;
+ if (atInsnStart && !isspace(*Str)) {
+ Length += MAI.getMaxInstLength();
+ atInsnStart = false;
+ }
+ }
+
+ return Length;
}
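
(A standalone model of the counting loop above, assuming ';' as the separator character, "#" as the comment string, and a fixed 4-byte maximum instruction length; in the real code all three come from MCAsmInfo:

    // inline_asm_length_sketch.cpp -- approximate an inline asm's size as
    // (number of instructions) * (worst-case instruction length).
    #include <cctype>
    #include <cstdio>
    #include <cstring>

    static unsigned approxAsmLength(const char *Str, char Sep,
                                    const char *Comment,
                                    unsigned MaxInstLength) {
      bool atInsnStart = true;
      unsigned Length = 0;
      for (; *Str; ++Str) {
        if (*Str == '\n' || *Str == Sep)
          atInsnStart = true;
        if (atInsnStart &&
            std::strncmp(Str, Comment, std::strlen(Comment)) == 0)
          atInsnStart = false;              // A comment is not an instruction.
        if (atInsnStart && !std::isspace((unsigned char)*Str)) {
          Length += MaxInstLength;          // Count one worst-case instruction.
          atInsnStart = false;
        }
      }
      return Length;
    }

    int main() {
      // Two instructions plus one comment line: expect 2 * 4 = 8.
      std::printf("%u\n", approxAsmLength("mov r0, r1\n# note\nadd r0, r0",
                                          ';', "#", 4));
    }

)
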
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 0000000..c1aab99
--- /dev/null
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,1089 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Code
+//===----------------------------------------------------------------------===//
+
+TargetLoweringObjectFile::TargetLoweringObjectFile() : Ctx(0) {
+ TextSection = 0;
+ DataSection = 0;
+ BSSSection = 0;
+ ReadOnlySection = 0;
+ StaticCtorSection = 0;
+ StaticDtorSection = 0;
+ LSDASection = 0;
+ EHFrameSection = 0;
+
+ DwarfAbbrevSection = 0;
+ DwarfInfoSection = 0;
+ DwarfLineSection = 0;
+ DwarfFrameSection = 0;
+ DwarfPubNamesSection = 0;
+ DwarfPubTypesSection = 0;
+ DwarfDebugInlineSection = 0;
+ DwarfStrSection = 0;
+ DwarfLocSection = 0;
+ DwarfARangesSection = 0;
+ DwarfRangesSection = 0;
+ DwarfMacroInfoSection = 0;
+}
+
+TargetLoweringObjectFile::~TargetLoweringObjectFile() {
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV) {
+ Constant *C = GV->getInitializer();
+
+ // Must have zero initializer.
+ if (!C->isNullValue())
+ return false;
+
+ // Leave constant zeros in readonly constant sections, so they can be shared.
+ if (GV->isConstant())
+ return false;
+
+ // If the global has an explicit section specified, don't put it in BSS.
+ if (!GV->getSection().empty())
+ return false;
+
+ // If -nozero-initialized-in-bss is specified, don't ever use BSS.
+ if (NoZerosInBSS)
+ return false;
+
+ // Otherwise, put it in BSS!
+ return true;
+}
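
(At the source level, the predicate above sorts globals roughly as in this sketch; the exact section placement is target-dependent and also gated by flags such as -nozero-initialized-in-bss:

    // bss_examples.cpp -- expected classification per the rules above.
    int zeroed;               // Zero-initialized, mutable: BSS candidate.
    int initialized = 42;     // Non-zero initializer: data, not BSS.
    const int zero_const = 0; // Constant zero: readonly section, shareable.

    int main() { return zeroed + initialized + zero_const; }

)
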
+
+/// IsNullTerminatedString - Return true if the specified constant (which is
+/// known to have a type that is an array of 1/2/4 byte elements) ends with a
+/// nul value and contains no other nuls in it.
+static bool IsNullTerminatedString(const Constant *C) {
+ const ArrayType *ATy = cast<ArrayType>(C->getType());
+
+ // First check: see if we have a constant array terminated with a zero.
+ if (const ConstantArray *CVA = dyn_cast<ConstantArray>(C)) {
+ if (ATy->getNumElements() == 0) return false;
+
+ ConstantInt *Null =
+ dyn_cast<ConstantInt>(CVA->getOperand(ATy->getNumElements()-1));
+ if (Null == 0 || Null->getZExtValue() != 0)
+ return false; // Not null terminated.
+
+ // Verify that the null doesn't occur anywhere else in the string.
+ for (unsigned i = 0, e = ATy->getNumElements()-1; i != e; ++i)
+ // Reject constantexpr elements etc.
+ if (!isa<ConstantInt>(CVA->getOperand(i)) ||
+ CVA->getOperand(i) == Null)
+ return false;
+ return true;
+ }
+
+ // Another possibility: [1 x i8] zeroinitializer
+ if (isa<ConstantAggregateZero>(C))
+ return ATy->getNumElements() == 1;
+
+ return false;
+}
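
(The same test, reduced to a plain byte array: exactly one nul, and it must be the last element. A small self-contained sketch:

    // nul_terminated_sketch.cpp
    #include <cstdio>
    #include <vector>

    static bool isNullTerminatedString(const std::vector<unsigned char> &Elts) {
      if (Elts.empty() || Elts.back() != 0)
        return false;                   // Not nul terminated.
      for (size_t i = 0, e = Elts.size() - 1; i != e; ++i)
        if (Elts[i] == 0)
          return false;                 // Interior nul: reject.
      return true;
    }

    int main() {
      std::printf("%d\n", isNullTerminatedString({'h', 'i', 0}));    // 1
      std::printf("%d\n", isNullTerminatedString({'h', 0, 'i', 0})); // 0
    }

)
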
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global variable. Given a global variable and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler. The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
+ const TargetMachine &TM){
+ assert(!GV->isDeclaration() && !GV->hasAvailableExternallyLinkage() &&
+ "Can only be used for global definitions");
+
+ Reloc::Model ReloModel = TM.getRelocationModel();
+
+ // Early exit - functions should be always in text sections.
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (GVar == 0)
+ return SectionKind::getText();
+
+ // Handle thread-local data first.
+ if (GVar->isThreadLocal()) {
+ if (isSuitableForBSS(GVar))
+ return SectionKind::getThreadBSS();
+ return SectionKind::getThreadData();
+ }
+
+ // Variable can be easily put to BSS section.
+ if (isSuitableForBSS(GVar))
+ return SectionKind::getBSS();
+
+ Constant *C = GVar->getInitializer();
+
+ // If the global is marked constant, we can put it into a mergeable section,
+ // a mergeable string section, or general .data if it contains relocations.
+ if (GVar->isConstant()) {
+ // If the initializer for the global contains something that requires a
+ // relocation, then we may have to drop this into a writable data section
+ // even though it is marked const.
+ switch (C->getRelocationInfo()) {
+ default: llvm_unreachable("unknown relocation info kind");
+ case Constant::NoRelocation:
+ // If initializer is a null-terminated string, put it in a "cstring"
+ // section of the right width.
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+ if (const IntegerType *ITy =
+ dyn_cast<IntegerType>(ATy->getElementType())) {
+ if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+ ITy->getBitWidth() == 32) &&
+ IsNullTerminatedString(C)) {
+ if (ITy->getBitWidth() == 8)
+ return SectionKind::getMergeable1ByteCString();
+ if (ITy->getBitWidth() == 16)
+ return SectionKind::getMergeable2ByteCString();
+
+ assert(ITy->getBitWidth() == 32 && "Unknown width");
+ return SectionKind::getMergeable4ByteCString();
+ }
+ }
+ }
+
+ // Otherwise, just drop it into a mergeable constant section. If we have
+ // a section for this size, use it, otherwise use the arbitrary sized
+ // mergeable section.
+ switch (TM.getTargetData()->getTypeAllocSize(C->getType())) {
+ case 4: return SectionKind::getMergeableConst4();
+ case 8: return SectionKind::getMergeableConst8();
+ case 16: return SectionKind::getMergeableConst16();
+ default: return SectionKind::getMergeableConst();
+ }
+
+ case Constant::LocalRelocation:
+ // In the static relocation model, the linker will resolve all addresses, so
+ // the relocation entries will actually be constants by the time the app
+ // starts up. However, we can't put this into a mergeable section, because
+ // the linker doesn't take relocations into consideration when it tries to
+ // merge entries in the section.
+ if (ReloModel == Reloc::Static)
+ return SectionKind::getReadOnly();
+
+ // Otherwise, the dynamic linker needs to fix it up, put it in the
+ // writable data.rel.local section.
+ return SectionKind::getReadOnlyWithRelLocal();
+
+ case Constant::GlobalRelocations:
+ // In the static relocation model, the linker will resolve all addresses, so
+ // the relocation entries will actually be constants by the time the app
+ // starts up. However, we can't put this into a mergeable section, because
+ // the linker doesn't take relocations into consideration when it tries to
+ // merge entries in the section.
+ if (ReloModel == Reloc::Static)
+ return SectionKind::getReadOnly();
+
+ // Otherwise, the dynamic linker needs to fix it up, put it in the
+ // writable data.rel section.
+ return SectionKind::getReadOnlyWithRel();
+ }
+ }
+
+ // Okay, this isn't a constant. If the initializer for the global is going
+ // to require a runtime relocation by the dynamic linker, put it into a more
+ // specific section to improve startup time of the app. This coalesces these
+ // globals together onto fewer pages, improving the locality of the dynamic
+ // linker.
+ if (ReloModel == Reloc::Static)
+ return SectionKind::getDataNoRel();
+
+ switch (C->getRelocationInfo()) {
+ default: llvm_unreachable("unknown relocation info kind");
+ case Constant::NoRelocation:
+ return SectionKind::getDataNoRel();
+ case Constant::LocalRelocation:
+ return SectionKind::getDataRelLocal();
+ case Constant::GlobalRelocations:
+ return SectionKind::getDataRel();
+ }
+}
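
(The constant-global half of the classifier boils down to a three-way table on relocation info and relocation model. A paraphrased sketch; the string labels name the ELF-style section flavors the kinds map to later and are illustrative, not API:

    // kind_table_sketch.cpp -- the decision order above, reduced to a table.
    #include <cstdio>

    enum Reloc { None, Local, Global };

    static const char *classifyConstant(Reloc R, bool StaticModel) {
      switch (R) {
      case None:   return "mergeable const / cstring";
      case Local:  return StaticModel ? "readonly" : "data.rel.ro.local";
      case Global: return StaticModel ? "readonly" : "data.rel.ro";
      }
      return "unreachable";
    }

    int main() {
      std::printf("%s\n", classifyConstant(Local, false)); // data.rel.ro.local
      std::printf("%s\n", classifyConstant(Global, true)); // readonly
    }

)
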
+
+/// SectionForGlobal - This method computes the appropriate section to emit
+/// the specified global variable or function definition. This should not
+/// be passed external (or available externally) globals.
+const MCSection *TargetLoweringObjectFile::
+SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang,
+ const TargetMachine &TM) const {
+ // Select section name.
+ if (GV->hasSection())
+ return getExplicitSectionGlobal(GV, Kind, Mang, TM);
+
+ // Otherwise, use a default section based on the 'type' of the global.
+ return SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+
+// Lame default implementation. Calculate the section for the global.
+const MCSection *
+TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind,
+ Mangler *Mang,
+ const TargetMachine &TM) const{
+ assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+ if (Kind.isText())
+ return getTextSection();
+
+ if (Kind.isBSS() && BSSSection != 0)
+ return BSSSection;
+
+ if (Kind.isReadOnly() && ReadOnlySection != 0)
+ return ReadOnlySection;
+
+ return getDataSection();
+}
+
+/// getSectionForConstant - Given a mergeable constant with the
+/// specified size and relocation information, return a section that it
+/// should be placed in.
+const MCSection *
+TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
+ if (Kind.isReadOnly() && ReadOnlySection != 0)
+ return ReadOnlySection;
+
+ return DataSection;
+}
+
+/// getSymbolForDwarfGlobalReference - Return an MCExpr to use for a
+/// pc-relative reference to the specified global variable from exception
+/// handling information. In addition to the symbol, this returns
+/// by-reference:
+///
+/// IsIndirect - True if the returned symbol is actually a stub that contains
+/// the address of the symbol, false if the symbol is the global itself.
+///
+/// IsPCRel - True if the symbol reference is already pc-relative, false if
+/// the caller needs to subtract off the address of the reference from the
+/// symbol.
+///
+const MCExpr *TargetLoweringObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const {
+ // The generic implementation of this just returns a direct reference to the
+ // symbol.
+ IsIndirect = false;
+ IsPCRel = false;
+
+ SmallString<128> Name;
+ Mang->getNameWithPrefix(Name, GV, false);
+ return MCSymbolRefExpr::Create(Name.str(), getContext());
+}
+
+
+//===----------------------------------------------------------------------===//
+// ELF
+//===----------------------------------------------------------------------===//
+typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
+
+TargetLoweringObjectFileELF::~TargetLoweringObjectFileELF() {
+ // If we have the section uniquing map, free it.
+ delete (ELFUniqueMapTy*)UniquingMap;
+}
+
+const MCSection *TargetLoweringObjectFileELF::
+getELFSection(StringRef Section, unsigned Type, unsigned Flags,
+ SectionKind Kind, bool IsExplicit) const {
+ if (UniquingMap == 0)
+ UniquingMap = new ELFUniqueMapTy();
+ ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)UniquingMap;
+
+ // Do the lookup, if we have a hit, return it.
+ const MCSectionELF *&Entry = Map[Section];
+ if (Entry) return Entry;
+
+ return Entry = MCSectionELF::Create(Section, Type, Flags, Kind, IsExplicit,
+ getContext());
+}
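
(The uniquing idiom here -- a lazily allocated name-to-section map whose operator[] slot is filled on first request -- looks like this in isolation, with std::unordered_map standing in for LLVM's StringMap:

    // section_uniquing_sketch.cpp
    #include <cstdio>
    #include <string>
    #include <unordered_map>

    struct Section { std::string Name; };

    class SectionTable {
      mutable std::unordered_map<std::string, const Section *> *Map = nullptr;

    public:
      const Section *get(const std::string &Name) const {
        if (!Map)
          Map = new std::unordered_map<std::string, const Section *>();
        const Section *&Entry = (*Map)[Name]; // One slot per unique name.
        if (!Entry)
          Entry = new Section{Name};          // Create on first request;
        return Entry;                         // same pointer ever after.
      }
    };

    int main() {
      SectionTable T;
      std::printf("%d\n", T.get(".text") == T.get(".text")); // 1: uniqued
    }

As in the real code, the sections deliberately live for the lifetime of the table rather than being individually freed.)
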
+
+void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ if (UniquingMap != 0)
+ ((ELFUniqueMapTy*)UniquingMap)->clear();
+ TargetLoweringObjectFile::Initialize(Ctx, TM);
+
+ BSSSection =
+ getELFSection(".bss", MCSectionELF::SHT_NOBITS,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getBSS());
+
+ TextSection =
+ getELFSection(".text", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_EXECINSTR | MCSectionELF::SHF_ALLOC,
+ SectionKind::getText());
+
+ DataSection =
+ getELFSection(".data", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+
+ ReadOnlySection =
+ getELFSection(".rodata", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC,
+ SectionKind::getReadOnly());
+
+ TLSDataSection =
+ getELFSection(".tdata", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_TLS |
+ MCSectionELF::SHF_WRITE, SectionKind::getThreadData());
+
+ TLSBSSSection =
+ getELFSection(".tbss", MCSectionELF::SHT_NOBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_TLS |
+ MCSectionELF::SHF_WRITE, SectionKind::getThreadBSS());
+
+ DataRelSection =
+ getELFSection(".data.rel", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ DataRelLocalSection =
+ getELFSection(".data.rel.local", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getDataRelLocal());
+
+ DataRelROSection =
+ getELFSection(".data.rel.ro", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getReadOnlyWithRel());
+
+ DataRelROLocalSection =
+ getELFSection(".data.rel.ro.local", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getReadOnlyWithRelLocal());
+
+ MergeableConst4Section =
+ getELFSection(".rodata.cst4", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+ SectionKind::getMergeableConst4());
+
+ MergeableConst8Section =
+ getELFSection(".rodata.cst8", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+ SectionKind::getMergeableConst8());
+
+ MergeableConst16Section =
+ getELFSection(".rodata.cst16", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+ SectionKind::getMergeableConst16());
+
+ StaticCtorSection =
+ getELFSection(".ctors", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ StaticDtorSection =
+ getELFSection(".dtors", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ // Exception Handling Sections.
+
+ // FIXME: We're emitting LSDA info into a readonly section on ELF, even though
+ // it contains relocatable pointers. In PIC mode, this is probably a big
+ // runtime hit for C++ apps. Either the contents of the LSDA need to be
+ // adjusted or this should be a data section.
+ LSDASection =
+ getELFSection(".gcc_except_table", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC, SectionKind::getReadOnly());
+ EHFrameSection =
+ getELFSection(".eh_frame", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ // Debug Info Sections.
+ DwarfAbbrevSection =
+ getELFSection(".debug_abbrev", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfInfoSection =
+ getELFSection(".debug_info", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfLineSection =
+ getELFSection(".debug_line", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfFrameSection =
+ getELFSection(".debug_frame", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ getELFSection(".debug_pubnames", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ getELFSection(".debug_pubtypes", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfStrSection =
+ getELFSection(".debug_str", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfLocSection =
+ getELFSection(".debug_loc", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfARangesSection =
+ getELFSection(".debug_aranges", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfRangesSection =
+ getELFSection(".debug_ranges", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ getELFSection(".debug_macinfo", MCSectionELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+}
+
+
+static SectionKind
+getELFKindForNamedSection(const char *Name, SectionKind K) {
+ if (Name[0] != '.') return K;
+
+ // Some lame default implementation based on some magic section names.
+ if (strcmp(Name, ".bss") == 0 ||
+ strncmp(Name, ".bss.", 5) == 0 ||
+ strncmp(Name, ".gnu.linkonce.b.", 16) == 0 ||
+ strncmp(Name, ".llvm.linkonce.b.", 17) == 0 ||
+ strcmp(Name, ".sbss") == 0 ||
+ strncmp(Name, ".sbss.", 6) == 0 ||
+ strncmp(Name, ".gnu.linkonce.sb.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.sb.", 18) == 0)
+ return SectionKind::getBSS();
+
+ if (strcmp(Name, ".tdata") == 0 ||
+ strncmp(Name, ".tdata.", 7) == 0 ||
+ strncmp(Name, ".gnu.linkonce.td.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.td.", 18) == 0)
+ return SectionKind::getThreadData();
+
+ if (strcmp(Name, ".tbss") == 0 ||
+ strncmp(Name, ".tbss.", 6) == 0 ||
+ strncmp(Name, ".gnu.linkonce.tb.", 17) == 0 ||
+ strncmp(Name, ".llvm.linkonce.tb.", 18) == 0)
+ return SectionKind::getThreadBSS();
+
+ return K;
+}
+
+
+static unsigned
+getELFSectionType(const char *Name, SectionKind K) {
+
+ if (strcmp(Name, ".init_array") == 0)
+ return MCSectionELF::SHT_INIT_ARRAY;
+
+ if (strcmp(Name, ".fini_array") == 0)
+ return MCSectionELF::SHT_FINI_ARRAY;
+
+ if (strcmp(Name, ".preinit_array") == 0)
+ return MCSectionELF::SHT_PREINIT_ARRAY;
+
+ if (K.isBSS() || K.isThreadBSS())
+ return MCSectionELF::SHT_NOBITS;
+
+ return MCSectionELF::SHT_PROGBITS;
+}
+
+
+static unsigned
+getELFSectionFlags(SectionKind K) {
+ unsigned Flags = 0;
+
+ if (!K.isMetadata())
+ Flags |= MCSectionELF::SHF_ALLOC;
+
+ if (K.isText())
+ Flags |= MCSectionELF::SHF_EXECINSTR;
+
+ if (K.isWriteable())
+ Flags |= MCSectionELF::SHF_WRITE;
+
+ if (K.isThreadLocal())
+ Flags |= MCSectionELF::SHF_TLS;
+
+ // K.isMergeableConst() is left out to honour PR4650
+ if (K.isMergeableCString() || K.isMergeableConst4() ||
+ K.isMergeableConst8() || K.isMergeableConst16())
+ Flags |= MCSectionELF::SHF_MERGE;
+
+ if (K.isMergeableCString())
+ Flags |= MCSectionELF::SHF_STRINGS;
+
+ return Flags;
+}
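
(The flag computation is plain bitmask composition. A self-contained sketch with made-up flag values; the real values are MCSectionELF's SHF_* constants:

    // elf_flags_sketch.cpp
    #include <cstdio>

    enum : unsigned {
      SHF_ALLOC = 1u << 0, SHF_WRITE = 1u << 1,
      SHF_EXECINSTR = 1u << 2, SHF_TLS = 1u << 3,
    };

    struct Kind { bool Metadata, Text, Writeable, ThreadLocal; };

    static unsigned elfFlags(Kind K) {
      unsigned F = 0;
      if (!K.Metadata)   F |= SHF_ALLOC;   // Everything but metadata is loaded.
      if (K.Text)        F |= SHF_EXECINSTR;
      if (K.Writeable)   F |= SHF_WRITE;
      if (K.ThreadLocal) F |= SHF_TLS;
      return F;
    }

    int main() {
      // .tdata: allocated, writable, thread-local -> 0xb.
      std::printf("%#x\n", elfFlags({false, false, true, true}));
    }

)
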
+
+
+const MCSection *TargetLoweringObjectFileELF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ const char *SectionName = GV->getSection().c_str();
+
+ // Infer section flags from the section name if we can.
+ Kind = getELFKindForNamedSection(SectionName, Kind);
+
+ return getELFSection(SectionName,
+ getELFSectionType(SectionName, Kind),
+ getELFSectionFlags(Kind), Kind, true);
+}
+
+static const char *getSectionPrefixForUniqueGlobal(SectionKind Kind) {
+ if (Kind.isText()) return ".gnu.linkonce.t.";
+ if (Kind.isReadOnly()) return ".gnu.linkonce.r.";
+
+ if (Kind.isThreadData()) return ".gnu.linkonce.td.";
+ if (Kind.isThreadBSS()) return ".gnu.linkonce.tb.";
+
+ if (Kind.isBSS()) return ".gnu.linkonce.b.";
+ if (Kind.isDataNoRel()) return ".gnu.linkonce.d.";
+ if (Kind.isDataRelLocal()) return ".gnu.linkonce.d.rel.local.";
+ if (Kind.isDataRel()) return ".gnu.linkonce.d.rel.";
+ if (Kind.isReadOnlyWithRelLocal()) return ".gnu.linkonce.d.rel.ro.local.";
+
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return ".gnu.linkonce.d.rel.ro.";
+}
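
(The prefix is simply concatenated with the mangled symbol to form the uniqued section name; the mangled name below is hypothetical:

    // linkonce_name_sketch.cpp
    #include <cstdio>
    #include <string>

    int main() {
      std::string Prefix = ".gnu.linkonce.t."; // Text-kind prefix, per table.
      std::string Symbol = "_ZN3Foo3barEv";    // Hypothetical mangled name.
      std::string Section = Prefix + Symbol;
      std::printf("%s\n", Section.c_str());    // .gnu.linkonce.t._ZN3Foo3barEv
    }

)
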
+
+const MCSection *TargetLoweringObjectFileELF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+
+ // If this global is linkonce/weak and the target handles this by emitting it
+ // into a 'uniqued' section name, create and return the section now.
+ if (GV->isWeakForLinker()) {
+ const char *Prefix = getSectionPrefixForUniqueGlobal(Kind);
+ std::string Name = Mang->makeNameProper(GV->getNameStr());
+
+ return getELFSection((Prefix+Name).c_str(),
+ getELFSectionType((Prefix+Name).c_str(), Kind),
+ getELFSectionFlags(Kind),
+ Kind);
+ }
+
+ if (Kind.isText()) return TextSection;
+
+ if (Kind.isMergeable1ByteCString() ||
+ Kind.isMergeable2ByteCString() ||
+ Kind.isMergeable4ByteCString()) {
+
+ // We also need alignment here.
+ // FIXME: this is getting the alignment of the character, not the
+ // alignment of the global!
+ unsigned Align =
+ TM.getTargetData()->getPreferredAlignment(cast<GlobalVariable>(GV));
+
+ const char *SizeSpec = ".rodata.str1.";
+ if (Kind.isMergeable2ByteCString())
+ SizeSpec = ".rodata.str2.";
+ else if (Kind.isMergeable4ByteCString())
+ SizeSpec = ".rodata.str4.";
+ else
+ assert(Kind.isMergeable1ByteCString() && "unknown string width");
+
+ std::string Name = SizeSpec + utostr(Align);
+ return getELFSection(Name.c_str(), MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC |
+ MCSectionELF::SHF_MERGE |
+ MCSectionELF::SHF_STRINGS,
+ Kind);
+ }
+
+ if (Kind.isMergeableConst()) {
+ if (Kind.isMergeableConst4() && MergeableConst4Section)
+ return MergeableConst4Section;
+ if (Kind.isMergeableConst8() && MergeableConst8Section)
+ return MergeableConst8Section;
+ if (Kind.isMergeableConst16() && MergeableConst16Section)
+ return MergeableConst16Section;
+ return ReadOnlySection; // .const
+ }
+
+ if (Kind.isReadOnly()) return ReadOnlySection;
+
+ if (Kind.isThreadData()) return TLSDataSection;
+ if (Kind.isThreadBSS()) return TLSBSSSection;
+
+ if (Kind.isBSS()) return BSSSection;
+
+ if (Kind.isDataNoRel()) return DataSection;
+ if (Kind.isDataRelLocal()) return DataRelLocalSection;
+ if (Kind.isDataRel()) return DataRelSection;
+ if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return DataRelROSection;
+}
+
+/// getSectionForConstant - Given a mergeable constant with the
+/// specified size and relocation information, return a section that it
+/// should be placed in.
+const MCSection *TargetLoweringObjectFileELF::
+getSectionForConstant(SectionKind Kind) const {
+ if (Kind.isMergeableConst4() && MergeableConst4Section)
+ return MergeableConst4Section;
+ if (Kind.isMergeableConst8() && MergeableConst8Section)
+ return MergeableConst8Section;
+ if (Kind.isMergeableConst16() && MergeableConst16Section)
+ return MergeableConst16Section;
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return DataRelROSection;
+}
+
+//===----------------------------------------------------------------------===//
+// MachO
+//===----------------------------------------------------------------------===//
+
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+
+TargetLoweringObjectFileMachO::~TargetLoweringObjectFileMachO() {
+ // If we have the MachO uniquing map, free it.
+ delete (MachOUniqueMapTy*)UniquingMap;
+}
+
+
+const MCSectionMachO *TargetLoweringObjectFileMachO::
+getMachOSection(const StringRef &Segment, const StringRef &Section,
+ unsigned TypeAndAttributes,
+ unsigned Reserved2, SectionKind Kind) const {
+ // We unique sections by their segment/section pair. The returned section
+ // may not have the same flags as the requested section; if so, the client
+ // should diagnose this as an error.
+
+ // Create the map if it doesn't already exist.
+ if (UniquingMap == 0)
+ UniquingMap = new MachOUniqueMapTy();
+ MachOUniqueMapTy &Map = *(MachOUniqueMapTy*)UniquingMap;
+
+ // Form the name to look up.
+ SmallString<64> Name;
+ Name += Segment;
+ Name.push_back(',');
+ Name += Section;
+
+ // Do the lookup, if we have a hit, return it.
+ const MCSectionMachO *&Entry = Map[Name.str()];
+ if (Entry) return Entry;
+
+ // Otherwise, return a new section.
+ return Entry = MCSectionMachO::Create(Segment, Section, TypeAndAttributes,
+ Reserved2, Kind, getContext());
+}
+
+
+void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ if (UniquingMap != 0)
+ ((MachOUniqueMapTy*)UniquingMap)->clear();
+ TargetLoweringObjectFile::Initialize(Ctx, TM);
+
+ TextSection // .text
+ = getMachOSection("__TEXT", "__text",
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ SectionKind::getText());
+ DataSection // .data
+ = getMachOSection("__DATA", "__data", 0, SectionKind::getDataRel());
+
+ CStringSection // .cstring
+ = getMachOSection("__TEXT", "__cstring", MCSectionMachO::S_CSTRING_LITERALS,
+ SectionKind::getMergeable1ByteCString());
+ UStringSection
+ = getMachOSection("__TEXT","__ustring", 0,
+ SectionKind::getMergeable2ByteCString());
+ FourByteConstantSection // .literal4
+ = getMachOSection("__TEXT", "__literal4", MCSectionMachO::S_4BYTE_LITERALS,
+ SectionKind::getMergeableConst4());
+ EightByteConstantSection // .literal8
+ = getMachOSection("__TEXT", "__literal8", MCSectionMachO::S_8BYTE_LITERALS,
+ SectionKind::getMergeableConst8());
+
+ // ld_classic doesn't support .literal16 in 32-bit mode, and ld64 falls back
+ // to using it in -static mode.
+ SixteenByteConstantSection = 0;
+ if (TM.getRelocationModel() != Reloc::Static &&
+ TM.getTargetData()->getPointerSize() == 32)
+ SixteenByteConstantSection = // .literal16
+ getMachOSection("__TEXT", "__literal16",MCSectionMachO::S_16BYTE_LITERALS,
+ SectionKind::getMergeableConst16());
+
+ ReadOnlySection // .const
+ = getMachOSection("__TEXT", "__const", 0, SectionKind::getReadOnly());
+
+ TextCoalSection
+ = getMachOSection("__TEXT", "__textcoal_nt",
+ MCSectionMachO::S_COALESCED |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ SectionKind::getText());
+ ConstTextCoalSection
+ = getMachOSection("__TEXT", "__const_coal", MCSectionMachO::S_COALESCED,
+ SectionKind::getText());
+ ConstDataCoalSection
+ = getMachOSection("__DATA","__const_coal", MCSectionMachO::S_COALESCED,
+ SectionKind::getText());
+ ConstDataSection // .const_data
+ = getMachOSection("__DATA", "__const", 0,
+ SectionKind::getReadOnlyWithRel());
+ DataCoalSection
+ = getMachOSection("__DATA","__datacoal_nt", MCSectionMachO::S_COALESCED,
+ SectionKind::getDataRel());
+
+
+ LazySymbolPointerSection
+ = getMachOSection("__DATA", "__la_symbol_ptr",
+ MCSectionMachO::S_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ NonLazySymbolPointerSection
+ = getMachOSection("__DATA", "__nl_symbol_ptr",
+ MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorSection
+ = getMachOSection("__TEXT", "__constructor", 0,SectionKind::getDataRel());
+ StaticDtorSection
+ = getMachOSection("__TEXT", "__destructor", 0, SectionKind::getDataRel());
+ } else {
+ StaticCtorSection
+ = getMachOSection("__DATA", "__mod_init_func",
+ MCSectionMachO::S_MOD_INIT_FUNC_POINTERS,
+ SectionKind::getDataRel());
+ StaticDtorSection
+ = getMachOSection("__DATA", "__mod_term_func",
+ MCSectionMachO::S_MOD_TERM_FUNC_POINTERS,
+ SectionKind::getDataRel());
+ }
+
+ // Exception Handling.
+ LSDASection = getMachOSection("__DATA", "__gcc_except_tab", 0,
+ SectionKind::getDataRel());
+ EHFrameSection =
+ getMachOSection("__TEXT", "__eh_frame",
+ MCSectionMachO::S_COALESCED |
+ MCSectionMachO::S_ATTR_NO_TOC |
+ MCSectionMachO::S_ATTR_STRIP_STATIC_SYMS |
+ MCSectionMachO::S_ATTR_LIVE_SUPPORT,
+ SectionKind::getReadOnly());
+
+ // Debug Information.
+ DwarfAbbrevSection =
+ getMachOSection("__DWARF", "__debug_abbrev", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfInfoSection =
+ getMachOSection("__DWARF", "__debug_info", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfLineSection =
+ getMachOSection("__DWARF", "__debug_line", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfFrameSection =
+ getMachOSection("__DWARF", "__debug_frame", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ getMachOSection("__DWARF", "__debug_pubnames", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ getMachOSection("__DWARF", "__debug_pubtypes", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfStrSection =
+ getMachOSection("__DWARF", "__debug_str", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfLocSection =
+ getMachOSection("__DWARF", "__debug_loc", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfARangesSection =
+ getMachOSection("__DWARF", "__debug_aranges", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfRangesSection =
+ getMachOSection("__DWARF", "__debug_ranges", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ getMachOSection("__DWARF", "__debug_macinfo", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfDebugInlineSection =
+ getMachOSection("__DWARF", "__debug_inlined", MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+}
+
+const MCSection *TargetLoweringObjectFileMachO::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ // Parse the section specifier and create it if valid.
+ StringRef Segment, Section;
+ unsigned TAA, StubSize;
+ std::string ErrorCode =
+ MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section,
+ TAA, StubSize);
+ if (!ErrorCode.empty()) {
+ // If invalid, report the error with llvm_report_error.
+ llvm_report_error("Global variable '" + GV->getNameStr() +
+ "' has an invalid section specifier '" + GV->getSection()+
+ "': " + ErrorCode + ".");
+ // Fall back to dropping it into the data section.
+ return DataSection;
+ }
+
+ // Get the section.
+ const MCSectionMachO *S =
+ getMachOSection(Segment, Section, TAA, StubSize, Kind);
+
+ // Okay, now that we got the section, verify that the TAA & StubSize agree.
+ // If the user declared multiple globals with different section flags, we need
+ // to reject it here.
+ if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) {
+ // If invalid, report the error with llvm_report_error.
+ llvm_report_error("Global variable '" + GV->getNameStr() +
+ "' section type or attributes does not match previous"
+ " section specifier");
+ }
+
+ return S;
+}
+
+const MCSection *TargetLoweringObjectFileMachO::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ assert(!Kind.isThreadLocal() && "Darwin doesn't support TLS");
+
+ if (Kind.isText())
+ return GV->isWeakForLinker() ? TextCoalSection : TextSection;
+
+ // If this is weak/linkonce, put this in a coalescable section, either in text
+ // or data depending on if it is writable.
+ if (GV->isWeakForLinker()) {
+ if (Kind.isReadOnly())
+ return ConstTextCoalSection;
+ return DataCoalSection;
+ }
+
+ // FIXME: Alignment check should be handled by section classifier.
+ if (Kind.isMergeable1ByteCString() ||
+ Kind.isMergeable2ByteCString()) {
+ if (TM.getTargetData()->getPreferredAlignment(
+ cast<GlobalVariable>(GV)) < 32) {
+ if (Kind.isMergeable1ByteCString())
+ return CStringSection;
+ assert(Kind.isMergeable2ByteCString());
+ return UStringSection;
+ }
+ }
+
+ if (Kind.isMergeableConst()) {
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+ return SixteenByteConstantSection;
+ }
+
+ // Otherwise, if it is readonly, but not something we can specially optimize,
+ // just drop it in .const.
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ // If this is marked const, put it into a const section. But if the dynamic
+ // linker needs to write to it, put it in the data segment.
+ if (Kind.isReadOnlyWithRel())
+ return ConstDataSection;
+
+ // Otherwise, just drop the variable in the normal data section.
+ return DataSection;
+}
+
+const MCSection *
+TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind) const {
+ // If this constant requires a relocation, we have to put it in the data
+ // segment, not in the text segment.
+ if (Kind.isDataRel())
+ return ConstDataSection;
+
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+ return SixteenByteConstantSection;
+ return ReadOnlySection; // .const
+}
+
+/// shouldEmitUsedDirectiveFor - This hook allows targets to selectively decide
+/// not to emit the UsedDirective for some symbols in llvm.used.
+// FIXME: REMOVE this (rdar://7071300)
+bool TargetLoweringObjectFileMachO::
+shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
+ // On Darwin, internally linked data beginning with "L" or "l" does not have
+ // the directive emitted (this occurs in ObjC metadata).
+ if (!GV) return false;
+
+ // Check whether the mangled name has the "Private" or "LinkerPrivate" prefix.
+ if (GV->hasLocalLinkage() && !isa<Function>(GV)) {
+ // FIXME: ObjC metadata is currently emitted as internal symbols that have
+ // \1L and \0l prefixes on them. Fix them to be Private/LinkerPrivate and
+ // this horrible hack can go away.
+ const std::string &Name = Mang->getMangledName(GV);
+ if (Name[0] == 'L' || Name[0] == 'l')
+ return false;
+ }
+
+ return true;
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+ IsIndirect = true;
+ IsPCRel = false;
+
+ SmallString<128> Name;
+ Mang->getNameWithPrefix(Name, GV, true);
+ Name += "$non_lazy_ptr";
+ return MCSymbolRefExpr::Create(Name.str(), getContext());
+}
+
+
+//===----------------------------------------------------------------------===//
+// COFF
+//===----------------------------------------------------------------------===//
+
+typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+
+TargetLoweringObjectFileCOFF::~TargetLoweringObjectFileCOFF() {
+ delete (COFFUniqueMapTy*)UniquingMap;
+}
+
+
+const MCSection *TargetLoweringObjectFileCOFF::
+getCOFFSection(const char *Name, bool isDirective, SectionKind Kind) const {
+ // Create the map if it doesn't already exist.
+ if (UniquingMap == 0)
+ UniquingMap = new COFFUniqueMapTy();
+ COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)UniquingMap;
+
+ // Do the lookup, if we have a hit, return it.
+ const MCSectionCOFF *&Entry = Map[Name];
+ if (Entry) return Entry;
+
+ return Entry = MCSectionCOFF::Create(Name, isDirective, Kind, getContext());
+}
+
+void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ if (UniquingMap != 0)
+ ((COFFUniqueMapTy*)UniquingMap)->clear();
+ TargetLoweringObjectFile::Initialize(Ctx, TM);
+ TextSection = getCOFFSection("\t.text", true, SectionKind::getText());
+ DataSection = getCOFFSection("\t.data", true, SectionKind::getDataRel());
+ StaticCtorSection =
+ getCOFFSection(".ctors", false, SectionKind::getDataRel());
+ StaticDtorSection =
+ getCOFFSection(".dtors", false, SectionKind::getDataRel());
+
+ // FIXME: We're emitting LSDA info into a readonly section on COFF, even
+ // though it contains relocatable pointers. In PIC mode, this is probably a
+ // big runtime hit for C++ apps. Either the contents of the LSDA need to be
+ // adjusted or this should be a data section.
+ LSDASection =
+ getCOFFSection(".gcc_except_table", false, SectionKind::getReadOnly());
+ EHFrameSection =
+ getCOFFSection(".eh_frame", false, SectionKind::getDataRel());
+
+ // Debug info.
+ // FIXME: Don't use 'directive' mode here.
+ DwarfAbbrevSection =
+ getCOFFSection("\t.section\t.debug_abbrev,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfInfoSection =
+ getCOFFSection("\t.section\t.debug_info,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfLineSection =
+ getCOFFSection("\t.section\t.debug_line,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfFrameSection =
+ getCOFFSection("\t.section\t.debug_frame,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ getCOFFSection("\t.section\t.debug_pubnames,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ getCOFFSection("\t.section\t.debug_pubtypes,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfStrSection =
+ getCOFFSection("\t.section\t.debug_str,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfLocSection =
+ getCOFFSection("\t.section\t.debug_loc,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfARangesSection =
+ getCOFFSection("\t.section\t.debug_aranges,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfRangesSection =
+ getCOFFSection("\t.section\t.debug_ranges,\"dr\"",
+ true, SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ getCOFFSection("\t.section\t.debug_macinfo,\"dr\"",
+ true, SectionKind::getMetadata());
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ return getCOFFSection(GV->getSection().c_str(), false, Kind);
+}
+
+static const char *getCOFFSectionPrefixForUniqueGlobal(SectionKind Kind) {
+ if (Kind.isText())
+ return ".text$linkonce";
+ if (Kind.isWriteable())
+ return ".data$linkonce";
+ return ".rdata$linkonce";
+}
+
+
+const MCSection *TargetLoweringObjectFileCOFF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+ assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+ // If this global is linkonce/weak and the target handles this by emitting it
+ // into a 'uniqued' section name, create and return the section now.
+ if (GV->isWeakForLinker()) {
+ const char *Prefix = getCOFFSectionPrefixForUniqueGlobal(Kind);
+ std::string Name = Mang->makeNameProper(GV->getNameStr());
+ return getCOFFSection((Prefix+Name).c_str(), false, Kind);
+ }
+
+ if (Kind.isText())
+ return getTextSection();
+
+ return getDataSection();
+}
+
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index c487cb8..fec59b5 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -33,7 +33,10 @@ namespace llvm {
FloatABI::ABIType FloatABIType;
bool NoImplicitFloat;
bool NoZerosInBSS;
- bool ExceptionHandling;
+ bool DwarfExceptionHandling;
+ bool SjLjExceptionHandling;
+ bool JITEmitDebugInfo;
+ bool JITEmitDebugInfoToDisk;
bool UnwindTablesMandatory;
Reloc::Model RelocationModel;
CodeModel::Model CMModel;
@@ -104,9 +107,32 @@ DontPlaceZerosInBSS("nozero-initialized-in-bss",
cl::location(NoZerosInBSS),
cl::init(false));
static cl::opt<bool, true>
-EnableExceptionHandling("enable-eh",
+EnableDwarfExceptionHandling("enable-eh",
cl::desc("Emit DWARF exception handling (default if target supports)"),
- cl::location(ExceptionHandling),
+ cl::location(DwarfExceptionHandling),
+ cl::init(false));
+static cl::opt<bool, true>
+EnableSjLjExceptionHandling("enable-sjlj-eh",
+ cl::desc("Emit SJLJ exception handling (default if target supports)"),
+ cl::location(SjLjExceptionHandling),
+ cl::init(false));
+// In debug builds, make this default to true.
+#ifdef NDEBUG
+#define EMIT_DEBUG false
+#else
+#define EMIT_DEBUG true
+#endif
+static cl::opt<bool, true>
+EmitJitDebugInfo("jit-emit-debug",
+ cl::desc("Emit debug information to debugger"),
+ cl::location(JITEmitDebugInfo),
+ cl::init(EMIT_DEBUG));
+#undef EMIT_DEBUG
+static cl::opt<bool, true>
+EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
+ cl::Hidden,
+ cl::desc("Emit debug info objfiles to disk"),
+ cl::location(JITEmitDebugInfoToDisk),
cl::init(false));
static cl::opt<bool, true>
EnableUnwindTables("unwind-tables",
@@ -176,8 +202,8 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim",
// TargetMachine Class
//
-TargetMachine::TargetMachine()
- : AsmInfo(0) {
+TargetMachine::TargetMachine(const Target &T)
+ : TheTarget(T), AsmInfo(0) {
// Typically it will be subtargets that will adjust FloatABIType from Default
// to Soft or Hard.
if (UseSoftFloat)
@@ -237,4 +263,3 @@ namespace llvm {
return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
}
}
-
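
(The cl::opt<bool, true> pattern used throughout this file stores the parsed value in an ordinary global via cl::location, so other code can test the flag without including CommandLine.h. A minimal sketch; it assumes LLVM's Support library and links against it:

    // cl_location_sketch.cpp
    #include "llvm/Support/CommandLine.h"
    #include <cstdio>

    bool JITEmitDebugInfo;  // External storage the option writes into.

    static llvm::cl::opt<bool, true>  // 'true' selects external storage.
    EmitJitDebugInfo("jit-emit-debug",
                     llvm::cl::desc("Emit debug information to debugger"),
                     llvm::cl::location(JITEmitDebugInfo),
                     llvm::cl::init(false));

    int main(int argc, char **argv) {
      llvm::cl::ParseCommandLineOptions(argc, argv);
      std::printf("jit-emit-debug = %d\n", JITEmitDebugInfo);
    }

Running with -jit-emit-debug flips the global before any of the code that consults it executes.)
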
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index a84fdaa..fac67e2 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -40,10 +40,10 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
TargetRegisterInfo::~TargetRegisterInfo() {}
/// getPhysicalRegisterRegClass - Returns the Register Class of a physical
-/// register of the given type. If type is MVT::Other, then just return any
+/// register of the given type. If type is EVT::Other, then just return any
/// register class the register belongs to.
const TargetRegisterClass *
-TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, MVT VT) const {
+TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const {
assert(isPhysicalRegister(reg) && "reg must be a physical register");
// Pick the most super register class of the right type that contains
@@ -62,14 +62,14 @@ TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, MVT VT) const {
/// getAllocatableSetForRC - Toggle the bits that represent allocatable
/// registers for the specific register class.
-static void getAllocatableSetForRC(MachineFunction &MF,
+static void getAllocatableSetForRC(const MachineFunction &MF,
const TargetRegisterClass *RC, BitVector &R){
for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
E = RC->allocation_order_end(MF); I != E; ++I)
R.set(*I);
}
-BitVector TargetRegisterInfo::getAllocatableSet(MachineFunction &MF,
+BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
const TargetRegisterClass *RC) const {
BitVector Allocatable(NumRegs);
if (RC) {
@@ -85,7 +85,7 @@ BitVector TargetRegisterInfo::getAllocatableSet(MachineFunction &MF,
/// getFrameIndexOffset - Returns the displacement from the frame register to
/// the stack frame of the specified index. This is the default implementation
-/// which is likely incorrect for the target.
+/// which is overridden for some targets.
int TargetRegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..034d5ab
--- /dev/null
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86AsmParser
+ X86AsmParser.cpp
+ )
+add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen)
diff --git a/lib/Target/X86/AsmParser/Makefile b/lib/Target/X86/AsmParser/Makefile
new file mode 100644
index 0000000..25fb0a2
--- /dev/null
+++ b/lib/Target/X86/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmParser
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..c357b4d
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,479 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLexer.h"
+#include "llvm/MC/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+using namespace llvm;
+
+namespace {
+struct X86Operand;
+
+class X86ATTAsmParser : public TargetAsmParser {
+ MCAsmParser &Parser;
+
+private:
+ MCAsmParser &getParser() const { return Parser; }
+
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+ bool ParseRegister(X86Operand &Op);
+
+ bool ParseOperand(X86Operand &Op);
+
+ bool ParseMemOperand(X86Operand &Op);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+ bool MatchInstruction(SmallVectorImpl<X86Operand> &Operands,
+ MCInst &Inst);
+
+ /// MatchRegisterName - Match the given string to a register name, or 0 if
+ /// there is no match.
+ unsigned MatchRegisterName(const StringRef &Name);
+
+ /// }
+
+public:
+ X86ATTAsmParser(const Target &T, MCAsmParser &_Parser)
+ : TargetAsmParser(T), Parser(_Parser) {}
+
+ virtual bool ParseInstruction(const StringRef &Name, MCInst &Inst);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+} // end anonymous namespace
+
+
+namespace {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand {
+ enum {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNo;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ struct {
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ } Mem;
+ };
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+
+ bool isToken() const { return Kind == Token; }
+
+ bool isImm() const { return Kind == Immediate; }
+
+ bool isImmSExt8() const {
+ // Accept immediates which fit in 8 bits when sign extended, and
+ // non-absolute immediates.
+ if (!isImm())
+ return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+ int64_t Value = CE->getValue();
+ return Value == (int64_t) (int8_t) Value;
+ }
+
+ return true;
+ }
+
+ bool isMem() const { return Kind == Memory; }
+
+ bool isReg() const { return Kind == Register; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateExpr(getImm()));
+ }
+
+ void addImmSExt8Operands(MCInst &Inst, unsigned N) const {
+ // FIXME: Support user customization of the render method.
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateExpr(getImm()));
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 4 || N == 5) && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+ Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+
+ // FIXME: What a hack.
+ if (N == 5)
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+
+ static X86Operand CreateToken(StringRef Str) {
+ X86Operand Res;
+ Res.Kind = Token;
+ Res.Tok.Data = Str.data();
+ Res.Tok.Length = Str.size();
+ return Res;
+ }
+
+ static X86Operand CreateReg(unsigned RegNo) {
+ X86Operand Res;
+ Res.Kind = Register;
+ Res.Reg.RegNo = RegNo;
+ return Res;
+ }
+
+ static X86Operand CreateImm(const MCExpr *Val) {
+ X86Operand Res;
+ Res.Kind = Immediate;
+ Res.Imm.Val = Val;
+ return Res;
+ }
+
+ static X86Operand CreateMem(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale) {
+ // We should never just have a displacement, that would be an immediate.
+ assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ X86Operand Res;
+ Res.Kind = Memory;
+ Res.Mem.SegReg = SegReg;
+ Res.Mem.Disp = Disp;
+ Res.Mem.BaseReg = BaseReg;
+ Res.Mem.IndexReg = IndexReg;
+ Res.Mem.Scale = Scale;
+ return Res;
+ }
+};
+
+} // end anonymous namespace.
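
(The operand type above is a classic discriminated union: a Kind tag plus a union, asserting accessors, and static factory functions. Reduced to two cases as a standalone sketch:

    // operand_union_sketch.cpp
    #include <cassert>
    #include <cstdio>

    struct Operand {
      enum { Register, Immediate } Kind;
      union {
        unsigned RegNo;   // Valid when Kind == Register.
        long long ImmVal; // Valid when Kind == Immediate.
      };

      unsigned getReg() const {
        assert(Kind == Register && "Invalid access!"); // Guard the union.
        return RegNo;
      }

      static Operand createReg(unsigned R) {
        Operand Op; Op.Kind = Register; Op.RegNo = R; return Op;
      }
      static Operand createImm(long long V) {
        Operand Op; Op.Kind = Immediate; Op.ImmVal = V; return Op;
      }
    };

    int main() {
      Operand Op = Operand::createReg(3);
      std::printf("reg %u\n", Op.getReg());
    }

)
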
+
+
+bool X86ATTAsmParser::ParseRegister(X86Operand &Op) {
+ const AsmToken &TokPercent = getLexer().getTok();
+ (void)TokPercent; // Avoid warning when assertions are disabled.
+ assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
+ getLexer().Lex(); // Eat percent token.
+
+ const AsmToken &Tok = getLexer().getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return Error(Tok.getLoc(), "invalid register name");
+
+ // FIXME: Validate register for the current architecture; we have to do
+ // validation later, so maybe there is no need for this here.
+ unsigned RegNo;
+
+ RegNo = MatchRegisterName(Tok.getString());
+ if (RegNo == 0)
+ return Error(Tok.getLoc(), "invalid register name");
+
+ Op = X86Operand::CreateReg(RegNo);
+ getLexer().Lex(); // Eat identifier token.
+
+ return false;
+}
+
+bool X86ATTAsmParser::ParseOperand(X86Operand &Op) {
+ switch (getLexer().getKind()) {
+ default:
+ return ParseMemOperand(Op);
+ case AsmToken::Percent:
+ // FIXME: if a segment register, this could either be just the seg reg, or
+ // the start of a memory operand.
+ return ParseRegister(Op);
+ case AsmToken::Dollar: {
+ // $42 -> immediate.
+ getLexer().Lex();
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val))
+ return true;
+ Op = X86Operand::CreateImm(Val);
+ return false;
+ }
+ }
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale)
+bool X86ATTAsmParser::ParseMemOperand(X86Operand &Op) {
+ // FIXME: If SegReg ':' (e.g. %gs:), eat and remember.
+ unsigned SegReg = 0;
+
+ // We have to disambiguate a parenthesized expression "(4+5)" from the start
+ // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
+ // only way to do this without lookahead is to eat the ( and see what is after
+ // it.
+ const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ if (getLexer().isNot(AsmToken::LParen)) {
+ if (getParser().ParseExpression(Disp)) return true;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg)
+ Op = X86Operand::CreateMem(SegReg, Disp, 0, 0, 1);
+ else
+ Op = X86Operand::CreateImm(Disp);
+ return false;
+ }
+
+ // Eat the '('.
+ getLexer().Lex();
+ } else {
+ // Okay, we have a '('. We don't know if this is an expression or not, so
+ // we have to eat the '(' to see beyond it.
+ getLexer().Lex(); // Eat the '('.
+
+ if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+ // Nothing to do here, fall into the code below with the '(' part of the
+ // memory operand consumed.
+ } else {
+ // It must be a parenthesized expression; parse it now.
+ if (getParser().ParseParenExpression(Disp))
+ return true;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg)
+ Op = X86Operand::CreateMem(SegReg, Disp, 0, 0, 1);
+ else
+ Op = X86Operand::CreateImm(Disp);
+ return false;
+ }
+
+ // Eat the '('.
+ getLexer().Lex();
+ }
+ }
+
+ // If we reached here, then we just ate the ( of the memory operand. Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+
+ if (getLexer().is(AsmToken::Percent)) {
+ if (ParseRegister(Op))
+ return true;
+ BaseReg = Op.getReg();
+ }
+
+ if (getLexer().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat the comma.
+
+ // Following the comma we should have either an index register or a scale
+ // value. We don't support the latter form, but we want to parse it
+ // correctly.
+ //
+ // Note that even though it would be completely consistent to support syntax
+ // like "1(%eax,,1)", the assembler doesn't.
+ if (getLexer().is(AsmToken::Percent)) {
+ if (ParseRegister(Op))
+ return true;
+ IndexReg = Op.getReg();
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+ if (getLexer().isNot(AsmToken::Comma))
+ return true;
+ getLexer().Lex(); // Eat the comma.
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ SMLoc Loc = getLexer().getTok().getLoc();
+
+ int64_t ScaleVal;
+ if (getParser().ParseAbsoluteExpression(ScaleVal))
+ return true;
+
+ // Validate the scale amount.
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8)
+ return Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+ Scale = (unsigned)ScaleVal;
+ }
+ }
+ } else if (getLexer().isNot(AsmToken::RParen)) {
+ // Otherwise we have the unsupported form of a scale amount without an
+ // index.
+ SMLoc Loc = getLexer().getTok().getLoc();
+
+ int64_t Value;
+ if (getParser().ParseAbsoluteExpression(Value))
+ return true;
+
+ return Error(Loc, "cannot have scale factor without index register");
+ }
+ }
+
+ // Ok, we've eaten the memory operand; verify we have a ')' and eat it too.
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(getLexer().getTok().getLoc(),
+ "unexpected token in memory operand");
+ getLexer().Lex(); // Eat the ')'.
+
+ Op = X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale);
+ return false;
+}
+
+bool X86ATTAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
+ SmallVector<X86Operand, 8> Operands;
+
+ Operands.push_back(X86Operand::CreateToken(Name));
+
+ SMLoc Loc = getLexer().getTok().getLoc();
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star)) {
+ getLexer().Lex(); // Eat the star.
+ Operands.push_back(X86Operand::CreateToken("*"));
+ }
+
+ // Read the first operand.
+ Operands.push_back(X86Operand());
+ if (ParseOperand(Operands.back()))
+ return true;
+
+ while (getLexer().is(AsmToken::Comma)) {
+ getLexer().Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ Operands.push_back(X86Operand());
+ if (ParseOperand(Operands.back()))
+ return true;
+ }
+ }
+
+ if (!MatchInstruction(Operands, Inst))
+ return false;
+
+ // FIXME: We should give nicer diagnostics about the exact failure.
+
+ Error(Loc, "unrecognized instruction");
+ return true;
+}
+
+bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(2, DirectiveID.getLoc());
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
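+/// For example, ".word 1, 2" emits two 2-byte values (Size is 2 when this
+/// is dispatched from ParseDirective above).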
+bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().ParseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ getLexer().Lex();
+ }
+ }
+
+ getLexer().Lex();
+ return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+ RegisterAsmParser<X86ATTAsmParser> X(TheX86_32Target);
+ RegisterAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
+}
+
+#include "X86GenAsmMatcher.inc"
diff --git a/lib/Target/X86/AsmPrinter/CMakeLists.txt b/lib/Target/X86/AsmPrinter/CMakeLists.txt
index a28c826..b70a587 100644
--- a/lib/Target/X86/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/X86/AsmPrinter/CMakeLists.txt
@@ -1,9 +1,9 @@
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMX86AsmPrinter
- X86ATTAsmPrinter.cpp
X86ATTInstPrinter.cpp
X86AsmPrinter.cpp
- X86IntelAsmPrinter.cpp
+ X86IntelInstPrinter.cpp
+ X86MCInstLower.cpp
)
-add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen) \ No newline at end of file
+add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen)
diff --git a/lib/Target/X86/AsmPrinter/Makefile b/lib/Target/X86/AsmPrinter/Makefile
index ba89ac6..2368761 100644
--- a/lib/Target/X86/AsmPrinter/Makefile
+++ b/lib/Target/X86/AsmPrinter/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
index fa0ee75..bc70ffe 100644
--- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
@@ -13,10 +13,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
#include "llvm/MC/MCInst.h"
-#include "X86ATTAsmPrinter.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
using namespace llvm;
// Include the auto-generated portion of the assembly writer.
@@ -25,9 +28,11 @@ using namespace llvm;
#include "X86GenAsmWriter.inc"
#undef MachineInstr
-void X86ATTAsmPrinter::printSSECC(const MCInst *MI, unsigned Op) {
+void X86ATTInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op) {
switch (MI->getOperand(Op).getImm()) {
- default: assert(0 && "Invalid ssecc argument!");
+ default: llvm_unreachable("Invalid ssecc argument!");
case 0: O << "eq"; break;
case 1: O << "lt"; break;
case 2: O << "le"; break;
@@ -39,61 +44,36 @@ void X86ATTAsmPrinter::printSSECC(const MCInst *MI, unsigned Op) {
}
}
-
-void X86ATTAsmPrinter::printPICLabel(const MCInst *MI, unsigned Op) {
- assert(0 &&
- "This is only used for MOVPC32r, should lower before asm printing!");
-}
-
-
/// print_pcrel_imm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value. These print slightly differently, for
/// example, a $ is not emitted.
-void X86ATTAsmPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
+void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
const MCOperand &Op = MI->getOperand(OpNo);
-
if (Op.isImm())
O << Op.getImm();
- else if (Op.isMBBLabel())
- // FIXME: Keep in sync with printBasicBlockLabel. printBasicBlockLabel
- // should eventually call into this code, not the other way around.
- O << TAI->getPrivateGlobalPrefix() << "BB" << Op.getMBBLabelFunction()
- << '_' << Op.getMBBLabelBlock();
- else
- assert(0 && "Unknown pcrel immediate operand");
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ Op.getExpr()->print(O, &MAI);
+ }
}
-
-void X86ATTAsmPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- const char *Modifier) {
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const char *Modifier) {
assert(Modifier == 0 && "Modifiers should not be used");
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- O << '%';
- unsigned Reg = Op.getReg();
-#if 0
- if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
- MVT VT = (strcmp(Modifier+6,"64") == 0) ?
- MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
- ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
- Reg = getX86SubSuperRegister(Reg, VT);
- }
-#endif
- O << TRI->getAsmName(Reg);
- return;
+ O << '%' << getRegisterName(Op.getReg());
} else if (Op.isImm()) {
- //if (!Modifier || (strcmp(Modifier, "debug") && strcmp(Modifier, "mem")))
+ O << '$' << Op.getImm();
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
O << '$';
- O << Op.getImm();
- return;
+ Op.getExpr()->print(O, &MAI);
}
-
- O << "<<UNKNOWN OPERAND KIND>>";
}
-void X86ATTAsmPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
-
+void X86ATTInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
const MCOperand &BaseReg = MI->getOperand(Op);
const MCOperand &IndexReg = MI->getOperand(Op+2);
const MCOperand &DispSpec = MI->getOperand(Op+3);
@@ -103,19 +83,11 @@ void X86ATTAsmPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
O << DispVal;
} else {
- abort();
- //assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
- // DispSpec.isJTI() || DispSpec.isSymbol());
- //printOperand(MI, Op+3, "mem");
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
}
if (IndexReg.getReg() || BaseReg.getReg()) {
- // There are cases where we can end up with ESP/RSP in the indexreg slot.
- // If this happens, swap the base/index register to support assemblers that
- // don't work when the index is *SP.
- // FIXME: REMOVE THIS.
- assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP);
-
O << '(';
if (BaseReg.getReg())
printOperand(MI, Op);
@@ -131,9 +103,9 @@ void X86ATTAsmPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
}
}
-void X86ATTAsmPrinter::printMemReference(const MCInst *MI, unsigned Op) {
- const MCOperand &Segment = MI->getOperand(Op+4);
- if (Segment.getReg()) {
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op) {
+ // If this has a segment register, print it.
+ if (MI->getOperand(Op+4).getReg()) {
printOperand(MI, Op+4);
O << ':';
}
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..5f28fa4
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,86 @@
+//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+ class MCOperand;
+
+class X86ATTInstPrinter : public MCInstPrinter {
+public:
+ X86ATTInstPrinter(raw_ostream &O, const MCAsmInfo &MAI)
+ : MCInstPrinter(O, MAI) {}
+
+
+ virtual void printInst(const MCInst *MI);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI);
+ static const char *getRegisterName(unsigned RegNo);
+
+
+ void printOperand(const MCInst *MI, unsigned OpNo,
+ const char *Modifier = 0);
+ void printMemReference(const MCInst *MI, unsigned Op);
+ void printLeaMemReference(const MCInst *MI, unsigned Op);
+ void printSSECC(const MCInst *MI, unsigned Op);
+ void print_pcrel_imm(const MCInst *MI, unsigned OpNo);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MCInst *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MCInst *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MCInst *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+};
+
+}
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
index e5d80a4..2a0290d 100644
--- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,42 +7,937 @@
//
//===----------------------------------------------------------------------===//
//
-// This file the shared super class printer that converts from our internal
-// representation of machine-dependent LLVM code to Intel and AT&T format
-// assembly language.
-// This printer is the output mechanism used by `llc'.
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
//
//===----------------------------------------------------------------------===//
-#include "X86ATTAsmPrinter.h"
-#include "X86IntelAsmPrinter.h"
-#include "X86Subtarget.h"
+#define DEBUG_TYPE "asm-printer"
+#include "X86AsmPrinter.h"
+#include "X86ATTInstPrinter.h"
+#include "X86IntelInstPrinter.h"
+#include "X86MCInstLower.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
using namespace llvm;
-/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
-/// for a MachineFunction to the given output stream, using the given target
-/// machine description.
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+void X86AsmPrinter::printMCInst(const MCInst *MI) {
+ if (MAI->getAssemblerDialect() == 0)
+ X86ATTInstPrinter(O, *MAI).printInstruction(MI);
+ else
+ X86IntelInstPrinter(O, *MAI).printInstruction(MI);
+}
+
+void X86AsmPrinter::PrintPICBaseSymbol() const {
+ // FIXME: Gross const cast hack.
+ X86AsmPrinter *AP = const_cast<X86AsmPrinter*>(this);
+ X86MCInstLower(OutContext, 0, *AP).GetPICBaseSymbol()->print(O, MAI);
+}
+
+void X86AsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ unsigned FnAlign = MF.getAlignment();
+ const Function *F = MF.getFunction();
+
+ if (Subtarget->isTargetCygMing()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+ COFFMMI.DecorateCygMingName(CurrentFnName, F, *TM.getTargetData());
+ }
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+ EmitAlignment(FnAlign, F);
+
+ switch (F->getLinkage()) {
+ default: llvm_unreachable("Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ break;
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkerPrivateLinkage:
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ O << MAI->getWeakDefDirective() << CurrentFnName << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << CurrentFnName << "\n"
+ "\t.linkonce discard\n";
+ } else {
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ }
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << CurrentFnName << ",@function\n";
+ else if (Subtarget->isTargetCygMing()) {
+ O << "\t.def\t " << CurrentFnName
+ << ";\t.scl\t" <<
+ (F->hasInternalLinkage() ? COFF::C_STAT : COFF::C_EXT)
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ O << CurrentFnName << ':';
+ if (VerboseAsm) {
+ O.PadToColumn(MAI->getCommentColumn());
+ O << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, F, /*PrintType=*/false, F->getParent());
+ }
+ O << '\n';
+
+ // Add a workaround for linkonce linkage on Cygwin/MinGW
+ if (Subtarget->isTargetCygMing() &&
+ (F->hasLinkOnceLinkage() || F->hasWeakLinkage()))
+ O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ this->MF = &MF;
+ CallingConv::ID CC = F->getCallingConv();
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ if (Subtarget->isTargetCOFF()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+
+ // Populate the function information map. We don't want to populate
+ // non-stdcall or non-fastcall functions' information right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ COFFMMI.AddFunctionInfo(F, *MF.getInfo<X86MachineFunctionInfo>());
+ }
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print the 'header' of the function
+ emitFunctionHeader(MF);
+
+ // Emit pre-function debug and/or EH information.
+ if (MAI->doesSupportDebugInformation() || MAI->doesSupportExceptionHandling())
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ bool hasAnyRealCode = false;
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ EmitBasicBlockStart(I);
+ for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+ II != IE; ++II) {
+ // Print the assembly for the instruction.
+ if (!II->isLabel())
+ hasAnyRealCode = true;
+ printMachineInstruction(II);
+ }
+ }
+
+ if (Subtarget->isTargetDarwin() && !hasAnyRealCode) {
+ // If the function is empty, then we need to emit *something*. Otherwise,
+ // the function's label might be associated with something that it wasn't
+ // meant to be associated with. We emit a noop in this situation.
+ // We are assuming inline asms are code.
+ O << "\tnop\n";
+ }
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ // Emit post-function debug information.
+ if (MAI->doesSupportDebugInformation() || MAI->doesSupportExceptionHandling())
+ DW->EndFunction(&MF);
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ printOffset(MO.getOffset());
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ const char *Suffix = "";
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+ Suffix = "$stub";
+ else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ Suffix = "$non_lazy_ptr";
+
+ std::string Name = Mang->getMangledName(GV, Suffix, Suffix[0] != '\0');
+ if (Subtarget->isTargetCygMing()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+ COFFMMI.DecorateCygMingName(Name, GV, *TM.getTargetData());
+ }
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ Name = "__imp_" + Name;
+
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ SmallString<128> NameStr;
+ Mang->getNameWithPrefix(NameStr, GV, true);
+ NameStr += "$non_lazy_ptr";
+ MCSymbol *Sym = OutContext.GetOrCreateSymbol(NameStr.str());
+
+ const MCSymbol *&StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (StubSym == 0) {
+ NameStr.clear();
+ Mang->getNameWithPrefix(NameStr, GV, false);
+ StubSym = OutContext.GetOrCreateSymbol(NameStr.str());
+ }
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+ SmallString<128> NameStr;
+ Mang->getNameWithPrefix(NameStr, GV, true);
+ NameStr += "$non_lazy_ptr";
+ MCSymbol *Sym = OutContext.GetOrCreateSymbol(NameStr.str());
+ const MCSymbol *&StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+ if (StubSym == 0) {
+ NameStr.clear();
+ Mang->getNameWithPrefix(NameStr, GV, false);
+ StubSym = OutContext.GetOrCreateSymbol(NameStr.str());
+ }
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ SmallString<128> NameStr;
+ Mang->getNameWithPrefix(NameStr, GV, true);
+ NameStr += "$stub";
+ MCSymbol *Sym = OutContext.GetOrCreateSymbol(NameStr.str());
+ const MCSymbol *&StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym == 0) {
+ NameStr.clear();
+ Mang->getNameWithPrefix(NameStr, GV, false);
+ StubSym = OutContext.GetOrCreateSymbol(NameStr.str());
+ }
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (Name[0] == '$')
+ O << '(' << Name << ')';
+ else
+ O << Name;
+
+ printOffset(MO.getOffset());
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ std::string Name = Mang->makeNameProper(MO.getSymbolName());
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ Name += "$stub";
+ MCSymbol *Sym = OutContext.GetOrCreateSymbol(Name);
+ const MCSymbol *&StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym == 0) {
+ Name.erase(Name.end()-5, Name.end());
+ StubSym = OutContext.GetOrCreateSymbol(Name);
+ }
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (Name[0] == '$')
+ O << '(' << Name << ')';
+ else
+ O << Name;
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-";
+ PrintPICBaseSymbol();
+ O << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-';
+ PrintPICBaseSymbol();
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
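+/// For example, a call target prints simply as "foo" where printOperand
+/// below would print "$foo".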
+void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ printSymbolOperand(MO);
+ return;
+ }
+}
+
+
+void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ EVT VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ O << '$' << MO.getImm();
+ return;
+
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol: {
+ O << '$';
+ printSymbolOperand(MO);
+ break;
+ }
+ }
+}
+
+void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
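+// Prints the AT&T form "disp(base,index,scale)", e.g. (illustratively)
+// "16(%eax,%ebx,4)". The scale is omitted when it is 1, and the
+// parenthesized part is omitted entirely when there is no base or index.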
+void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // HasParenPart - True if we will print out the () part of the mem ref.
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+ if (DispSpec.isImm()) {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || !HasParenPart)
+ O << DispVal;
+ } else {
+ assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
+ DispSpec.isJTI() || DispSpec.isSymbol());
+ printSymbolOperand(MI->getOperand(Op+3));
+ }
+
+ if (HasParenPart) {
+ assert(IndexReg.getReg() != X86::ESP &&
+ "X86 doesn't allow scaling by ESP");
+
+ O << '(';
+ if (HasBaseReg)
+ printOperand(MI, Op, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+2, Modifier);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, Modifier);
+}
+
+void X86AsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!MAI->getSetDirective())
+ return;
+
+ // We don't need .set machinery if we have GOT-style relocations
+ if (Subtarget->isPICStyleGOT())
+ return;
+
+ O << MAI->getSetDirective() << ' ' << MAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+
+ if (Subtarget->isPICStyleRIPRel())
+ O << '-' << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << uid << '\n';
+ else {
+ O << '-';
+ PrintPICBaseSymbol();
+ O << '\n';
+ }
+}
+
+
+void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ PrintPICBaseSymbol();
+ O << '\n';
+ PrintPICBaseSymbol();
+ O << ':';
+}
+
+void X86AsmPrinter::printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const {
+ const char *JTEntryDirective = MJTI->getEntrySize() == 4 ?
+ MAI->getData32bitsDirective() : MAI->getData64bitsDirective();
+
+ O << JTEntryDirective << ' ';
+
+ if (Subtarget->isPICStyleRIPRel() || Subtarget->isPICStyleStubPIC()) {
+ O << MAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << '_' << uid << "_set_" << MBB->getNumber();
+ } else if (Subtarget->isPICStyleGOT()) {
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+ O << "@GOTOFF";
+ } else
+ GetMBBSymbol(MBB->getNumber())->print(O, MAI);
+}
+
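+// Maps an inline-asm register modifier to a sub/super register; e.g. with
+// %eax as the operand, 'b' prints %al, 'w' prints %ax, 'k' prints %eax, and
+// 'q' prints %rax (the standard x86 sub-register mapping).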
+bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ case 'q': // Print DImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ break;
+ }
+
+ O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
-FunctionPass *llvm::createX86CodePrinterPass(raw_ostream &o,
- X86TargetMachine &tm,
- bool verbose) {
- const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ if (MO.isImm()) {
+ O << MO.getImm();
+ return false;
+ }
+ if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
+ printSymbolOperand(MO);
+ return false;
+ }
+ if (MO.isReg()) {
+ O << '(';
+ printOperand(MI, OpNo);
+ O << ')';
+ return false;
+ }
+ return true;
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ if (MO.isImm())
+ O << MO.getImm();
+ else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+ printSymbolOperand(MO);
+ else
+ printOperand(MI, OpNo);
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(MI, OpNo);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0]);
+ printOperand(MI, OpNo);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ print_pcrel_imm(MI, OpNo);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled target
+ // independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
- if (Subtarget->isFlavorIntel())
- return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
- return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo(), verbose);
+ printOperand(MI, OpNo);
+ return false;
}
-namespace {
- static struct Register {
- Register() {
- X86TargetMachine::registerAsmPrinter(createX86CodePrinterPass);
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, "no-rip");
+ return false;
}
- } Registrator;
+ }
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction MI in
+/// AT&T syntax to the current output stream.
+///
+void X86AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ processDebugLoc(MI, true);
+
+ printInstructionThroughMCStreamer(MI);
+
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+
+ processDebugLoc(MI, false);
}
-extern "C" int X86AsmPrinterForceLink;
-int X86AsmPrinterForceLink = 0;
+void X86AsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
+ if (!GVar->hasInitializer())
+ return; // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar)) {
+ if (Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() == Reloc::Static) {
+ if (GVar->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (GVar->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ return;
+ }
+
+ const TargetData *TD = TM.getTargetData();
+
+ std::string name = Mang->getMangledName(GVar);
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << name << ",@object\n";
+
+
+ SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GVar, TM);
+ const MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GVar, GVKind, Mang, TM);
+ OutStreamer.SwitchSection(TheSection);
+
+ // FIXME: get this stuff from section kind flags.
+ if (C->isNullValue() && !GVar->hasSection() &&
+ // Don't put things that should go in the cstring section into "comm".
+ !TheSection->getKind().isMergeableCString()) {
+ if (GVar->hasExternalLinkage()) {
+ if (const char *Directive = MAI->getZeroFillDirective()) {
+ O << "\t.globl " << name << '\n';
+ O << Directive << "__DATA, __common, " << name << ", "
+ << Size << ", " << Align << '\n';
+ return;
+ }
+ }
+
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (MAI->getLCOMMDirective() != NULL) {
+ if (GVar->hasLocalLinkage()) {
+ O << MAI->getLCOMMDirective() << name << ',' << Size;
+ if (Subtarget->isTargetDarwin())
+ O << ',' << Align;
+ } else if (Subtarget->isTargetDarwin() && !GVar->hasCommonLinkage()) {
+ O << "\t.globl " << name << '\n'
+ << MAI->getWeakDefDirective() << name << '\n';
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O.PadToColumn(MAI->getCommentColumn());
+ O << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
+ }
+ O << '\n';
+ EmitGlobalConstant(C);
+ return;
+ } else {
+ O << MAI->getCOMMDirective() << name << ',' << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ } else {
+ if (!Subtarget->isTargetCygMing()) {
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << '\n';
+ }
+ O << MAI->getCOMMDirective() << name << ',' << Size;
+ if (MAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (MAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ if (VerboseAsm) {
+ O.PadToColumn(MAI->getCommentColumn());
+ O << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
+ }
+ O << '\n';
+ return;
+ }
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl " << name << '\n'
+ << MAI->getWeakDefDirective() << name << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << name << "\n"
+ "\t.linkonce same_size\n";
+ } else {
+ O << "\t.weak\t" << name << '\n';
+ }
+ break;
+ case GlobalValue::DLLExportLinkage:
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ llvm_unreachable("Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm){
+ O.PadToColumn(MAI->getCommentColumn());
+ O << MAI->getCommentString() << ' ';
+ WriteAsOperand(O, GVar, /*PrintType=*/false, GVar->getParent());
+ }
+ O << '\n';
+
+ EmitGlobalConstant(C);
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << name << ", " << Size << '\n';
+}
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetDarwin()) {
+ // All darwin targets use mach-o.
+ TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ Stubs = MMIMacho.GetFnStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ TLOFMacho.getMachOSection("__IMPORT", "__jump_table",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 5, SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ Stubs[i].first->print(O, MAI);
+ O << ":\n" << "\t.indirect_symbol ";
+ // Get the MCSymbol without the $stub suffix.
+ Stubs[i].second->print(O, MAI);
+ O << "\n\thlt ; hlt ; hlt ; hlt ; hlt\n";
+ }
+ O << '\n';
+
+ Stubs.clear();
+ }
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ TLOFMacho.getMachOSection("__IMPORT", "__pointers",
+ MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ Stubs[i].first->print(O, MAI);
+ O << ":\n\t.indirect_symbol ";
+ Stubs[i].second->print(O, MAI);
+ O << "\n\t.long\t0\n";
+ }
+ Stubs.clear();
+ }
+
+ Stubs = MMIMacho.GetHiddenGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(2);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ Stubs[i].first->print(O, MAI);
+ O << ":\n" << MAI->getData32bitsDirective();
+ Stubs[i].second->print(O, MAI);
+ O << '\n';
+ }
+ Stubs.clear();
+ }
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+ }
+
+ if (Subtarget->isTargetCOFF()) {
+ // Necessary for dllexport support
+ std::vector<std::string> DLLExportedFns, DLLExportedGlobals;
+
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+ TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+ for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedFns.push_back(Mang->getMangledName(I));
+
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedGlobals.push_back(Mang->getMangledName(I));
+
+ if (Subtarget->isTargetCygMing()) {
+ // Emit type information for external functions
+ for (X86COFFMachineModuleInfo::stub_iterator I = COFFMMI.stub_begin(),
+ E = COFFMMI.stub_end(); I != E; ++I) {
+ O << "\t.def\t " << I->getKeyData()
+ << ";\t.scl\t" << COFF::C_EXT
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+ }
+
+ // Output linker support code for dllexported globals on Windows.
+ if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+ OutStreamer.SwitchSection(TLOFCOFF.getCOFFSection(".section .drectve",
+ true,
+ SectionKind::getMetadata()));
+
+ for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i)
+ O << "\t.ascii \" -export:" << DLLExportedGlobals[i] << ",data\"\n";
+
+ for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i)
+ O << "\t.ascii \" -export:" << DLLExportedFns[i] << "\"\n";
+ }
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ raw_ostream &O) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(O, MAI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(O, MAI);
+ return 0;
+}
// Force static initialization.
-extern "C" void LLVMInitializeX86AsmPrinter() { }
+extern "C" void LLVMInitializeX86AsmPrinter() {
+ RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+ RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+
+ TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter);
+}
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.h b/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
new file mode 100644
index 0000000..0351829
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
@@ -0,0 +1,150 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineJumpTableInfo;
+class MCContext;
+class MCInst;
+class MCStreamer;
+class MCSymbol;
+
+class VISIBILITY_HIDDEN X86AsmPrinter : public AsmPrinter {
+ const X86Subtarget *Subtarget;
+ public:
+ explicit X86AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+ : AsmPrinter(O, TM, T, V) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 AT&T-Style Assembly Printer";
+ }
+
+ const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ AsmPrinter::getAnalysisUsage(AU);
+ }
+
+
+ virtual void EmitEndOfAsmFile(Module &M);
+
+ void printInstructionThroughMCStreamer(const MachineInstr *MI);
+
+
+ void printMCInst(const MCInst *MI);
+
+ void printSymbolOperand(const MachineOperand &MO);
+
+
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0);
+ void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo);
+
+ void printopaquemem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const;
+ void printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ AsmPrinter::printPICJumpTableSetLabel(uid, uid2, MBB);
+ }
+ void printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const;
+
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ void PrintGlobalVariable(const GlobalVariable* GVar);
+
+ void PrintPICBaseSymbol() const;
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..fde5902
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,131 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelInstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define NO_ASM_WRITER_BOILERPLATE
+#include "X86GenAsmWriter1.inc"
+#undef MachineInstr
+
+void X86IntelInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op) {
+ switch (MI->getOperand(Op).getImm()) {
+ default: llvm_unreachable("Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << Op.getImm();
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+static void PrintRegName(raw_ostream &O, StringRef RegName) {
+ for (unsigned i = 0, e = RegName.size(); i != e; ++i)
+ O << (char)toupper(RegName[i]);
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ const char *Modifier) {
+ assert(Modifier == 0 && "Modifiers should not be used");
+
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ PrintRegName(O, getRegisterName(Op.getReg()));
+ } else if (Op.isImm()) {
+ O << Op.getImm();
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
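+// Prints the Intel form "[base + scale*index + disp]", e.g. (illustratively)
+// "[EAX + 4*EBX + 16]". Zero components are omitted, and a negative
+// displacement is folded into a " - " term.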
+void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
+ const MCOperand &BaseReg = MI->getOperand(Op);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+2);
+ const MCOperand &DispSpec = MI->getOperand(Op+3);
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+2);
+ NeedPlus = true;
+ }
+
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op) {
+ // If this has a segment register, print it.
+ if (MI->getOperand(Op+4).getReg()) {
+ printOperand(MI, Op+4);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op);
+}
diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..1976177
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,99 @@
+//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+ class MCOperand;
+
+class X86IntelInstPrinter : public MCInstPrinter {
+public:
+ X86IntelInstPrinter(raw_ostream &O, const MCAsmInfo &MAI)
+ : MCInstPrinter(O, MAI) {}
+
+ virtual void printInst(const MCInst *MI);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI);
+ static const char *getRegisterName(unsigned RegNo);
+
+
+ void printOperand(const MCInst *MI, unsigned OpNo,
+ const char *Modifier = 0);
+ void printMemReference(const MCInst *MI, unsigned Op);
+ void printLeaMemReference(const MCInst *MI, unsigned Op);
+ void printSSECC(const MCInst *MI, unsigned Op);
+ void print_pcrel_imm(const MCInst *MI, unsigned OpNo);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo) {
+ O << "OPAQUE PTR ";
+ printMemReference(MI, OpNo);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo) {
+ O << "XWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MCInst *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MCInst *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MCInst *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+};
+
+}
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
new file mode 100644
index 0000000..5ccddf5
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
@@ -0,0 +1,485 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCInstLower.h"
+#include "X86AsmPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+
+const X86Subtarget &X86MCInstLower::getSubtarget() const {
+ return AsmPrinter.getSubtarget();
+}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+  assert(getSubtarget().isTargetDarwin() &&
+         "Can only get MachO info on Darwin");
+ return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+MCSymbol *X86MCInstLower::GetPICBaseSymbol() const {
+ SmallString<60> Name;
+ raw_svector_ostream(Name) << AsmPrinter.MAI->getPrivateGlobalPrefix()
+ << AsmPrinter.getFunctionNumber() << "$pb";
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+
+/// GetGlobalAddressSymbol - Lower an MO_GlobalAddress operand to an
+/// MCSymbol.
+MCSymbol *X86MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ const GlobalValue *GV = MO.getGlobal();
+
+ bool isImplicitlyPrivate = false;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ isImplicitlyPrivate = true;
+
+ SmallString<128> Name;
+ Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+
+ if (getSubtarget().isTargetCygMing()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ AsmPrinter.MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+ COFFMMI.DecorateCygMingName(Name, GV, *AsmPrinter.TM.getTargetData());
+ }
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ case X86II::MO_PIC_BASE_OFFSET: // Doesn't modify symbol name.
+ break;
+ case X86II::MO_DLLIMPORT: {
+ // Handle dllimport linkage.
+ const char *Prefix = "__imp_";
+ Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+ break;
+ }
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+
+ const MCSymbol *&StubSym = getMachOMMI().getGVStubEntry(Sym);
+ if (StubSym == 0) {
+ Name.clear();
+ Mang->getNameWithPrefix(Name, GV, false);
+ StubSym = Ctx.GetOrCreateSymbol(Name.str());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ const MCSymbol *&StubSym = getMachOMMI().getHiddenGVStubEntry(Sym);
+ if (StubSym == 0) {
+ Name.clear();
+ Mang->getNameWithPrefix(Name, GV, false);
+ StubSym = Ctx.GetOrCreateSymbol(Name.str());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_STUB: {
+ Name += "$stub";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ const MCSymbol *&StubSym = getMachOMMI().getFnStubEntry(Sym);
+ if (StubSym == 0) {
+ Name.clear();
+ Mang->getNameWithPrefix(Name, GV, false);
+ StubSym = Ctx.GetOrCreateSymbol(Name.str());
+ }
+ return Sym;
+ }
+ // FIXME: These probably should be a modifier on the symbol or something??
+ case X86II::MO_TLSGD: Name += "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: Name += "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: Name += "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: Name += "@TPOFF"; break;
+ case X86II::MO_NTPOFF: Name += "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: Name += "@GOTPCREL"; break;
+ case X86II::MO_GOT: Name += "@GOT"; break;
+ case X86II::MO_GOTOFF: Name += "@GOTOFF"; break;
+ case X86II::MO_PLT: Name += "@PLT"; break;
+ }
+
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *X86MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ SmallString<128> Name;
+ Name += AsmPrinter.MAI->getGlobalPrefix();
+ Name += MO.getSymbolName();
+
+ switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on external symbol operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS: // Doesn't modify symbol name.
+ case X86II::MO_PIC_BASE_OFFSET: // Doesn't modify symbol name.
+ break;
+ case X86II::MO_DLLIMPORT: {
+ // Handle dllimport linkage.
+ const char *Prefix = "__imp_";
+ Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+ break;
+ }
+ case X86II::MO_DARWIN_STUB: {
+ Name += "$stub";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ const MCSymbol *&StubSym = getMachOMMI().getFnStubEntry(Sym);
+
+ if (StubSym == 0) {
+ Name.erase(Name.end()-5, Name.end());
+ StubSym = Ctx.GetOrCreateSymbol(Name.str());
+ }
+ return Sym;
+ }
+ // FIXME: These probably should be a modifier on the symbol or something??
+ case X86II::MO_TLSGD: Name += "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: Name += "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: Name += "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: Name += "@TPOFF"; break;
+ case X86II::MO_NTPOFF: Name += "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: Name += "@GOTPCREL"; break;
+ case X86II::MO_GOT: Name += "@GOT"; break;
+ case X86II::MO_GOTOFF: Name += "@GOTOFF"; break;
+ case X86II::MO_PLT: Name += "@PLT"; break;
+ }
+
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *X86MCInstLower::GetJumpTableSymbol(const MachineOperand &MO) const {
+ SmallString<256> Name;
+ raw_svector_ostream(Name) << AsmPrinter.MAI->getPrivateGlobalPrefix() << "JTI"
+ << AsmPrinter.getFunctionNumber() << '_' << MO.getIndex();
+
+ switch (MO.getTargetFlags()) {
+ default:
+    llvm_unreachable("Unknown target flag on jump table operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ break;
+ // FIXME: These probably should be a modifier on the symbol or something??
+ case X86II::MO_TLSGD: Name += "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: Name += "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: Name += "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: Name += "@TPOFF"; break;
+ case X86II::MO_NTPOFF: Name += "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: Name += "@GOTPCREL"; break;
+ case X86II::MO_GOT: Name += "@GOT"; break;
+ case X86II::MO_GOTOFF: Name += "@GOTOFF"; break;
+ case X86II::MO_PLT: Name += "@PLT"; break;
+ }
+
+ // Create a symbol for the name.
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+
+MCSymbol *X86MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+ SmallString<256> Name;
+ raw_svector_ostream(Name) << AsmPrinter.MAI->getPrivateGlobalPrefix() << "CPI"
+ << AsmPrinter.getFunctionNumber() << '_' << MO.getIndex();
+
+ switch (MO.getTargetFlags()) {
+ default:
+    llvm_unreachable("Unknown target flag on constant pool operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ break;
+ // FIXME: These probably should be a modifier on the symbol or something??
+ case X86II::MO_TLSGD: Name += "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: Name += "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: Name += "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: Name += "@TPOFF"; break;
+ case X86II::MO_NTPOFF: Name += "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: Name += "@GOTPCREL"; break;
+ case X86II::MO_GOT: Name += "@GOT"; break;
+ case X86II::MO_GOTOFF: Name += "@GOTOFF"; break;
+ case X86II::MO_PLT: Name += "@PLT"; break;
+ }
+
+ // Create a symbol for the name.
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ case X86II::MO_TLSGD:
+ case X86II::MO_GOTTPOFF:
+ case X86II::MO_INDNTPOFF:
+ case X86II::MO_TPOFF:
+ case X86II::MO_NTPOFF:
+ case X86II::MO_GOTPCREL:
+ case X86II::MO_GOT:
+ case X86II::MO_GOTOFF:
+ case X86II::MO_PLT:
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::CreateSub(Expr,
+ MCSymbolRefExpr::Create(GetPICBaseSymbol(), Ctx),
+ Ctx);
+ break;
+ }
+
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(MO.getOffset(), Ctx),
+ Ctx);
+ return MCOperand::CreateExpr(Expr);
+}
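+// Illustrative result of the lowering above (assuming function number 1 on
+// Darwin): an MO_PIC_BASE_OFFSET reference to _foo becomes the MCExpr
+// (_foo - "L1$pb"), and a nonzero operand offset is then folded in as
+// ((_foo - "L1$pb") + Offset).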
+
+
+
+static void lower_subreg32(MCInst *MI, unsigned OpNo) {
+  // Rewrite the given register operand to its 32-bit sub/super-register.
+ unsigned Reg = MI->getOperand(OpNo).getReg();
+ if (Reg != 0)
+ MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
+}
+
+static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
+ // Convert registers in the addr mode according to subreg64.
+ for (unsigned i = 0; i != 4; ++i) {
+ if (!MI->getOperand(OpNo+i).isReg()) continue;
+
+ unsigned Reg = MI->getOperand(OpNo+i).getReg();
+ if (Reg == 0) continue;
+
+ MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
+ }
+}
+
+
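+// A worked example of the subreg rewriting done below (illustrative):
+// MOVZX16rr8 "movzbw %cl, %dx" becomes MOVZX32rr8 "movzbl %cl, %edx". The
+// 32-bit form needs no 0x66 operand-size prefix, and writing %edx zeros the
+// upper bits, so the value left in %dx is unchanged.
+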
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ MCOp = MCOperand::CreateReg(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+ AsmPrinter.GetMBBSymbol(MO.getMBB()->getNumber()), Ctx));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+ break;
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+
+ // Handle a few special cases to eliminate operand modifiers.
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
+ lower_lea64_32mem(&OutMI, 1);
+ break;
+ case X86::MOV16r0:
+ OutMI.setOpcode(X86::MOV32r0);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX16rr8:
+ OutMI.setOpcode(X86::MOVZX32rr8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX16rm8:
+ OutMI.setOpcode(X86::MOVZX32rm8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVSX16rr8:
+ OutMI.setOpcode(X86::MOVSX32rr8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVSX16rm8:
+ OutMI.setOpcode(X86::MOVSX32rm8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rr32:
+ OutMI.setOpcode(X86::MOV32rr);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rm32:
+ OutMI.setOpcode(X86::MOV32rm);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOV64ri64i32:
+ OutMI.setOpcode(X86::MOV32ri);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rr8:
+ OutMI.setOpcode(X86::MOVZX32rr8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rm8:
+ OutMI.setOpcode(X86::MOVZX32rm8);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rr16:
+ OutMI.setOpcode(X86::MOVZX32rr16);
+ lower_subreg32(&OutMI, 0);
+ break;
+ case X86::MOVZX64rm16:
+ OutMI.setOpcode(X86::MOVZX32rm16);
+ lower_subreg32(&OutMI, 0);
+ break;
+ }
+}
+
+
+
+void X86AsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(OutContext, Mang, *this);
+ switch (MI->getOpcode()) {
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ case TargetInstrInfo::GC_LABEL:
+ printLabel(MI);
+ return;
+ case TargetInstrInfo::INLINEASM:
+ O << '\t';
+ printInlineAsm(MI);
+ return;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ printImplicitDef(MI);
+ return;
+ case TargetInstrInfo::KILL:
+ return;
+ case X86::MOVPC32r: {
+ MCInst TmpInst;
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MCInstLowering.GetPICBaseSymbol();
+ TmpInst.setOpcode(X86::CALLpcrel32);
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase,
+ OutContext)));
+ printMCInst(&TmpInst);
+ O << '\n';
+
+ // Emit the label.
+ OutStreamer.EmitLabel(PICBase);
+
+ // popl $reg
+ TmpInst.setOpcode(X86::POP32r);
+ TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg());
+ printMCInst(&TmpInst);
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it. We know that this operand flag occurs at most once per function.
+ SmallString<64> Name;
+ raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix()
+ << "picbaseref" << getFunctionNumber();
+ MCSymbol *DotSym = OutContext.GetOrCreateSymbol(Name.str());
+ OutStreamer.EmitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetExternalSymbolSymbol(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::Create(MCInstLowering.GetPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+ DotExpr, OutContext);
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::ADD32ri);
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+ TmpInst.addOperand(MCOperand::CreateExpr(DotExpr));
+ printMCInst(&TmpInst);
+ return;
+ }
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+
+ printMCInst(&TmpInst);
+}
+
diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.h b/lib/Target/X86/AsmPrinter/X86MCInstLower.h
new file mode 100644
index 0000000..fa25b90
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.h
@@ -0,0 +1,54 @@
+//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_MCINSTLOWER_H
+#define X86_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MCSymbol;
+ class MachineInstr;
+ class MachineModuleInfoMachO;
+ class MachineOperand;
+ class Mangler;
+ class X86AsmPrinter;
+ class X86Subtarget;
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class VISIBILITY_HIDDEN X86MCInstLower {
+ MCContext &Ctx;
+ Mangler *Mang;
+ X86AsmPrinter &AsmPrinter;
+
+ const X86Subtarget &getSubtarget() const;
+public:
+ X86MCInstLower(MCContext &ctx, Mangler *mang, X86AsmPrinter &asmprinter)
+ : Ctx(ctx), Mang(mang), AsmPrinter(asmprinter) {}
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetPICBaseSymbol() const;
+
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 7ea0e51..3ad65fb 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -7,13 +7,15 @@ tablegen(X86GenInstrNames.inc -gen-instr-enums)
tablegen(X86GenInstrInfo.inc -gen-instr-desc)
tablegen(X86GenAsmWriter.inc -gen-asm-writer)
tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
tablegen(X86GenDAGISel.inc -gen-dag-isel)
tablegen(X86GenFastISel.inc -gen-fast-isel)
tablegen(X86GenCallingConv.inc -gen-callingconv)
tablegen(X86GenSubtarget.inc -gen-subtarget)
-add_llvm_target(X86CodeGen
+set(sources
X86CodeEmitter.cpp
+ X86COFFMachineModuleInfo.cpp
X86ELFWriterInfo.cpp
X86FloatingPoint.cpp
X86FloatingPointRegKill.cpp
@@ -21,11 +23,19 @@ add_llvm_target(X86CodeGen
X86ISelLowering.cpp
X86InstrInfo.cpp
X86JITInfo.cpp
+ X86MCAsmInfo.cpp
X86RegisterInfo.cpp
X86Subtarget.cpp
- X86TargetAsmInfo.cpp
X86TargetMachine.cpp
+ X86TargetObjectFile.cpp
X86FastISel.cpp
)
+if( CMAKE_CL_64 )
+ enable_language(ASM_MASM)
+ set(sources ${sources} X86CompilationCallback_Win64.asm)
+endif()
+
+add_llvm_target(X86CodeGen ${sources})
+
target_link_libraries (LLVMX86CodeGen LLVMSelectionDAG)
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
index 44f1c5d..220831d 100644
--- a/lib/Target/X86/Makefile
+++ b/lib/Target/X86/Makefile
@@ -13,11 +13,11 @@ TARGET = X86
# Make sure that tblgen is run, first thing.
BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
X86GenRegisterInfo.inc X86GenInstrNames.inc \
- X86GenInstrInfo.inc X86GenAsmWriter.inc \
+ X86GenInstrInfo.inc X86GenAsmWriter.inc X86GenAsmMatcher.inc \
X86GenAsmWriter1.inc X86GenDAGISel.inc \
X86GenFastISel.inc \
X86GenCallingConv.inc X86GenSubtarget.inc
-DIRS = AsmPrinter
+DIRS = AsmPrinter AsmParser TargetInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index ad12137..e8f7c5d 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -249,3 +249,52 @@ lowered return value, and it would free non-C frontends from a
complication only required by a C-based ABI.
//===---------------------------------------------------------------------===//
+
+We get a redundant zero extension for code like this:
+
+int mask[1000];
+int foo(unsigned x) {
+ if (x < 10)
+ x = x * 45;
+ else
+ x = x * 78;
+ return mask[x];
+}
+
+_foo:
+LBB1_0: ## entry
+ cmpl $9, %edi
+ jbe LBB1_3 ## bb
+LBB1_1: ## bb1
+ imull $78, %edi, %eax
+LBB1_2: ## bb2
+ movl %eax, %eax <----
+ movq _mask@GOTPCREL(%rip), %rcx
+ movl (%rcx,%rax,4), %eax
+ ret
+LBB1_3: ## bb
+ imull $45, %edi, %eax
+ jmp LBB1_2 ## bb2
+
+Before regalloc, we have:
+
+ %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
+ JMP mbb<bb2,0x203afb0>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
+ Predecessors according to CFG: 0x203aec0 (#0)
+ %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
+ Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
+ %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
+ %reg1026, mbb<bb1,0x203af60>
+ %reg1029<def> = MOVZX64rr32 %reg1027
+
+so we'd have to know that IMUL32rri8 leaves the upper 32 bits of the 64-bit
+register zeroed and be able to recognize the zero extend. This could also
+presumably be implemented if we have whole-function selectiondags.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 4464878..046d35c 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1932,3 +1932,23 @@ Replacing an icmp+select with a shift should always be considered profitable in
instcombine.
//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. we only care about the value in
+memory), x86 does not have to use xadd to implement these. Instead, it can use
+the add, sub, inc, and dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of an instruction selection trick.
+The issue is that the target-independent pattern produces one output and a
+chain, and we want to map it into one that just outputs a chain. The current
+trick is to select it into a MERGE_VALUES with the first definition being an
+implicit_def. The proper solution is to add new ISD opcodes for the no-output
+variant. The DAG combiner can then transform the node before it gets to target
+node selection.
+
+Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
+in fact these instructions are identical to the non-lock versions. We need a
+way to attach target-specific information to target nodes and have it carried
+over to machine instructions. The asm printer (or JIT) can then use this
+information to add the "lock" prefix.
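+
+A sketch of the desired codegen (illustrative, assuming the result is unused):
+
+void f(volatile long *p) {
+  __sync_add_and_fetch(p, 1);
+}
+
+could then compile on x86-64 to just
+
+_f:
+        lock incq       (%rdi)
+        ret
+
+rather than a lock xadd followed by a dead register copy.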
diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..90be9f5
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86Info
+ X86TargetInfo.cpp
+ )
+
+add_dependencies(LLVMX86Info X86CodeGenTable_gen)
diff --git a/lib/Target/X86/TargetInfo/Makefile b/lib/Target/X86/TargetInfo/Makefile
new file mode 100644
index 0000000..6677d4b
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..08d4d84
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true>
+ X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+ Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 22de3f6..a167118 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -22,8 +22,10 @@ namespace llvm {
class X86TargetMachine;
class FunctionPass;
class MachineCodeEmitter;
+class MCCodeEmitter;
class JITCodeEmitter;
-class raw_ostream;
+class ObjectCodeEmitter;
+class Target;
+class formatted_raw_ostream;
/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
@@ -42,13 +44,6 @@ FunctionPass *createX86FloatingPointStackifierPass();
///
FunctionPass *createX87FPRegKillInserterPass();
-/// createX86CodePrinterPass - Returns a pass that prints the X86
-/// assembly code for a MachineFunction to the given output stream,
-/// using the given target machine description.
-///
-FunctionPass *createX86CodePrinterPass(raw_ostream &o, X86TargetMachine &tm,
- bool Verbose);
-
/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
/// to the specified MCE object.
@@ -56,6 +51,10 @@ FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
MachineCodeEmitter &MCE);
FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
JITCodeEmitter &JCE);
+FunctionPass *createX86ObjectCodeEmitterPass(X86TargetMachine &TM,
+ ObjectCodeEmitter &OCE);
+
+MCCodeEmitter *createX86MCCodeEmitter(const Target &, TargetMachine &TM);
/// createX86EmitCodeToMemory - Returns a pass that converts a register
/// allocated function into raw machine code in a dynamically
@@ -68,6 +67,8 @@ FunctionPass *createEmitX86CodeToMemory();
///
FunctionPass *createX86MaxStackAlignmentCalculatorPass();
+extern Target TheX86_32Target, TheX86_64Target;
+
} // End llvm namespace
// Defines symbolic names for X86 registers. This defines a mapping from
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 47861d5..da467fe 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -19,12 +19,17 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// X86 Subtarget features.
//===----------------------------------------------------------------------===//
-
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
"Enable MMX instructions">;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions",
- [FeatureMMX]>;
+ // SSE codegen depends on cmovs, and all
+ // SSE1+ processors support them.
+ [FeatureMMX, FeatureCMOV]>;
def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
"Enable SSE2 instructions",
[FeatureSSE1]>;
@@ -76,8 +81,8 @@ def : Proc<"i586", []>;
def : Proc<"pentium", []>;
def : Proc<"pentium-mmx", [FeatureMMX]>;
def : Proc<"i686", []>;
-def : Proc<"pentiumpro", []>;
-def : Proc<"pentium2", [FeatureMMX]>;
+def : Proc<"pentiumpro", [FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
def : Proc<"pentium3", [FeatureSSE1]>;
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureSSE2]>;
@@ -178,21 +183,34 @@ include "X86CallingConv.td"
// Assembly Printers
//===----------------------------------------------------------------------===//
+// Currently the X86 assembly parser only supports ATT syntax.
+def ATTAsmParser : AsmParser {
+ string AsmParserClassName = "ATTAsmParser";
+ int Variant = 0;
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
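+// For example (illustrative), with these settings the parser accepts
+// "movl %eax, %ecx # copy eax", discarding the "#" comment and matching
+// the "%"-prefixed register tokens against the register table.
+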
// The X86 target supports two different syntaxes for emitting machine code.
// This is controlled by the -x86-asm-syntax={att|intel}
def ATTAsmWriter : AsmWriter {
- string AsmWriterClassName = "ATTAsmPrinter";
+ string AsmWriterClassName = "ATTInstPrinter";
int Variant = 0;
}
def IntelAsmWriter : AsmWriter {
- string AsmWriterClassName = "IntelAsmPrinter";
+ string AsmWriterClassName = "IntelInstPrinter";
int Variant = 1;
}
-
def X86 : Target {
// Information about the instructions...
let InstructionSet = X86InstrInfo;
+ let AssemblyParsers = [ATTAsmParser];
+
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
}
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 0000000..01c4fcf
--- /dev/null
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,123 @@
+//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Implementation -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (Windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+X86COFFMachineModuleInfo::X86COFFMachineModuleInfo(const MachineModuleInfo &) {
+}
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
+
+void X86COFFMachineModuleInfo::AddFunctionInfo(const Function *F,
+ const X86MachineFunctionInfo &Val) {
+ FunctionInfoMap[F] = Val;
+}
+
+
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData &TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ unsigned argNum = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI, ++argNum) {
+ const Type* Ty = AI->getType();
+
+ // 'Dereference' type in case of byval parameter attribute
+ if (F->paramHasAttr(argNum, Attribute::ByVal))
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ // Size should be aligned to DWORD boundary
+ Size += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+ }
+
+ // We're not supporting tooooo huge arguments :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
+
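+// Worked example of the size computation above (illustrative): for
+// "void __stdcall f(int, char, double)" the allocation sizes 4, 1, and 8
+// round up to 4 + 4 + 8 = 16 bytes, so f would be decorated as _f@16.
+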
+
+/// DecorateCygMingName - Query FunctionInfoMap and use this information for
+/// various name decorations for Cygwin and MinGW.
+void X86COFFMachineModuleInfo::DecorateCygMingName(SmallVectorImpl<char> &Name,
+ const GlobalValue *GV,
+ const TargetData &TD) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // Save function name for later type emission.
+ if (F->isDeclaration())
+ CygMingStubs.insert(StringRef(Name.data(), Name.size()));
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ const X86MachineFunctionInfo *Info;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+ if (info_item == FunctionInfoMap.end()) {
+    // Calculate appropriate function info and populate the map.
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TD);
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ if (Info->getDecorationStyle() == None) return;
+ const FunctionType *FT = F->getFunctionType();
+
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || FT->getNumParams() == 0 ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ raw_svector_ostream(Name) << '@' << Info->getBytesToPopOnReturn();
+
+ if (Info->getDecorationStyle() == FastCall) {
+ if (Name[0] == '_')
+ Name[0] = '@';
+ else
+ Name.insert(Name.begin(), '@');
+ }
+}
+
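+// Illustrative results of the decoration above (assumed i386 Cygwin/MinGW
+// output, where the global prefix is '_'): a stdcall "int f(int, int)" is
+// emitted as "_f@8", and a fastcall "int g(int)" as "@g@4"; the trailing
+// number is the byte count of the argument area.
+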
+/// DecorateCygMingName - Query FunctionInfoMap and use this information for
+/// various name decorations for Cygwin and MinGW.
+void X86COFFMachineModuleInfo::DecorateCygMingName(std::string &Name,
+ const GlobalValue *GV,
+ const TargetData &TD) {
+ SmallString<128> NameStr(Name.begin(), Name.end());
+ DecorateCygMingName(NameStr, GV, TD);
+ Name.assign(NameStr.begin(), NameStr.end());
+}
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 0000000..afd5525
--- /dev/null
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,67 @@
+//===-- X86COFFMachineModuleInfo.h - X86 COFF MMI Impl ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (Windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/StringSet.h"
+
+namespace llvm {
+ class X86MachineFunctionInfo;
+ class TargetData;
+
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+ StringSet<> CygMingStubs;
+
+  // We have to propagate some information about MachineFunction to
+  // AsmPrinter. That is fine while we are printing the function itself, since
+  // we have access to the MachineFunction and can get the appropriate
+  // MachineFunctionInfo. Unfortunately, it is not possible when we are only
+  // printing a reference to a Function (e.g. when calling it), and the
+  // corresponding MachineFunction may not even have been created yet. That is
+  // why we collect all the necessary information in this additional map.
+  //
+  // This structure is used e.g. for name decoration of stdcall and fastcall
+  // functions, since the decoration depends on the arguments' size.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+public:
+ X86COFFMachineModuleInfo(const MachineModuleInfo &);
+ ~X86COFFMachineModuleInfo();
+
+
+ void DecorateCygMingName(std::string &Name, const GlobalValue *GV,
+ const TargetData &TD);
+ void DecorateCygMingName(SmallVectorImpl<char> &Name, const GlobalValue *GV,
+ const TargetData &TD);
+
+ void AddFunctionInfo(const Function *F, const X86MachineFunctionInfo &Val);
+
+
+ typedef StringSet<>::const_iterator stub_iterator;
+ stub_iterator stub_begin() const { return CygMingStubs.begin(); }
+ stub_iterator stub_end() const { return CygMingStubs.end(); }
+
+
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index e9fcbd5..d77f039 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -89,7 +89,7 @@ def RetCC_X86_64_C : CallingConv<[
// X86-Win64 C return-value convention.
def RetCC_X86_Win64_C : CallingConv<[
// The X86-Win64 calling convention always returns __m64 values in RAX.
- CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[RAX]>>,
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCBitConvertToType<i64>>,
// And FP in XMM0 only.
CCIfType<[f32], CCAssignToReg<[XMM0]>>,
@@ -137,26 +137,26 @@ def CC_X86_64_C : CallingConv<[
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
+ // The first 6 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCBitConvertToType<i64>>>,
+
// The first 6 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
-
- // The first 8 FP/Vector arguments are passed in XMM registers.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfSubtarget<"hasSSE1()",
- CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
// The first 8 MMX (except for v1i64) vector arguments are passed in XMM
// registers on Darwin.
CCIfType<[v8i8, v4i16, v2i32, v2f32],
CCIfSubtarget<"isTargetDarwin()",
CCIfSubtarget<"hasSSE2()",
- CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>>,
+ CCPromoteToType<v2i64>>>>,
- // The first 8 v1i64 vector arguments are passed in GPRs on Darwin.
- CCIfType<[v1i64],
- CCIfSubtarget<"isTargetDarwin()",
- CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
@@ -184,6 +184,13 @@ def CC_X86_Win64_C : CallingConv<[
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
+  // 128-bit vectors are passed by pointer.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
+ CCBitConvertToType<i64>>,
+
// The first 4 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
[XMM0, XMM1, XMM2, XMM3]>>,
@@ -195,24 +202,16 @@ def CC_X86_Win64_C : CallingConv<[
CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
[RCX , RDX , R8 , R9 ]>>,
- // The first 4 MMX vector arguments are passed in GPRs.
- CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
- CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
- [XMM0, XMM1, XMM2, XMM3]>>,
-
// Integer/FP values get stored in stack slots that are 8 bytes in size and
- // 16-byte aligned if there are no more registers to hold them.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 16>>,
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
// Long doubles get stack slots whose size and alignment depends on the
// subtarget.
CCIfType<[f80], CCAssignToStack<0, 0>>,
- // Vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
-
- // __m64 vectors get 8-byte stack slots that are 16-byte aligned.
- CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 16>>
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index d5846a0..f942f3f 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -22,21 +22,27 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/ObjectCodeEmitter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Function.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
STATISTIC(NumEmitted, "Number of machine instructions emitted");
namespace {
-template<class CodeEmitter>
+ template<class CodeEmitter>
class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
const X86InstrInfo *II;
const TargetData *TD;
@@ -67,6 +73,7 @@ template<class CodeEmitter>
const TargetInstrDesc *Desc);
void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
AU.addRequired<MachineModuleInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -83,7 +90,7 @@ template<class CodeEmitter>
intptr_t PCAdj = 0);
void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
- intptr_t PCAdj = 0);
+ intptr_t Adj = 0, bool IsPCRel = true);
void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
void emitRegModRMByte(unsigned RegOpcodeField);
@@ -95,29 +102,27 @@ template<class CodeEmitter>
intptr_t PCAdj = 0);
unsigned getX86RegNum(unsigned RegNo) const;
-
- bool gvNeedsNonLazyPtr(const GlobalValue *GV);
};
template<class CodeEmitter>
char Emitter<CodeEmitter>::ID = 0;
-}
+} // end anonymous namespace.
/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
/// to the specified templated MachineCodeEmitter object.
-namespace llvm {
-
-FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
- MachineCodeEmitter &MCE) {
+FunctionPass *llvm::createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE) {
return new Emitter<MachineCodeEmitter>(TM, MCE);
}
-FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
- JITCodeEmitter &JCE) {
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE) {
return new Emitter<JITCodeEmitter>(TM, JCE);
}
-
-} // end namespace llvm
+FunctionPass *llvm::createX86ObjectCodeEmitterPass(X86TargetMachine &TM,
+ ObjectCodeEmitter &OCE) {
+ return new Emitter<ObjectCodeEmitter>(TM, OCE);
+}
template<class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
@@ -130,7 +135,8 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
do {
- DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ DEBUG(errs() << "JITTing function '"
+ << MF.getFunction()->getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
@@ -172,7 +178,7 @@ void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
intptr_t PCAdj /* = 0 */,
bool NeedStub /* = false */,
bool Indirect /* = false */) {
- intptr_t RelocCST = 0;
+ intptr_t RelocCST = Disp;
if (Reloc == X86::reloc_picrel_word)
RelocCST = PICBaseOffset;
else if (Reloc == X86::reloc_pcrel_word)
@@ -291,53 +297,61 @@ static bool isDisp8(int Value) {
return Value == (signed char)Value;
}
-template<class CodeEmitter>
-bool Emitter<CodeEmitter>::gvNeedsNonLazyPtr(const GlobalValue *GV) {
- // For Darwin, simulate the linktime GOT by using the same non-lazy-pointer
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+ const TargetMachine &TM) {
+ // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
// mechanism as 32-bit mode.
- return (!Is64BitMode || TM.getSubtarget<X86Subtarget>().isTargetDarwin()) &&
- TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+ return false;
+
+ // Return true if this is a reference to a stub containing the address of the
+ // global, not the global itself.
+ return isGlobalStubReference(GVOp.getTargetFlags());
}
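+// For instance (illustrative): on 32-bit Darwin a reference to a global that
+// is not known to be defined locally is emitted through a stub slot such as
+// L_foo$non_lazy_ptr, which holds &foo; operands referring to such stubs
+// carry the corresponding MO_DARWIN_* target flags.
+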
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
- int DispVal, intptr_t PCAdj) {
+ int DispVal,
+ intptr_t Adj /* = 0 */,
+ bool IsPCRel /* = true */) {
// If this is a simple integer displacement that doesn't require a relocation,
// emit it now.
if (!RelocOp) {
emitConstant(DispVal, 4);
return;
}
-
+
// Otherwise, this is something that requires a relocation. Emit it as such
// now.
+ unsigned RelocType = Is64BitMode ?
+ (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (RelocOp->isGlobal()) {
// In 64-bit static small code model, we could potentially emit absolute.
- // But it's probably not beneficial.
+    // But it's probably not beneficial. If the MCE supports using RIP
+    // directly, do so; otherwise fall back to absolute references (this is
+    // determined by IsPCRel).
// 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
// 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
- : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
bool NeedStub = isa<Function>(RelocOp->getGlobal());
- bool Indirect = gvNeedsNonLazyPtr(RelocOp->getGlobal());
- emitGlobalAddress(RelocOp->getGlobal(), rt, RelocOp->getOffset(),
- PCAdj, NeedStub, Indirect);
+ bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+ emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+ Adj, NeedStub, Indirect);
+ } else if (RelocOp->isSymbol()) {
+ emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
} else if (RelocOp->isCPI()) {
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
- emitConstPoolAddress(RelocOp->getIndex(), rt,
- RelocOp->getOffset(), PCAdj);
- } else if (RelocOp->isJTI()) {
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
- emitJumpTableAddress(RelocOp->getIndex(), rt, PCAdj);
+ emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+ RelocOp->getOffset(), Adj);
} else {
- assert(0 && "Unknown value to relocate!");
+ assert(RelocOp->isJTI() && "Unexpected machine operand!");
+ emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
}
}
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
- unsigned Op, unsigned RegOpcodeField,
- intptr_t PCAdj) {
+ unsigned Op,unsigned RegOpcodeField,
+ intptr_t PCAdj) {
const MachineOperand &Op3 = MI.getOperand(Op+3);
int DispVal = 0;
const MachineOperand *DispForReloc = 0;
@@ -345,15 +359,17 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
// Figure out what sort of displacement we have to handle here.
if (Op3.isGlobal()) {
DispForReloc = &Op3;
+ } else if (Op3.isSymbol()) {
+ DispForReloc = &Op3;
} else if (Op3.isCPI()) {
- if (Is64BitMode || IsPIC) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
DispForReloc = &Op3;
} else {
DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
DispVal += Op3.getOffset();
}
} else if (Op3.isJTI()) {
- if (Is64BitMode || IsPIC) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
DispForReloc = &Op3;
} else {
DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
@@ -368,17 +384,23 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
unsigned BaseReg = Base.getReg();
+  // Indicate whether the displacement will use a pcrel or absolute reference
+  // by default. MCEs able to resolve addresses on-the-fly use pcrel by
+  // default, while others, unless explicitly asked to use RIP, use absolute
+  // references.
+  bool IsPCRel = MCE.earlyResolveAddresses();
+
// Is a SIB byte needed?
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
- IndexReg.getReg() == 0 &&
- (BaseReg == 0 || BaseReg == X86::RIP ||
- getX86RegNum(BaseReg) != N86::ESP)) {
- if (BaseReg == 0 ||
- BaseReg == X86::RIP) { // Just a displacement?
+ IndexReg.getReg() == 0 &&
+ ((BaseReg == 0 && MCE.earlyResolveAddresses()) || BaseReg == X86::RIP ||
+ (BaseReg != 0 && getX86RegNum(BaseReg) != N86::ESP))) {
+ if (BaseReg == 0 || BaseReg == X86::RIP) { // Just a displacement?
// Emit special case [disp32] encoding
MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
-
- emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
} else {
unsigned BaseRegNo = getX86RegNum(BaseReg);
if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
@@ -391,7 +413,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
} else {
// Emit the most general non-SIB encoding: [REG+disp32]
MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
- emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
}
}
@@ -427,13 +449,13 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
- // Handle the SIB byte for the case where there is no base. The
- // displacement has already been output.
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
IndexRegNo = getX86RegNum(IndexReg.getReg());
- else
- IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
emitSIBByte(SS, IndexRegNo, 5);
} else {
unsigned BaseRegNo = getX86RegNum(BaseReg);
@@ -449,21 +471,23 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
if (ForceDisp8) {
emitConstant(DispVal, 1);
} else if (DispVal != 0 || ForceDisp32) {
- emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
}
}
}
template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitInstruction(
- const MachineInstr &MI,
- const TargetInstrDesc *Desc) {
- DOUT << MI;
+void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI,
+ const TargetInstrDesc *Desc) {
+ DEBUG(errs() << MI);
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
unsigned Opcode = Desc->Opcode;
// Emit the lock opcode prefix as needed.
- if (Desc->TSFlags & X86II::LOCK) MCE.emitByte(0xF0);
+ if (Desc->TSFlags & X86II::LOCK)
+ MCE.emitByte(0xF0);
// Emit segment override opcode prefix as needed.
switch (Desc->TSFlags & X86II::SegOvrMask) {
@@ -473,18 +497,21 @@ void Emitter<CodeEmitter>::emitInstruction(
case X86II::GS:
MCE.emitByte(0x65);
break;
- default: assert(0 && "Invalid segment!");
+ default: llvm_unreachable("Invalid segment!");
case 0: break; // No segment override!
}
// Emit the repeat opcode prefix as needed.
- if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+ MCE.emitByte(0xF3);
// Emit the operand size opcode prefix as needed.
- if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+ if (Desc->TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
// Emit the address size opcode prefix as needed.
- if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+ if (Desc->TSFlags & X86II::AdSize)
+ MCE.emitByte(0x67);
bool Need0FPrefix = false;
switch (Desc->TSFlags & X86II::Op0Mask) {
@@ -493,6 +520,10 @@ void Emitter<CodeEmitter>::emitInstruction(
case X86II::TA: // 0F 3A
Need0FPrefix = true;
break;
+ case X86II::TF: // F2 0F 38
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
case X86II::REP: break; // already handled.
case X86II::XS: // F3 0F
MCE.emitByte(0xF3);
@@ -508,14 +539,13 @@ void Emitter<CodeEmitter>::emitInstruction(
(((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
>> X86II::Op0Shift));
break; // Two-byte opcode prefix
- default: assert(0 && "Invalid prefix!");
+ default: llvm_unreachable("Invalid prefix!");
case 0: break; // No prefix!
}
+ // Handle REX prefix.
if (Is64BitMode) {
- // REX prefix
- unsigned REX = X86InstrInfo::determineREX(MI);
- if (REX)
+ if (unsigned REX = X86InstrInfo::determineREX(MI))
MCE.emitByte(0x40 | REX);
}
@@ -524,7 +554,8 @@ void Emitter<CodeEmitter>::emitInstruction(
MCE.emitByte(0x0F);
switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::T8: // 0F 38
+ case X86II::TF: // F2 0F 38
+ case X86II::T8: // 0F 38
MCE.emitByte(0x38);
break;
case X86II::TA: // 0F 3A
@@ -543,29 +574,29 @@ void Emitter<CodeEmitter>::emitInstruction(
unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
switch (Desc->TSFlags & X86II::FormMask) {
- default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ default:
+ llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
case X86II::Pseudo:
// Remember the current PC offset, this is the PIC relocation
// base address.
switch (Opcode) {
default:
- assert(0 && "psuedo instructions should be removed before code emission");
+ llvm_unreachable("psuedo instructions should be removed before code"
+ " emission");
break;
- case TargetInstrInfo::INLINEASM: {
+ case TargetInstrInfo::INLINEASM:
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
- if (MI.getOperand(0).getSymbolName()[0]) {
- assert(0 && "JIT does not support inline asm!\n");
- abort();
- }
+ if (MI.getOperand(0).getSymbolName()[0])
+ llvm_report_error("JIT does not support inline asm!");
break;
- }
case TargetInstrInfo::DBG_LABEL:
case TargetInstrInfo::EH_LABEL:
+ case TargetInstrInfo::GC_LABEL:
MCE.emitLabel(MI.getOperand(0).getImm());
break;
case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
+ case TargetInstrInfo::KILL:
case X86::DWARF_LOC:
case X86::FP_REG_KILL:
break;
@@ -582,73 +613,86 @@ void Emitter<CodeEmitter>::emitInstruction(
}
CurOp = NumOps;
break;
- case X86II::RawFrm:
+ case X86II::RawFrm: {
MCE.emitByte(BaseOpcode);
- if (CurOp != NumOps) {
- const MachineOperand &MO = MI.getOperand(CurOp++);
-
- DOUT << "RawFrm CurOp " << CurOp << "\n";
- DOUT << "isMBB " << MO.isMBB() << "\n";
- DOUT << "isGlobal " << MO.isGlobal() << "\n";
- DOUT << "isSymbol " << MO.isSymbol() << "\n";
- DOUT << "isImm " << MO.isImm() << "\n";
-
- if (MO.isMBB()) {
- emitPCRelativeBlockAddress(MO.getMBB());
- } else if (MO.isGlobal()) {
- // Assume undefined functions may be outside the Small codespace.
- bool NeedStub =
- (Is64BitMode &&
- (TM.getCodeModel() == CodeModel::Large ||
- TM.getSubtarget<X86Subtarget>().isTargetDarwin())) ||
- Opcode == X86::TAILJMPd;
- emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
- MO.getOffset(), 0, NeedStub);
- } else if (MO.isSymbol()) {
- emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
- } else if (MO.isImm()) {
- if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
- // Fix up immediate operand for pc relative calls.
- intptr_t Imm = (intptr_t)MO.getImm();
- Imm = Imm - MCE.getCurrentPCValue() - 4;
- emitConstant(Imm, X86InstrInfo::sizeOfImm(Desc));
- } else
- emitConstant(MO.getImm(), X86InstrInfo::sizeOfImm(Desc));
- } else {
- assert(0 && "Unknown RawFrm operand!");
- }
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+
+ DEBUG(errs() << "RawFrm CurOp " << CurOp << "\n");
+ DEBUG(errs() << "isMBB " << MO.isMBB() << "\n");
+ DEBUG(errs() << "isGlobal " << MO.isGlobal() << "\n");
+ DEBUG(errs() << "isSymbol " << MO.isSymbol() << "\n");
+ DEBUG(errs() << "isImm " << MO.isImm() << "\n");
+
+ if (MO.isMBB()) {
+ emitPCRelativeBlockAddress(MO.getMBB());
+ break;
}
+
+ if (MO.isGlobal()) {
+ // Assume undefined functions may be outside the Small codespace.
+ bool NeedStub =
+ (Is64BitMode &&
+ (TM.getCodeModel() == CodeModel::Large ||
+ TM.getSubtarget<X86Subtarget>().isTargetDarwin())) ||
+ Opcode == X86::TAILJMPd;
+ emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+ MO.getOffset(), 0, NeedStub);
+ break;
+ }
+
+ if (MO.isSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ break;
+ }
+
+ assert(MO.isImm() && "Unknown RawFrm operand!");
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+ // Fix up immediate operand for pc relative calls.
+ intptr_t Imm = (intptr_t)MO.getImm();
+ Imm = Imm - MCE.getCurrentPCValue() - 4;
+ emitConstant(Imm, X86InstrInfo::sizeOfImm(Desc));
+ } else
+ emitConstant(MO.getImm(), X86InstrInfo::sizeOfImm(Desc));
break;
-
- case X86II::AddRegFrm:
+ }
+
+ case X86II::AddRegFrm: {
MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
- if (CurOp != NumOps) {
- const MachineOperand &MO1 = MI.getOperand(CurOp++);
- unsigned Size = X86InstrInfo::sizeOfImm(Desc);
- if (MO1.isImm())
- emitConstant(MO1.getImm(), Size);
- else {
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
- : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
- // This should not occur on Darwin for relocatable objects.
- if (Opcode == X86::MOV64ri)
- rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
- if (MO1.isGlobal()) {
- bool NeedStub = isa<Function>(MO1.getGlobal());
- bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
- emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
- NeedStub, Indirect);
- } else if (MO1.isSymbol())
- emitExternalSymbolAddress(MO1.getSymbolName(), rt);
- else if (MO1.isCPI())
- emitConstPoolAddress(MO1.getIndex(), rt);
- else if (MO1.isJTI())
- emitJumpTableAddress(MO1.getIndex(), rt);
- }
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
}
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri64i32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ // This should not occur on Darwin for relocatable objects.
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
break;
+ }
case X86II::MRMDestReg: {
MCE.emitByte(BaseOpcode);
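A quick worked example of the pc-relative fixup in the RawFrm case above, with invented numbers: if a CALLpcrel32 carries the absolute target 0x2000 in its immediate operand and MCE.getCurrentPCValue() reports 0x1000 just before the 4-byte immediate is emitted, the encoder stores 0x2000 - 0x1000 - 4 = 0xffc, because x86 call displacements are measured from the end of the immediate field, i.e. from the start of the next instruction.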
@@ -656,7 +700,8 @@ void Emitter<CodeEmitter>::emitInstruction(
getX86RegNum(MI.getOperand(CurOp+1).getReg()));
CurOp += 2;
if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
break;
}
case X86II::MRMDestMem: {
@@ -666,7 +711,8 @@ void Emitter<CodeEmitter>::emitInstruction(
.getReg()));
CurOp += X86AddrNumOperands + 1;
if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
break;
}
@@ -729,29 +775,31 @@ void Emitter<CodeEmitter>::emitInstruction(
(Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
}
- if (CurOp != NumOps) {
- const MachineOperand &MO1 = MI.getOperand(CurOp++);
- unsigned Size = X86InstrInfo::sizeOfImm(Desc);
- if (MO1.isImm())
- emitConstant(MO1.getImm(), Size);
- else {
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
- : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
- if (Opcode == X86::MOV64ri32)
- rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
- if (MO1.isGlobal()) {
- bool NeedStub = isa<Function>(MO1.getGlobal());
- bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
- emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
- NeedStub, Indirect);
- } else if (MO1.isSymbol())
- emitExternalSymbolAddress(MO1.getSymbolName(), rt);
- else if (MO1.isCPI())
- emitConstPoolAddress(MO1.getIndex(), rt);
- else if (MO1.isJTI())
- emitJumpTableAddress(MO1.getIndex(), rt);
- }
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
}
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
break;
}
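The reloc_absolute_word_sext introduced above for MOV64ri32 is needed because that instruction's 32-bit immediate is sign-extended to 64 bits at execution time, so the field must resolve through a sign-extending relocation (mapped to R_X86_64_32S in the X86ELFWriterInfo change later in this patch) rather than the zero-extending R_X86_64_32.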
@@ -768,29 +816,31 @@ void Emitter<CodeEmitter>::emitInstruction(
PCAdj);
CurOp += X86AddrNumOperands;
- if (CurOp != NumOps) {
- const MachineOperand &MO = MI.getOperand(CurOp++);
- unsigned Size = X86InstrInfo::sizeOfImm(Desc);
- if (MO.isImm())
- emitConstant(MO.getImm(), Size);
- else {
- unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
- : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
- if (Opcode == X86::MOV64mi32)
- rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
- if (MO.isGlobal()) {
- bool NeedStub = isa<Function>(MO.getGlobal());
- bool Indirect = gvNeedsNonLazyPtr(MO.getGlobal());
- emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
- NeedStub, Indirect);
- } else if (MO.isSymbol())
- emitExternalSymbolAddress(MO.getSymbolName(), rt);
- else if (MO.isCPI())
- emitConstPoolAddress(MO.getIndex(), rt);
- else if (MO.isJTI())
- emitJumpTableAddress(MO.getIndex(), rt);
- }
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO.isImm()) {
+ emitConstant(MO.getImm(), Size);
+ break;
}
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO.isGlobal()) {
+ bool NeedStub = isa<Function>(MO.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+ emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), rt);
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), rt);
break;
}
@@ -804,10 +854,264 @@ void Emitter<CodeEmitter>::emitInstruction(
}
if (!Desc->isVariadic() && CurOp != NumOps) {
- cerr << "Cannot encode: ";
- MI.dump();
- cerr << '\n';
- abort();
+#ifndef NDEBUG
+ errs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+ llvm_unreachable(0);
+ }
+
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+// Adapt the Emitter / CodeEmitter interfaces to MCCodeEmitter.
+//
+// FIXME: This is a total hack designed to allow work on llvm-mc to proceed
+// without being blocked on various cleanups needed to support a clean interface
+// to instruction encoding.
+//
+// Look away!
+
+#include "llvm/DerivedTypes.h"
+
+namespace {
+class MCSingleInstructionCodeEmitter : public MachineCodeEmitter {
+ uint8_t Data[256];
+
+public:
+ MCSingleInstructionCodeEmitter() { reset(); }
+
+ void reset() {
+ BufferBegin = Data;
+ BufferEnd = array_endof(Data);
+ CurBufferPtr = Data;
+ }
+
+ StringRef str() {
+ return StringRef(reinterpret_cast<char*>(BufferBegin),
+ CurBufferPtr - BufferBegin);
+ }
+
+ virtual void startFunction(MachineFunction &F) {}
+ virtual bool finishFunction(MachineFunction &F) { return false; }
+ virtual void emitLabel(uint64_t LabelID) {}
+ virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {}
+ virtual bool earlyResolveAddresses() const { return false; }
+ virtual void addRelocation(const MachineRelocation &MR) { }
+ virtual uintptr_t getConstantPoolEntryAddress(unsigned Index) const {
+ return 0;
+ }
+ virtual uintptr_t getJumpTableEntryAddress(unsigned Index) const {
+ return 0;
+ }
+ virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+ return 0;
+ }
+ virtual uintptr_t getLabelAddress(uint64_t LabelID) const {
+ return 0;
+ }
+ virtual void setModuleInfo(MachineModuleInfo* Info) {}
+};
+
+class X86MCCodeEmitter : public MCCodeEmitter {
+ X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+
+private:
+ X86TargetMachine &TM;
+ llvm::Function *DummyF;
+ TargetData *DummyTD;
+ mutable llvm::MachineFunction *DummyMF;
+ llvm::MachineBasicBlock *DummyMBB;
+
+ MCSingleInstructionCodeEmitter *InstrEmitter;
+ Emitter<MachineCodeEmitter> *Emit;
+
+public:
+ X86MCCodeEmitter(X86TargetMachine &_TM) : TM(_TM) {
+ // Verily, thou shouldst avert thine eyes.
+ const llvm::FunctionType *FTy =
+ FunctionType::get(llvm::Type::getVoidTy(getGlobalContext()), false);
+ DummyF = Function::Create(FTy, GlobalValue::InternalLinkage);
+ DummyTD = new TargetData("");
+ DummyMF = new MachineFunction(DummyF, TM);
+ DummyMBB = DummyMF->CreateMachineBasicBlock();
+
+ InstrEmitter = new MCSingleInstructionCodeEmitter();
+ Emit = new Emitter<MachineCodeEmitter>(TM, *InstrEmitter,
+ *TM.getInstrInfo(),
+ *DummyTD, false);
+ }
+ ~X86MCCodeEmitter() {
+ delete Emit;
+ delete InstrEmitter;
+ delete DummyMF;
+ delete DummyF;
+ }
+
+ bool AddRegToInstr(const MCInst &MI, MachineInstr *Instr,
+ unsigned Start) const {
+ if (Start + 1 > MI.getNumOperands())
+ return false;
+
+ const MCOperand &Op = MI.getOperand(Start);
+ if (!Op.isReg()) return false;
+
+ Instr->addOperand(MachineOperand::CreateReg(Op.getReg(), false));
+ return true;
+ }
+
+ bool AddImmToInstr(const MCInst &MI, MachineInstr *Instr,
+ unsigned Start) const {
+ if (Start + 1 > MI.getNumOperands())
+ return false;
+
+ const MCOperand &Op = MI.getOperand(Start);
+ if (Op.isImm()) {
+ Instr->addOperand(MachineOperand::CreateImm(Op.getImm()));
+ return true;
+ }
+ if (!Op.isExpr())
+ return false;
+
+ const MCExpr *Expr = Op.getExpr();
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ Instr->addOperand(MachineOperand::CreateImm(CE->getValue()));
+ return true;
+ }
+
+ // FIXME: Relocation / fixup.
+ Instr->addOperand(MachineOperand::CreateImm(0));
+ return true;
+ }
+
+ bool AddLMemToInstr(const MCInst &MI, MachineInstr *Instr,
+ unsigned Start) const {
+ return (AddRegToInstr(MI, Instr, Start + 0) &&
+ AddImmToInstr(MI, Instr, Start + 1) &&
+ AddRegToInstr(MI, Instr, Start + 2) &&
+ AddImmToInstr(MI, Instr, Start + 3));
+ }
+
+ bool AddMemToInstr(const MCInst &MI, MachineInstr *Instr,
+ unsigned Start) const {
+ return (AddRegToInstr(MI, Instr, Start + 0) &&
+ AddImmToInstr(MI, Instr, Start + 1) &&
+ AddRegToInstr(MI, Instr, Start + 2) &&
+ AddImmToInstr(MI, Instr, Start + 3) &&
+ AddRegToInstr(MI, Instr, Start + 4));
+ }
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS) const {
+ // Don't look yet!
+
+ // Convert the MCInst to a MachineInstr so we can (ab)use the regular
+ // emitter.
+ const X86InstrInfo &II = *TM.getInstrInfo();
+ const TargetInstrDesc &Desc = II.get(MI.getOpcode());
+ MachineInstr *Instr = DummyMF->CreateMachineInstr(Desc, DebugLoc());
+ DummyMBB->push_back(Instr);
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned NumOps = MI.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1) {
+ Instr->addOperand(MachineOperand::CreateReg(0, false));
+ ++CurOp;
+ } else if (NumOps > 2 &&
+ Desc.getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg, e.g. LXADD32.
+ --NumOps;
+
+ bool OK = true;
+ switch (Desc.TSFlags & X86II::FormMask) {
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ // Matching doesn't fill this in completely; we have to choose operand 0
+ // for a tied register.
+ OK &= AddRegToInstr(MI, Instr, 0); CurOp++;
+ OK &= AddRegToInstr(MI, Instr, CurOp++);
+ if (CurOp < NumOps)
+ OK &= AddImmToInstr(MI, Instr, CurOp);
+ break;
+
+ case X86II::RawFrm:
+ if (CurOp < NumOps) {
+ // Hack to make branches work.
+ if (!(Desc.TSFlags & X86II::ImmMask) &&
+ MI.getOperand(0).isExpr() &&
+ isa<MCSymbolRefExpr>(MI.getOperand(0).getExpr()))
+ Instr->addOperand(MachineOperand::CreateMBB(DummyMBB));
+ else
+ OK &= AddImmToInstr(MI, Instr, CurOp);
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ OK &= AddRegToInstr(MI, Instr, CurOp++);
+ if (CurOp < NumOps)
+ OK &= AddImmToInstr(MI, Instr, CurOp);
+ break;
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // Matching doesn't fill this in completely; we have to choose operand 0
+ // for a tied register.
+ OK &= AddRegToInstr(MI, Instr, 0); CurOp++;
+ if (CurOp < NumOps)
+ OK &= AddImmToInstr(MI, Instr, CurOp);
+ break;
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ OK &= AddMemToInstr(MI, Instr, CurOp); CurOp += 5;
+ if (CurOp < NumOps)
+ OK &= AddImmToInstr(MI, Instr, CurOp);
+ break;
+
+ case X86II::MRMSrcMem:
+ OK &= AddRegToInstr(MI, Instr, CurOp++);
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ OK &= AddLMemToInstr(MI, Instr, CurOp);
+ else
+ OK &= AddMemToInstr(MI, Instr, CurOp);
+ break;
+
+ case X86II::MRMDestMem:
+ OK &= AddMemToInstr(MI, Instr, CurOp); CurOp += 5;
+ OK &= AddRegToInstr(MI, Instr, CurOp);
+ break;
+
+ default:
+ case X86II::MRMInitReg:
+ case X86II::Pseudo:
+ OK = false;
+ break;
+ }
+
+ if (!OK) {
+ errs() << "couldn't convert inst '";
+ MI.dump();
+ errs() << "' to machine instr:\n";
+ Instr->dump();
+ }
+
+ InstrEmitter->reset();
+ if (OK)
+ Emit->emitInstruction(*Instr, &Desc);
+ OS << InstrEmitter->str();
+
+ Instr->eraseFromParent();
}
+};
}
+// Ok, now you can look.
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const Target &,
+ TargetMachine &TM) {
+ return new X86MCCodeEmitter(static_cast<X86TargetMachine&>(TM));
+}
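For orientation, a minimal sketch (not part of the patch) of how a client could drive the adapter above; it assumes the X86 target has been registered and an X86TargetMachine already exists, and it ignores relocations since the adapter does not model fixups yet:

  #include "llvm/ADT/SmallString.h"
  #include "llvm/MC/MCInst.h"
  #include "llvm/Support/raw_ostream.h"

  // Encode a single MCInst to raw bytes through the adapter above.
  static void EncodeOne(const Target &TheTarget, TargetMachine &TM,
                        const MCInst &MI) {
    MCCodeEmitter *CE = createX86MCCodeEmitter(TheTarget, TM);
    SmallString<16> Bytes;
    raw_svector_ostream OS(Bytes);
    CE->EncodeInstruction(MI, OS);  // Runs the MachineInstr-based emitter.
    OS.flush();                     // Bytes now holds the raw encoding.
    delete CE;
  }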
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
index 8002f98..f321778 100644
--- a/lib/Target/X86/X86CompilationCallback_Win64.asm
+++ b/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -17,10 +17,11 @@ extrn X86CompilationCallback2: PROC
X86CompilationCallback proc
push rbp
- ; Save RSP
+ ; Save RSP.
mov rbp, rsp
; Save all int arg registers
+ ; WARNING: We cannot use the register spill area - we're generating stubs by hand!
push rcx
push rdx
push r8
@@ -29,27 +30,27 @@ X86CompilationCallback proc
; Align stack on 16-byte boundary.
and rsp, -16
- ; Save all XMM arg registers
- sub rsp, 64
- movaps [rsp], xmm0
- movaps [rsp+16], xmm1
- movaps [rsp+32], xmm2
- movaps [rsp+48], xmm3
+ ; Save all XMM arg registers. Also allocate reg spill area.
+ sub rsp, 96
+ movaps [rsp +32], xmm0
+ movaps [rsp+16+32], xmm1
+ movaps [rsp+32+32], xmm2
+ movaps [rsp+48+32], xmm3
; JIT callee
- ; Pass prev frame and return address
+ ; Pass prev frame and return address.
mov rcx, rbp
mov rdx, qword ptr [rbp+8]
call X86CompilationCallback2
- ; Restore all XMM arg registers
- movaps xmm3, [rsp+48]
- movaps xmm2, [rsp+32]
- movaps xmm1, [rsp+16]
- movaps xmm0, [rsp]
+ ; Restore all XMM arg registers.
+ movaps xmm3, [rsp+48+32]
+ movaps xmm2, [rsp+32+32]
+ movaps xmm1, [rsp+16+32]
+ movaps xmm0, [rsp +32]
- ; Restore RSP
+ ; Restore RSP.
mov rsp, rbp
; Restore all int arg registers
@@ -59,7 +60,7 @@ X86CompilationCallback proc
pop rdx
pop rcx
- ; Restore RBP
+ ; Restore RBP.
pop rbp
ret
X86CompilationCallback endp
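For context on the enlarged frame above: the Win64 ABI requires a caller to reserve 32 bytes of register spill ("home") space for RCX/RDX/R8/R9 directly above the return address of the call it makes, and since this stub is generated by hand it cannot borrow anyone else's area - hence sub rsp, 96 instead of 64. The resulting layout, with offsets relative to RSP after the subtraction (illustration only):

  ; [rsp+0]..[rsp+31]  spill/home area for the X86CompilationCallback2 call
  ; [rsp+32]           xmm0
  ; [rsp+48]           xmm1
  ; [rsp+64]           xmm2
  ; [rsp+80]           xmm3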
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
index 912ab0e..1597d2b3 100644
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -14,6 +14,7 @@
#include "X86ELFWriterInfo.h"
#include "X86Relocations.h"
#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
@@ -38,11 +39,13 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
return R_X86_64_PC32;
case X86::reloc_absolute_word:
return R_X86_64_32;
+ case X86::reloc_absolute_word_sext:
+ return R_X86_64_32S;
case X86::reloc_absolute_dword:
return R_X86_64_64;
case X86::reloc_picrel_word:
default:
- assert(0 && "unknown relocation type");
+ llvm_unreachable("unknown x86_64 machine relocation type");
}
} else {
switch(MachineRelTy) {
@@ -50,23 +53,101 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
return R_386_PC32;
case X86::reloc_absolute_word:
return R_386_32;
+ case X86::reloc_absolute_word_sext:
case X86::reloc_absolute_dword:
case X86::reloc_picrel_word:
default:
- assert(0 && "unknown relocation type");
+ llvm_unreachable("unknown x86 machine relocation type");
}
}
return 0;
}
-long int X86ELFWriterInfo::getAddendForRelTy(unsigned RelTy) const {
+long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
+ long int Modifier) const {
if (is64Bit) {
switch(RelTy) {
- case R_X86_64_PC32: return -4;
- break;
+ case R_X86_64_PC32: return Modifier - 4;
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ return Modifier;
default:
- assert(0 && "unknown x86 relocation type");
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32: return Modifier - 4;
+ case R_386_32: return Modifier;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
+ }
+ }
+ return 0;
+}
+
+unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
+ if (is64Bit) {
+ switch(RelTy) {
+ case R_X86_64_PC32:
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ return 32;
+ case R_X86_64_64:
+ return 64;
+ default:
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32:
+ case R_386_32:
+ return 32;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
}
}
return 0;
}
+
+bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
+ if (is64Bit) {
+ switch(RelTy) {
+ case R_X86_64_PC32:
+ return true;
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ return false;
+ default:
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32:
+ return true;
+ case R_386_32:
+ return false;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
+ }
+ }
+ return false;
+}
+
+unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
+ return is64Bit ?
+ X86::reloc_absolute_dword : X86::reloc_absolute_word;
+}
+
+long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
+ unsigned RelOffset,
+ unsigned RelTy) const {
+
+ if (RelTy == R_X86_64_PC32 || RelTy == R_386_PC32)
+ return SymOffset - (RelOffset + 4);
+ else
+ llvm_unreachable("computeRelocation unknown for this relocation type");
+
+ return 0;
+}
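To make the computeRelocation arithmetic above concrete, with invented offsets: a 32-bit pc-relative field is resolved against the end of the field, hence the + 4.

  // Sketch only: a reference at section offset 0x20 to a symbol at
  // section offset 0x100 resolves its R_X86_64_PC32 field to:
  long int Value = 0x100 - (0x20 + 4);  // == 0xdc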
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
index 2ba1a0b..342e6e6 100644
--- a/lib/Target/X86/X86ELFWriterInfo.h
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -49,9 +49,26 @@ namespace llvm {
/// ELF relocation entry.
virtual bool hasRelocationAddend() const { return is64Bit ? true : false; }
- /// getAddendForRelTy - Gets the addend value for an ELF relocation entry
- /// based on the target relocation type
- virtual long int getAddendForRelTy(unsigned RelTy) const;
+ /// getDefaultAddendForRelTy - Gets the default addend value for a
+ /// relocation entry based on the target ELF relocation type.
+ virtual long int getDefaultAddendForRelTy(unsigned RelTy,
+ long int Modifier = 0) const;
+
+ /// getRelocationTySize - Returns the size of the relocatable field in bits.
+ virtual unsigned getRelocationTySize(unsigned RelTy) const;
+
+ /// isPCRelativeRel - True if the relocation type is pc relative
+ virtual bool isPCRelativeRel(unsigned RelTy) const;
+
+ /// getAbsoluteLabelMachineRelTy - Returns the machine relocation type
+ /// used to reference an absolute label, such as a jumptable entry.
+ virtual unsigned getAbsoluteLabelMachineRelTy() const;
+
+ /// computeRelocation - Some relocatable fields can be relocated
+ /// directly, avoiding the emission of a relocation symbol; compute
+ /// the final relocation value for such a field.
+ virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
+ unsigned RelTy) const;
};
} // end llvm namespace
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index b336d78..3401df0 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -78,19 +79,20 @@ public:
#include "X86GenFastISel.inc"
private:
- bool X86FastEmitCompare(Value *LHS, Value *RHS, MVT VT);
+ bool X86FastEmitCompare(Value *LHS, Value *RHS, EVT VT);
- bool X86FastEmitLoad(MVT VT, const X86AddressMode &AM, unsigned &RR);
+ bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
- bool X86FastEmitStore(MVT VT, Value *Val,
+ bool X86FastEmitStore(EVT VT, Value *Val,
const X86AddressMode &AM);
- bool X86FastEmitStore(MVT VT, unsigned Val,
+ bool X86FastEmitStore(EVT VT, unsigned Val,
const X86AddressMode &AM);
- bool X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT, unsigned Src, MVT SrcVT,
+ bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
- bool X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall);
+ bool X86SelectAddress(Value *V, X86AddressMode &AM);
+ bool X86SelectCallAddress(Value *V, X86AddressMode &AM);
bool X86SelectLoad(Instruction *I);
@@ -116,7 +118,7 @@ private:
bool X86VisitIntrinsicCall(IntrinsicInst &I);
bool X86SelectCall(Instruction *I);
- CCAssignFn *CCAssignFnForCall(unsigned CC, bool isTailCall = false);
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isTailCall = false);
const X86InstrInfo *getInstrInfo() const {
return getTargetMachine()->getInstrInfo();
@@ -131,17 +133,17 @@ private:
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
- bool isScalarFPTypeInSSEReg(MVT VT) const {
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
- bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+ bool isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1 = false);
};
} // end anonymous namespace.
-bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
+bool X86FastISel::isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1) {
VT = TLI.getValueType(Ty, /*HandleUnknown=*/true);
if (VT == MVT::Other || !VT.isSimple())
// Unhandled type. Halt "fast" selection and bail.
@@ -167,7 +169,8 @@ bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
/// CCAssignFnForCall - Selects the correct CCAssignFn for a given calling
/// convention.
-CCAssignFn *X86FastISel::CCAssignFnForCall(unsigned CC, bool isTaillCall) {
+CCAssignFn *X86FastISel::CCAssignFnForCall(CallingConv::ID CC,
+ bool isTailCall) {
if (Subtarget->is64Bit()) {
if (Subtarget->isTargetWin64())
return CC_X86_Win64_C;
@@ -186,13 +189,14 @@ CCAssignFn *X86FastISel::CCAssignFnForCall(unsigned CC, bool isTaillCall) {
/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
-bool X86FastISel::X86FastEmitLoad(MVT VT, const X86AddressMode &AM,
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
unsigned &ResultReg) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
+ case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
RC = X86::GR8RegisterClass;
@@ -243,13 +247,21 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, const X86AddressMode &AM,
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
bool
-X86FastISel::X86FastEmitStore(MVT VT, unsigned Val,
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val,
const X86AddressMode &AM) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
case MVT::f80: // No f80 support yet.
default: return false;
+ case MVT::i1: {
+ // Mask out all but lowest bit.
+ unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+ Val = AndResult;
+ }
+ // FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
case MVT::i16: Opc = X86::MOV16mr; break;
case MVT::i32: Opc = X86::MOV32mr; break;
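As an illustration of the new i1 path above (a sketch, not taken from the patch): storing an i1 register value now emits AND8ri with immediate 1 into a fresh GR8 register, then falls through to the ordinary MOV8mr byte store, so only bit 0 of the source ever reaches memory:

  // andb $1, %al        ; AND8ri  - keep only the low bit
  // movb %al, (%ecx)    ; MOV8mr  - store the masked byte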
@@ -266,17 +278,19 @@ X86FastISel::X86FastEmitStore(MVT VT, unsigned Val,
return true;
}
-bool X86FastISel::X86FastEmitStore(MVT VT, Value *Val,
+bool X86FastISel::X86FastEmitStore(EVT VT, Value *Val,
const X86AddressMode &AM) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
- Val = Constant::getNullValue(TD.getIntPtrType());
+ Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
// If this is a store of a simple constant, fold the constant into the store.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
unsigned Opc = 0;
- switch (VT.getSimpleVT()) {
+ bool Signed = true;
+ switch (VT.getSimpleVT().SimpleTy) {
default: break;
+ case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
case MVT::i8: Opc = X86::MOV8mi; break;
case MVT::i16: Opc = X86::MOV16mi; break;
case MVT::i32: Opc = X86::MOV32mi; break;
@@ -289,7 +303,8 @@ bool X86FastISel::X86FastEmitStore(MVT VT, Value *Val,
if (Opc) {
addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM)
- .addImm(CI->getSExtValue());
+ .addImm(Signed ? CI->getSExtValue() :
+ CI->getZExtValue());
return true;
}
}
@@ -304,8 +319,8 @@ bool X86FastISel::X86FastEmitStore(MVT VT, Value *Val,
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
/// ISD::SIGN_EXTEND).
-bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT,
- unsigned Src, MVT SrcVT,
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+ unsigned Src, EVT SrcVT,
unsigned &ResultReg) {
unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
@@ -318,7 +333,7 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT,
/// X86SelectAddress - Attempt to fill in an address from the given value.
///
-bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
+bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) {
User *U = NULL;
unsigned Opcode = Instruction::UserOp1;
if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -333,22 +348,21 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
default: break;
case Instruction::BitCast:
// Look past bitcasts.
- return X86SelectAddress(U->getOperand(0), AM, isCall);
+ return X86SelectAddress(U->getOperand(0), AM);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
- return X86SelectAddress(U->getOperand(0), AM, isCall);
+ return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
- return X86SelectAddress(U->getOperand(0), AM, isCall);
+ return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::Alloca: {
- if (isCall) break;
// Do static allocas.
const AllocaInst *A = cast<AllocaInst>(V);
DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A);
@@ -361,21 +375,19 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
}
case Instruction::Add: {
- if (isCall) break;
// Adds of constants are common and easy enough.
if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
// They have to fit in the 32-bit signed displacement field though.
if (isInt32(Disp)) {
AM.Disp = (uint32_t)Disp;
- return X86SelectAddress(U->getOperand(0), AM, isCall);
+ return X86SelectAddress(U->getOperand(0), AM);
}
}
break;
}
case Instruction::GetElementPtr: {
- if (isCall) break;
// Pattern-match simple GEPs.
uint64_t Disp = (int32_t)AM.Disp;
unsigned IndexReg = AM.IndexReg;
@@ -416,7 +428,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
AM.IndexReg = IndexReg;
AM.Scale = Scale;
AM.Disp = (uint32_t)Disp;
- return X86SelectAddress(U->getOperand(0), AM, isCall);
+ return X86SelectAddress(U->getOperand(0), AM);
unsupported_gep:
// Ok, the GEP indices weren't all covered.
break;
@@ -426,8 +438,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
// Handle constant address.
if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
// Can't handle alternate code models yet.
- if (TM.getCodeModel() != CodeModel::Default &&
- TM.getCodeModel() != CodeModel::Small)
+ if (TM.getCodeModel() != CodeModel::Small)
return false;
// RIP-relative addresses can't have additional register operands.
@@ -440,63 +451,149 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
if (GVar->isThreadLocal())
return false;
- // Set up the basic address.
+ // Okay, we've committed to selecting this global. Set up the basic address.
AM.GV = GV;
- if (!isCall &&
- TM.getRelocationModel() == Reloc::PIC_ &&
- !Subtarget->is64Bit())
- AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
- // Emit an extra load if the ABI requires it.
- if (Subtarget->GVRequiresExtraLoad(GV, TM, isCall)) {
- // Check to see if we've already materialized this
- // value in a register in this block.
- DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
- if (I != LocalValueMap.end() && I->second != 0) {
- AM.Base.Reg = I->second;
- AM.GV = 0;
- return true;
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
}
-
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from this
+ // stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
// Issue load from stub.
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
X86AddressMode StubAM;
StubAM.Base.Reg = AM.Base.Reg;
- StubAM.GV = AM.GV;
-
- if (TLI.getPointerTy() == MVT::i32) {
- Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
-
- if (Subtarget->isPICStyleGOT() &&
- TM.getRelocationModel() == Reloc::PIC_)
- StubAM.GVOpFlags = X86II::MO_GOT;
-
- } else {
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ if (TLI.getPointerTy() == MVT::i64) {
Opc = X86::MOV64rm;
RC = X86::GR64RegisterClass;
- if (TM.getRelocationModel() != Reloc::Static) {
- StubAM.GVOpFlags = X86II::MO_GOTPCREL;
+ if (Subtarget->isPICStyleRIPRel())
StubAM.Base.Reg = X86::RIP;
- }
+ } else {
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
}
+
+ LoadReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), LoadReg), StubAM);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = 0;
+ return true;
+ }
- unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), StubAM);
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
- // Now construct the final address. Note that the Disp, Scale,
- // and Index values may already be set here.
- AM.Base.Reg = ResultReg;
- AM.GV = 0;
+ return false;
+}
- // Prevent loading GV stub multiple times in same MBB.
- LocalValueMap[V] = AM.Base.Reg;
- } else if (Subtarget->isPICStyleRIPRel()) {
- // Use rip-relative addressing if we can.
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(Value *V, X86AddressMode &AM) {
+ User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectCallAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS or DLLImport.
+ if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal() || GVar->hasDLLImportLinkage())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // No ABI requires an extra load for anything other than DLLImport, which
+ // we rejected above. Return a direct reference to the global.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
AM.Base.Reg = X86::RIP;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ AM.GVOpFlags = X86II::MO_GOTOFF;
}
return true;
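A concrete reading of the classification flow above (hypothetical, for orientation): on 32-bit Darwin with PIC, an external global typically classifies as both relative to the PIC base and a stub reference, so the code loads the real pointer from a non-lazy-pointer stub via the PIC base register before forming the final address, while a global defined in the current module classifies as PIC-base-relative but not a stub and takes the direct-reference path with no load.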
@@ -518,14 +615,15 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
return false;
}
+
/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(Instruction* I) {
- MVT VT;
- if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ EVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
return false;
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(1), AM, false))
+ if (!X86SelectAddress(I->getOperand(1), AM))
return false;
return X86FastEmitStore(VT, I->getOperand(0), AM);
@@ -534,12 +632,12 @@ bool X86FastISel::X86SelectStore(Instruction* I) {
/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(Instruction *I) {
- MVT VT;
- if (!isTypeLegal(I->getType(), VT))
+ EVT VT;
+ if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
return false;
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(0), AM, false))
+ if (!X86SelectAddress(I->getOperand(0), AM))
return false;
unsigned ResultReg = 0;
@@ -550,8 +648,8 @@ bool X86FastISel::X86SelectLoad(Instruction *I) {
return false;
}
-static unsigned X86ChooseCmpOpcode(MVT VT) {
- switch (VT.getSimpleVT()) {
+static unsigned X86ChooseCmpOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
case MVT::i8: return X86::CMP8rr;
case MVT::i16: return X86::CMP16rr;
@@ -565,8 +663,8 @@ static unsigned X86ChooseCmpOpcode(MVT VT) {
/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS
/// of the comparison, return an opcode that works for the compare (e.g.
/// CMP32ri) otherwise return 0.
-static unsigned X86ChooseCmpImmediateOpcode(MVT VT, ConstantInt *RHSC) {
- switch (VT.getSimpleVT()) {
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, ConstantInt *RHSC) {
+ switch (VT.getSimpleVT().SimpleTy) {
// Otherwise, we can't fold the immediate into this comparison.
default: return 0;
case MVT::i8: return X86::CMP8ri;
@@ -581,13 +679,13 @@ static unsigned X86ChooseCmpImmediateOpcode(MVT VT, ConstantInt *RHSC) {
}
}
-bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, MVT VT) {
+bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, EVT VT) {
unsigned Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Op1))
- Op1 = Constant::getNullValue(TD.getIntPtrType());
+ Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
// We have two options: compare with register or immediate. If the RHS of
// the compare is an immediate that we can fold into this compare, use
@@ -613,7 +711,7 @@ bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, MVT VT) {
bool X86FastISel::X86SelectCmp(Instruction *I) {
CmpInst *CI = cast<CmpInst>(I);
- MVT VT;
+ EVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
@@ -688,8 +786,8 @@ bool X86FastISel::X86SelectCmp(Instruction *I) {
bool X86FastISel::X86SelectZExt(Instruction *I) {
// Handle zero-extension from i1 to i8, which is common.
- if (I->getType() == Type::Int8Ty &&
- I->getOperand(0)->getType() == Type::Int1Ty) {
+ if (I->getType() == Type::getInt8Ty(I->getContext()) &&
+ I->getOperand(0)->getType() == Type::getInt1Ty(I->getContext())) {
unsigned ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0) return false;
// Set the high bits to zero.
@@ -713,7 +811,7 @@ bool X86FastISel::X86SelectBranch(Instruction *I) {
// Fold the common case of a conditional branch with a comparison.
if (CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse()) {
- MVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+ EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
// Try to take advantage of fallthrough opportunities.
CmpInst::Predicate Predicate = CI->getPredicate();
@@ -850,7 +948,7 @@ bool X86FastISel::X86SelectBranch(Instruction *I) {
bool X86FastISel::X86SelectShift(Instruction *I) {
unsigned CReg = 0, OpReg = 0, OpImm = 0;
const TargetRegisterClass *RC = NULL;
- if (I->getType() == Type::Int8Ty) {
+ if (I->getType() == Type::getInt8Ty(I->getContext())) {
CReg = X86::CL;
RC = &X86::GR8RegClass;
switch (I->getOpcode()) {
@@ -859,7 +957,7 @@ bool X86FastISel::X86SelectShift(Instruction *I) {
case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
default: return false;
}
- } else if (I->getType() == Type::Int16Ty) {
+ } else if (I->getType() == Type::getInt16Ty(I->getContext())) {
CReg = X86::CX;
RC = &X86::GR16RegClass;
switch (I->getOpcode()) {
@@ -868,7 +966,7 @@ bool X86FastISel::X86SelectShift(Instruction *I) {
case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
default: return false;
}
- } else if (I->getType() == Type::Int32Ty) {
+ } else if (I->getType() == Type::getInt32Ty(I->getContext())) {
CReg = X86::ECX;
RC = &X86::GR32RegClass;
switch (I->getOpcode()) {
@@ -877,7 +975,7 @@ bool X86FastISel::X86SelectShift(Instruction *I) {
case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
default: return false;
}
- } else if (I->getType() == Type::Int64Ty) {
+ } else if (I->getType() == Type::getInt64Ty(I->getContext())) {
CReg = X86::RCX;
RC = &X86::GR64RegClass;
switch (I->getOpcode()) {
@@ -890,7 +988,7 @@ bool X86FastISel::X86SelectShift(Instruction *I) {
return false;
}
- MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
return false;
@@ -924,7 +1022,7 @@ bool X86FastISel::X86SelectShift(Instruction *I) {
}
bool X86FastISel::X86SelectSelect(Instruction *I) {
- MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
return false;
@@ -959,9 +1057,10 @@ bool X86FastISel::X86SelectSelect(Instruction *I) {
bool X86FastISel::X86SelectFPExt(Instruction *I) {
// fpext from float to double.
- if (Subtarget->hasSSE2() && I->getType() == Type::DoubleTy) {
+ if (Subtarget->hasSSE2() &&
+ I->getType()->isDoubleTy()) {
Value *V = I->getOperand(0);
- if (V->getType() == Type::FloatTy) {
+ if (V->getType()->isFloatTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
@@ -976,9 +1075,9 @@ bool X86FastISel::X86SelectFPExt(Instruction *I) {
bool X86FastISel::X86SelectFPTrunc(Instruction *I) {
if (Subtarget->hasSSE2()) {
- if (I->getType() == Type::FloatTy) {
+ if (I->getType()->isFloatTy()) {
Value *V = I->getOperand(0);
- if (V->getType() == Type::DoubleTy) {
+ if (V->getType()->isDoubleTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
@@ -996,8 +1095,8 @@ bool X86FastISel::X86SelectTrunc(Instruction *I) {
if (Subtarget->is64Bit())
// All other cases should be handled by the tblgen generated code.
return false;
- MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- MVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
// This code only handles truncation to byte right now.
if (DstVT != MVT::i8 && DstVT != MVT::i1)
@@ -1065,7 +1164,7 @@ bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) {
const Type *RetTy =
cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
- MVT VT;
+ EVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
@@ -1125,7 +1224,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Handle only C and fastcc calling conventions for now.
CallSite CS(CI);
- unsigned CC = CS.getCallingConv();
+ CallingConv::ID CC = CS.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
CC != CallingConv::X86_FastCall)
@@ -1144,8 +1243,8 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Handle *simple* calls for now.
const Type *RetTy = CS.getType();
- MVT RetVT;
- if (RetTy == Type::VoidTy)
+ EVT RetVT;
+ if (RetTy->isVoidTy())
RetVT = MVT::isVoid;
else if (!isTypeLegal(RetTy, RetVT, true))
return false;
@@ -1153,7 +1252,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Materialize callee address in a register. FIXME: GV address can be
// handled with a CALLpcrel32 instead.
X86AddressMode CalleeAM;
- if (!X86SelectAddress(Callee, CalleeAM, true))
+ if (!X86SelectCallAddress(Callee, CalleeAM))
return false;
unsigned CalleeOp = 0;
GlobalValue *GV = 0;
@@ -1174,7 +1273,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Deal with call operands first.
SmallVector<Value*, 8> ArgVals;
SmallVector<unsigned, 8> Args;
- SmallVector<MVT, 8> ArgVTs;
+ SmallVector<EVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
Args.reserve(CS.arg_size());
ArgVals.reserve(CS.arg_size());
@@ -1200,7 +1299,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
return false;
const Type *ArgTy = (*i)->getType();
- MVT ArgVT;
+ EVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT))
return false;
unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
@@ -1214,7 +1313,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, false, TM, ArgLocs);
+ CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext());
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
// Get a count of how many bytes are to be pushed on the stack.
@@ -1230,11 +1329,11 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
unsigned Arg = Args[VA.getValNo()];
- MVT ArgVT = ArgVTs[VA.getValNo()];
+ EVT ArgVT = ArgVTs[VA.getValNo()];
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt: {
bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
@@ -1266,6 +1365,14 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
ArgVT = VA.getLocVT();
break;
}
+ case CCValAssign::BCvt: {
+ unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT().getSimpleVT(),
+ ISD::BIT_CONVERT, Arg);
+ assert(BC != 0 && "Failed to emit a bitcast!");
+ Arg = BC;
+ ArgVT = VA.getLocVT();
+ break;
+ }
}
if (VA.isRegLoc()) {
@@ -1294,28 +1401,53 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
- if (!Subtarget->is64Bit() &&
- TM.getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT()) {
+ if (Subtarget->isPICStyleGOT()) {
TargetRegisterClass *RC = X86::GR32RegisterClass;
unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF);
bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC);
assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
Emitted = true;
}
-
+
// Issue the call.
- unsigned CallOpc = CalleeOp
- ? (Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r)
- : (Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32);
- MachineInstrBuilder MIB = CalleeOp
- ? BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp)
- : BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV);
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc = Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r;
+ MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp);
+
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ unsigned CallOpc =
+ Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+ // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+
+ MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV, 0, OpFlags);
+ }
// Add an implicit use GOT pointer in EBX.
- if (!Subtarget->is64Bit() &&
- TM.getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT())
+ if (Subtarget->isPICStyleGOT())
MIB.addReg(X86::EBX);
// Add implicit physical register uses to the call.
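In assembler terms, the operand flags chosen above behave roughly as follows (illustrative): MO_PLT turns the direct call into call foo@PLT, reaching the symbol through the ELF procedure linkage table; MO_DARWIN_STUB emits the call against a synthesized $stub on pre-Leopard Darwin; with no flag, the call is a plain call foo.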
@@ -1327,14 +1459,14 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0);
// Now handle call return value (if any).
- if (RetVT.getSimpleVT() != MVT::isVoid) {
+ if (RetVT.getSimpleVT().SimpleTy != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CC, false, TM, RVLocs);
+ CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext());
CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
- MVT CopyVT = RVLocs[0].getValVT();
+ EVT CopyVT = RVLocs[0].getValVT();
TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
TargetRegisterClass *SrcRC = DstRC;
@@ -1358,7 +1490,7 @@ bool X86FastISel::X86SelectCall(Instruction *I) {
// Round the F80 the right size, which also moves to the appropriate xmm
// register. This is accomplished by storing the F80 value in memory and
// then loading it back. Ewww...
- MVT ResVT = RVLocs[0].getValVT();
+ EVT ResVT = RVLocs[0].getValVT();
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
int FI = MFI.CreateStackObject(MemSize, MemSize);
@@ -1418,8 +1550,8 @@ X86FastISel::TargetSelectInstruction(Instruction *I) {
return X86SelectExtractValue(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
- MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- MVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
if (DstVT.bitsGT(SrcVT))
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
@@ -1435,14 +1567,14 @@ X86FastISel::TargetSelectInstruction(Instruction *I) {
}
unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
- MVT VT;
+ EVT VT;
if (!isTypeLegal(C->getType(), VT))
return false;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i8:
Opc = X86::MOV8rm;
@@ -1487,7 +1619,7 @@ unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
// Materialize addresses with LEA instructions.
if (isa<GlobalValue>(C)) {
X86AddressMode AM;
- if (X86SelectAddress(C, AM, false)) {
+ if (X86SelectAddress(C, AM)) {
if (TLI.getPointerTy() == MVT::i32)
Opc = X86::LEA32r;
else
@@ -1509,16 +1641,15 @@ unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
unsigned char OpFlag = 0;
- if (TM.getRelocationModel() == Reloc::PIC_) {
- if (Subtarget->isPICStyleStub()) {
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
- PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
- } else if (Subtarget->isPICStyleGOT()) {
- OpFlag = X86II::MO_GOTOFF;
- PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
- } else if (Subtarget->isPICStyleRIPRel() &&
- TM.getCodeModel() == CodeModel::Small)
- PICBase = X86::RIP;
+ if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOTOFF;
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ TM.getCodeModel() == CodeModel::Small) {
+ PICBase = X86::RIP;
}
// Create the load from the constant pool.
@@ -1542,7 +1673,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(AllocaInst *C) {
return 0;
X86AddressMode AM;
- if (!X86SelectAddress(C, AM, false))
+ if (!X86SelectAddress(C, AM))
return 0;
unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 37027ee..d9a05a8 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -31,19 +31,21 @@
#define DEBUG_TYPE "x86-codegen"
#include "X86.h"
#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include <algorithm>
using namespace llvm;
@@ -56,6 +58,7 @@ namespace {
FPS() : MachineFunctionPass(&ID) {}
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
@@ -73,12 +76,12 @@ namespace {
unsigned StackTop; // The current top of the FP stack.
void dumpStack() const {
- cerr << "Stack contents:";
+ errs() << "Stack contents:";
for (unsigned i = 0; i != StackTop; ++i) {
- cerr << " FP" << Stack[i];
+ errs() << " FP" << Stack[i];
assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
}
- cerr << "\n";
+ errs() << "\n";
}
private:
/// isStackEmpty - Return true if the FP stack is empty.
@@ -210,6 +213,14 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
I != E; ++I)
Changed |= processBasicBlock(MF, **I);
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() == Processed.size())
+ return Changed;
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ if (Processed.insert(BB))
+ Changed |= processBasicBlock(MF, *BB);
+
return Changed;
}
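The extra loop above covers blocks that the depth-first walk from the entry block never reaches: unreachable blocks may still contain FP pseudo instructions that must be lowered before encoding, and since such a block has no processed predecessors the pass can treat it as starting from an empty FP stack and visit it in any order.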
@@ -236,7 +247,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
PrevMI = prior(I);
++NumFP; // Keep track of # of pseudo instrs
- DOUT << "\nFPInst:\t" << *MI;
+ DEBUG(errs() << "\nFPInst:\t" << *MI);
// Get dead variables list now because the MI pointer may be deleted as part
// of processing!
@@ -255,7 +266,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
case X86II::CompareFP: handleCompareFP(I); break;
case X86II::CondMovFP: handleCondMovFP(I); break;
case X86II::SpecialFP: handleSpecialFP(I); break;
- default: assert(0 && "Unknown FP Type!");
+ default: llvm_unreachable("Unknown FP Type!");
}
// Check to see if any of the values defined by this instruction are dead
@@ -263,7 +274,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
unsigned Reg = DeadRegs[i];
if (Reg >= X86::FP0 && Reg <= X86::FP6) {
- DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+ DEBUG(errs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
freeStackSlotAfter(I, Reg-X86::FP0);
}
}
@@ -272,13 +283,13 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
DEBUG(
MachineBasicBlock::iterator PrevI(PrevMI);
if (I == PrevI) {
- cerr << "Just deleted pseudo instruction\n";
+ errs() << "Just deleted pseudo instruction\n";
} else {
MachineBasicBlock::iterator Start = I;
// Rewind to first instruction newly inserted.
while (Start != BB.begin() && prior(Start) != PrevI) --Start;
- cerr << "Inserted instructions:\n\t";
- Start->print(*cerr.stream(), &MF.getTarget());
+ errs() << "Inserted instructions:\n\t";
+ Start->print(errs(), &MF.getTarget());
while (++Start != next(I)) {}
}
dumpStack();
@@ -945,7 +956,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
MachineInstr *MI = I;
DebugLoc dl = MI->getDebugLoc();
switch (MI->getOpcode()) {
- default: assert(0 && "Unknown SpecialFP instruction!");
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp
index 009846e..3e0385c 100644
--- a/lib/Target/X86/X86FloatingPointRegKill.cpp
+++ b/lib/Target/X86/X86FloatingPointRegKill.cpp
@@ -35,6 +35,7 @@ namespace {
FPRegKiller() : MachineFunctionPass(&ID) {}
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
@@ -117,9 +118,10 @@ bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) {
!ContainsFPCode && SI != E; ++SI) {
for (BasicBlock::const_iterator II = SI->begin();
(PN = dyn_cast<PHINode>(II)); ++II) {
- if (PN->getType()==Type::X86_FP80Ty ||
+ if (PN->getType()==Type::getX86_FP80Ty(LLVMBB->getContext()) ||
(!Subtarget.hasSSE1() && PN->getType()->isFloatingPoint()) ||
- (!Subtarget.hasSSE2() && PN->getType()==Type::DoubleTy)) {
+ (!Subtarget.hasSSE2() &&
+ PN->getType()==Type::getDoubleTy(LLVMBB->getContext()))) {
ContainsFPCode = true;
break;
}
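
This hunk tracks the LLVMContext migration: the global Type::DoubleTy / Type::X86_FP80Ty singletons give way to per-context getters. The same predicate in isolation, assuming the Type API of this LLVM vintage (needsX87 is an illustrative name, not from the patch):

    #include "llvm/LLVMContext.h"
    #include "llvm/Type.h"
    using namespace llvm;

    static bool needsX87(const Type *T, LLVMContext &Ctx,
                         bool HasSSE1, bool HasSSE2) {
      if (T == Type::getX86_FP80Ty(Ctx)) return true;           // always x87
      if (!HasSSE1 && T->isFloatingPoint()) return true;        // no SSE at all
      if (!HasSSE2 && T == Type::getDoubleTy(Ctx)) return true; // f64, no SSE2
      return false;
    }
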
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1336177..5b678fb 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -35,8 +35,9 @@
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Streams.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -78,7 +79,8 @@ namespace {
X86ISelAddressMode()
: BaseType(RegBase), Scale(1), IndexReg(), Disp(0),
- Segment(), GV(0), CP(0), ES(0), JT(-1), Align(0), SymbolFlags(0) {
+ Segment(), GV(0), CP(0), ES(0), JT(-1), Align(0),
+ SymbolFlags(X86II::MO_NO_FLAG) {
}
bool hasSymbolicDisplacement() const {
@@ -105,23 +107,37 @@ namespace {
}
void dump() {
- cerr << "X86ISelAddressMode " << this << "\n";
- cerr << "Base.Reg ";
- if (Base.Reg.getNode() != 0) Base.Reg.getNode()->dump();
- else cerr << "nul";
- cerr << " Base.FrameIndex " << Base.FrameIndex << "\n";
- cerr << " Scale" << Scale << "\n";
- cerr << "IndexReg ";
- if (IndexReg.getNode() != 0) IndexReg.getNode()->dump();
- else cerr << "nul";
- cerr << " Disp " << Disp << "\n";
- cerr << "GV "; if (GV) GV->dump();
- else cerr << "nul";
- cerr << " CP "; if (CP) CP->dump();
- else cerr << "nul";
- cerr << "\n";
- cerr << "ES "; if (ES) cerr << ES; else cerr << "nul";
- cerr << " JT" << JT << " Align" << Align << "\n";
+ errs() << "X86ISelAddressMode " << this << '\n';
+ errs() << "Base.Reg ";
+ if (Base.Reg.getNode() != 0)
+ Base.Reg.getNode()->dump();
+ else
+ errs() << "nul";
+ errs() << " Base.FrameIndex " << Base.FrameIndex << '\n'
+ << " Scale" << Scale << '\n'
+ << "IndexReg ";
+ if (IndexReg.getNode() != 0)
+ IndexReg.getNode()->dump();
+ else
+ errs() << "nul";
+ errs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ errs() << "nul";
+ errs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ errs() << "nul";
+ errs() << '\n'
+ << "ES ";
+ if (ES)
+ errs() << ES;
+ else
+ errs() << "nul";
+ errs() << " JT" << JT << " Align" << Align << '\n';
}
};
}
@@ -140,10 +156,6 @@ namespace {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// CurBB - Current BB being isel'd.
- ///
- MachineBasicBlock *CurBB;
-
/// OptForSize - If true, selector should try to optimize for code size
/// instead of performance.
bool OptForSize;
@@ -174,12 +186,14 @@ namespace {
private:
SDNode *Select(SDValue N);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+ SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
- bool MatchAddress(SDValue N, X86ISelAddressMode &AM,
- unsigned Depth = 0);
+ bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
bool SelectAddr(SDValue Op, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -342,13 +356,17 @@ static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
Ops.push_back(Load.getOperand(0));
else
Ops.push_back(TF.getOperand(i));
- CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
- CurDAG->UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
- CurDAG->UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+ SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+ SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
+ Load.getOperand(1),
+ Load.getOperand(2));
+ CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
Store.getOperand(2), Store.getOperand(3));
}
-/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG.
+/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG. The
+/// chain produced by the load must only be used by the store's chain operand,
+/// otherwise this may produce a cycle in the DAG.
///
static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
SDValue &Load) {
@@ -366,8 +384,9 @@ static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
return false;
if (N.hasOneUse() &&
+ LD->hasNUsesOfValue(1, 1) &&
N.getOperand(1) == Address &&
- N.getNode()->isOperandOf(Chain.getNode())) {
+ LD->isOperandOf(Chain.getNode())) {
Load = N;
return true;
}
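
The tightened check insists that the load's chain result (value #1 of a LoadSDNode) is consumed only by the store. The shape being folded is the classic read-modify-write; an illustrative example (mine, not from the patch):

    // isRMWLoad matches the load half of a load-op-store of one address:
    void inc(int *p) { *p = *p + 5; }   // foldable to: addl $5, (%rdi)
    // If some other node were also chained off the load, fusing
    // load+add+store into a single machine node could make that node
    // both a predecessor and a successor of the fused operation --
    // exactly the DAG cycle the new doc comment warns about.
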
@@ -431,7 +450,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
if (Chain.getOperand(0).getNode() == Callee.getNode())
return true;
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
- Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()))
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
return true;
return false;
}
@@ -583,8 +603,8 @@ void X86DAGToDAGISel::PreprocessForFPConvert() {
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
- MVT SrcVT = N->getOperand(0).getValueType();
- MVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DstVT = N->getValueType(0);
bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
@@ -602,7 +622,7 @@ void X86DAGToDAGISel::PreprocessForFPConvert() {
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
- MVT MemVT;
+ EVT MemVT;
if (N->getOpcode() == ISD::FP_ROUND)
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
else
@@ -635,8 +655,7 @@ void X86DAGToDAGISel::PreprocessForFPConvert() {
/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
/// when it has created a SelectionDAG for us to codegen.
void X86DAGToDAGISel::InstructionSelect() {
- CurBB = BB; // BB can change as result of isel.
- const Function *F = CurDAG->getMachineFunction().getFunction();
+ const Function *F = MF->getFunction();
OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);
DEBUG(BB->dump());
@@ -648,12 +667,12 @@ void X86DAGToDAGISel::InstructionSelect() {
// Codegen the basic block.
#ifndef NDEBUG
- DOUT << "===== Instruction selection begins:\n";
+ DEBUG(errs() << "===== Instruction selection begins:\n");
Indent = 0;
#endif
SelectRoot(*CurDAG);
#ifndef NDEBUG
- DOUT << "===== Instruction selection ends:\n";
+ DEBUG(errs() << "===== Instruction selection ends:\n");
#endif
CurDAG->RemoveDeadNodes();
@@ -706,7 +725,7 @@ bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
/// into an addressing mode. These wrap things that will resolve down into a
/// symbol reference. If no match is possible, this returns true, otherwise it
-/// returns false.
+/// returns false.
bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
// If the addressing mode already has a symbol as the displacement, we can
// never match another symbol.
@@ -714,28 +733,27 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
return true;
SDValue N0 = N.getOperand(0);
-
+ CodeModel::Model M = TM.getCodeModel();
+
// Handle X86-64 rip-relative addresses. We check this before checking direct
// folding because RIP is preferable to non-RIP accesses.
if (Subtarget->is64Bit() &&
// Under X86-64 non-small code model, GV (and friends) are 64-bits, so
// they cannot be folded into immediate fields.
// FIXME: This can be improved for kernel and other models?
- TM.getCodeModel() == CodeModel::Small &&
-
+ (M == CodeModel::Small || M == CodeModel::Kernel) &&
// Base and index reg must be 0 in order to use %rip as base and lowering
// must allow RIP.
!AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
-
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
int64_t Offset = AM.Disp + G->getOffset();
- if (!isInt32(Offset)) return true;
+ if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
AM.GV = G->getGlobal();
AM.Disp = Offset;
AM.SymbolFlags = G->getTargetFlags();
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
int64_t Offset = AM.Disp + CP->getOffset();
- if (!isInt32(Offset)) return true;
+ if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
AM.CP = CP->getConstVal();
AM.Align = CP->getAlignment();
AM.Disp = Offset;
@@ -748,7 +766,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
}
-
+
if (N.getOpcode() == X86ISD::WrapperRIP)
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
return false;
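
The bare isInt32 tests give way to X86::isOffsetSuitableForCodeModel so that CodeModel::Kernel can share the fold. Its body is not part of this patch, so the sketch below is only an assumption about the core rule: in the small model, globals sit in the low 2GB and global+offset must remain a sign-extended 32-bit displacement; the kernel model links in the top 2GB of the address space, where the same sign-extended disp32 reach still works.

    #include <cstdint>
    // Assumed core of the suitability check (illustrative only):
    static bool fitsSignExtendedDisp32(int64_t Offset) {
      return Offset == static_cast<int64_t>(static_cast<int32_t>(Offset));
    }
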
@@ -758,7 +776,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
// X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit
// mode, this results in a non-RIP-relative computation.
if (!Subtarget->is64Bit() ||
- (TM.getCodeModel() == CodeModel::Small &&
+ ((M == CodeModel::Small || M == CodeModel::Kernel) &&
TM.getRelocationModel() == Reloc::Static)) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
AM.GV = G->getGlobal();
@@ -786,15 +804,49 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
/// MatchAddress - Add the specified node to the specified addressing mode,
/// returning true if it cannot be done. This just pattern matches for the
/// addressing mode.
-bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
- unsigned Depth) {
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+ if (MatchAddressRecursively(N, AM, 0))
+ return true;
+
+ // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ if (AM.Scale == 2 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() == 0) {
+ AM.Base.Reg = AM.IndexReg;
+ AM.Scale = 1;
+ }
+
+ // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+ // because it has a smaller encoding.
+ // TODO: Which other code models can use this?
+ if (TM.getCodeModel() == CodeModel::Small &&
+ Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0 &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base.Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+ return false;
+}
+
+bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
bool is64Bit = Subtarget->is64Bit();
DebugLoc dl = N.getDebugLoc();
- DOUT << "MatchAddress: "; DEBUG(AM.dump());
+ DEBUG({
+ errs() << "MatchAddress: ";
+ AM.dump();
+ });
// Limit recursion.
if (Depth > 5)
return MatchAddressBase(N, AM);
-
+
+ CodeModel::Model M = TM.getCodeModel();
+
// If this is already a %rip relative address, we can only merge immediates
// into it. Instead of handling this in every case, we handle it here.
// RIP relative addressing: %rip + 32-bit displacement!
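
The two post-processing rules added to MatchAddress above are pure encoding plays. Concretely (byte counts are my own estimates for the REX-free forms):

    // (1) lea(,%reg,2) -> lea(%reg,%reg): a SIB byte with no base
    //     register forces a 4-byte displacement, so
    //         leal (,%rax,2), %ecx    ; opcode+modrm+SIB+disp32 = 7 bytes
    //     becomes
    //         leal (%rax,%rax), %ecx  ; opcode+modrm+SIB        = 3 bytes
    // (2) foo -> foo(%rip): absolute disp32 addressing in 64-bit mode
    //     needs a SIB byte, RIP-relative does not, so
    //         movl foo, %eax          ; opcode+modrm+SIB+disp32 = 7 bytes
    //     becomes
    //         movl foo(%rip), %eax    ; opcode+modrm+disp32     = 6 bytes
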
@@ -803,10 +855,11 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
// displacements. It isn't very important, but this should be fixed for
// consistency.
if (!AM.ES && AM.JT != -1) return true;
-
+
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) {
int64_t Val = AM.Disp + Cst->getSExtValue();
- if (isInt32(Val)) {
+ if (X86::isOffsetSuitableForCodeModel(Val, M,
+ AM.hasSymbolicDisplacement())) {
AM.Disp = Val;
return false;
}
@@ -818,7 +871,9 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
default: break;
case ISD::Constant: {
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
- if (!is64Bit || isInt32(AM.Disp + Val)) {
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(AM.Disp + Val, M,
+ AM.hasSymbolicDisplacement())) {
AM.Disp += Val;
return false;
}
@@ -857,6 +912,10 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
if (ConstantSDNode
*CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
SDValue ShVal = N.getNode()->getOperand(0);
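
Keeping the base slot free pays off when more of the address can still fold in. An illustrative example (the exact assembly depends on target and flags):

    // p[2*i + 3]: the i<<1 is taken as a scaled index, leaving the base
    // for p and the displacement for the +3, with the <<1 merging into
    // the 4-byte element size to give scale 8:
    int f(int *p, long i) { return p[2*i + 3]; }
    //     movl 12(%rdi,%rsi,8), %eax
    // Only if nothing ever claims the base does the post-processing in
    // MatchAddress rewrite (,x,2) as the shorter (x,x).
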
@@ -870,7 +929,9 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
ConstantSDNode *AddVal =
cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
- if (!is64Bit || isInt32(Disp))
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(Disp, M,
+ AM.hasSymbolicDisplacement()))
AM.Disp = Disp;
else
AM.IndexReg = ShVal;
@@ -912,7 +973,9 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
CN->getZExtValue();
- if (!is64Bit || isInt32(Disp))
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(Disp, M,
+ AM.hasSymbolicDisplacement()))
AM.Disp = Disp;
else
Reg = N.getNode()->getOperand(0);
@@ -936,7 +999,7 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (MatchAddress(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -998,12 +1061,12 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
case ISD::ADD: {
X86ISelAddressMode Backup = AM;
- if (!MatchAddress(N.getNode()->getOperand(0), AM, Depth+1) &&
- !MatchAddress(N.getNode()->getOperand(1), AM, Depth+1))
+ if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1) &&
+ !MatchAddressRecursively(N.getNode()->getOperand(1), AM, Depth+1))
return false;
AM = Backup;
- if (!MatchAddress(N.getNode()->getOperand(1), AM, Depth+1) &&
- !MatchAddress(N.getNode()->getOperand(0), AM, Depth+1))
+ if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM, Depth+1) &&
+ !MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1))
return false;
AM = Backup;
@@ -1027,11 +1090,13 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
X86ISelAddressMode Backup = AM;
uint64_t Offset = CN->getSExtValue();
// Start with the LHS as an addr mode.
- if (!MatchAddress(N.getOperand(0), AM, Depth+1) &&
+ if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
// Address could not have picked a GV address for the displacement.
AM.GV == NULL &&
// On x86-64, the resultant disp must fit in 32-bits.
- (!is64Bit || isInt32(AM.Disp + Offset)) &&
+ (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(AM.Disp + Offset, M,
+ AM.hasSymbolicDisplacement())) &&
// Check to see if the LHS & C is zero.
CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
AM.Disp += Offset;
@@ -1219,7 +1284,7 @@ bool X86DAGToDAGISel::SelectAddr(SDValue Op, SDValue N, SDValue &Base,
if (!Done && MatchAddress(N, AM))
return false;
- MVT VT = N.getValueType();
+ EVT VT = N.getValueType();
if (AM.BaseType == X86ISelAddressMode::RegBase) {
if (!AM.Base.Reg.getNode())
AM.Base.Reg = CurDAG->getRegister(0, VT);
@@ -1292,7 +1357,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue Op, SDValue N,
assert (T == AM.Segment);
AM.Segment = Copy;
- MVT VT = N.getValueType();
+ EVT VT = N.getValueType();
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase)
if (AM.Base.Reg.getNode())
@@ -1329,12 +1394,13 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue Op, SDValue N,
if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
Complexity++;
- if (Complexity > 2) {
- SDValue Segment;
- getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
- return true;
- }
- return false;
+ // If it isn't worth using an LEA, reject it.
+ if (Complexity <= 2)
+ return false;
+
+ SDValue Segment;
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
}
/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
@@ -1380,7 +1446,6 @@ bool X86DAGToDAGISel::TryFoldLoad(SDValue P, SDValue N,
/// initialize the global base register, if necessary.
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
- MachineFunction *MF = CurBB->getParent();
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
}
@@ -1400,367 +1465,686 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (!SelectAddr(In1, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
return NULL;
- SDValue LSI = Node->getOperand(4); // MemOperand
- const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, LSI, Chain};
- return CurDAG->getTargetNode(Opc, Node->getDebugLoc(),
- MVT::i32, MVT::i32, MVT::Other, Ops,
- array_lengthof(Ops));
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other, Ops,
+ array_lengthof(Ops));
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+ return ResNode;
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
+ if (Node->hasAnyUseOfValue(0))
+ return 0;
+
+ // Optimize common patterns for __sync_add_and_fetch and
+ // __sync_sub_and_fetch where the result is not used. This allows us
+ // to use "lock" version of add, sub, inc, dec instructions.
+ // FIXME: Do not use special instructions but instead add the "lock"
+ // prefix to the target node somehow. The extra information will then be
+ // transferred to the machine instruction, where it denotes the prefix.
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ptr = Node->getOperand(1);
+ SDValue Val = Node->getOperand(2);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(Ptr, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return 0;
+
+ bool isInc = false, isDec = false, isSub = false, isCN = false;
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
+ if (CN) {
+ isCN = true;
+ int64_t CNVal = CN->getSExtValue();
+ if (CNVal == 1)
+ isInc = true;
+ else if (CNVal == -1)
+ isDec = true;
+ else if (CNVal >= 0)
+ Val = CurDAG->getTargetConstant(CNVal, NVT);
+ else {
+ isSub = true;
+ Val = CurDAG->getTargetConstant(-CNVal, NVT);
+ }
+ } else if (Val.hasOneUse() &&
+ Val.getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0))) {
+ isSub = true;
+ Val = Val.getOperand(1);
+ }
+
+ unsigned Opc = 0;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ if (isInc)
+ Opc = X86::LOCK_INC8m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC8m;
+ else if (isSub) {
+ if (isCN)
+ Opc = X86::LOCK_SUB8mi;
+ else
+ Opc = X86::LOCK_SUB8mr;
+ } else {
+ if (isCN)
+ Opc = X86::LOCK_ADD8mi;
+ else
+ Opc = X86::LOCK_ADD8mr;
+ }
+ break;
+ case MVT::i16:
+ if (isInc)
+ Opc = X86::LOCK_INC16m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC16m;
+ else if (isSub) {
+ if (isCN) {
+ if (Predicate_i16immSExt8(Val.getNode()))
+ Opc = X86::LOCK_SUB16mi8;
+ else
+ Opc = X86::LOCK_SUB16mi;
+ } else
+ Opc = X86::LOCK_SUB16mr;
+ } else {
+ if (isCN) {
+ if (Predicate_i16immSExt8(Val.getNode()))
+ Opc = X86::LOCK_ADD16mi8;
+ else
+ Opc = X86::LOCK_ADD16mi;
+ } else
+ Opc = X86::LOCK_ADD16mr;
+ }
+ break;
+ case MVT::i32:
+ if (isInc)
+ Opc = X86::LOCK_INC32m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC32m;
+ else if (isSub) {
+ if (isCN) {
+ if (Predicate_i32immSExt8(Val.getNode()))
+ Opc = X86::LOCK_SUB32mi8;
+ else
+ Opc = X86::LOCK_SUB32mi;
+ } else
+ Opc = X86::LOCK_SUB32mr;
+ } else {
+ if (isCN) {
+ if (Predicate_i32immSExt8(Val.getNode()))
+ Opc = X86::LOCK_ADD32mi8;
+ else
+ Opc = X86::LOCK_ADD32mi;
+ } else
+ Opc = X86::LOCK_ADD32mr;
+ }
+ break;
+ case MVT::i64:
+ if (isInc)
+ Opc = X86::LOCK_INC64m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC64m;
+ else if (isSub) {
+ Opc = X86::LOCK_SUB64mr;
+ if (isCN) {
+ if (Predicate_i64immSExt8(Val.getNode()))
+ Opc = X86::LOCK_SUB64mi8;
+ else if (Predicate_i64immSExt32(Val.getNode()))
+ Opc = X86::LOCK_SUB64mi32;
+ }
+ } else {
+ Opc = X86::LOCK_ADD64mr;
+ if (isCN) {
+ if (Predicate_i64immSExt8(Val.getNode()))
+ Opc = X86::LOCK_ADD64mi8;
+ else if (Predicate_i64immSExt32(Val.getNode()))
+ Opc = X86::LOCK_ADD64mi32;
+ }
+ }
+ break;
+ }
+
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF,
+ dl, NVT), 0);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ if (isInc || isDec) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+ SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+ } else {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+ SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+ }
+}
+
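
At the source level, SelectAtomicLoadAdd targets atomic updates whose result is thrown away; the hasAnyUseOfValue(0) guard bails out otherwise. A hedged illustration (the exact instructions chosen depend on the operand, as the switch above shows):

    void bump(volatile int *Counter) {
      __sync_fetch_and_add(Counter, 1);   // result unused
    }
    //   expected lowering (illustrative):  lock incl (%rdi)
    void drop(volatile int *Counter, int N) {
      __sync_sub_and_fetch(Counter, N);   // result unused
    }
    //   expected lowering (illustrative):  lock subl %esi, (%rdi)
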
+/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
+/// any uses which require the SF or OF bits to be accurate.
+static bool HasNoSignedComparisonUses(SDNode *N) {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); UI != UE; ++UI) {
+ // Only examine CopyToReg uses.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+ X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the opcode of the user.
+ switch (FlagUI->getMachineOpcode()) {
+ // These comparisons don't treat the most significant bit specially.
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+ case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+ case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+ case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+ case X86::JA: case X86::JAE: case X86::JB: case X86::JBE:
+ case X86::JE: case X86::JNE: case X86::JP: case X86::JNP:
+ case X86::CMOVA16rr: case X86::CMOVA16rm:
+ case X86::CMOVA32rr: case X86::CMOVA32rm:
+ case X86::CMOVA64rr: case X86::CMOVA64rm:
+ case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+ case X86::CMOVB16rr: case X86::CMOVB16rm:
+ case X86::CMOVB32rr: case X86::CMOVB32rm:
+ case X86::CMOVB64rr: case X86::CMOVB64rm:
+ case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+ case X86::CMOVE16rr: case X86::CMOVE16rm:
+ case X86::CMOVE32rr: case X86::CMOVE32rm:
+ case X86::CMOVE64rr: case X86::CMOVE64rm:
+ case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+ case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+ case X86::CMOVP16rr: case X86::CMOVP16rm:
+ case X86::CMOVP32rr: case X86::CMOVP32rm:
+ case X86::CMOVP64rr: case X86::CMOVP64rm:
+ continue;
+ // Anything else: assume conservatively.
+ default: return false;
+ }
+ }
+ }
+ return true;
}
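
Why the SF/OF screen matters: narrowing a test moves the bit from which the sign flag is computed, while ZF, PF, and CF are unaffected. Two helpers make the hazard concrete (my example, not from the patch):

    #include <cstdint>
    // SF as "testl $0x80, %eax" computes it: bit 31 of (eax & 0x80),
    // which is always zero.
    static bool sf32(uint32_t eax) { return ((eax & 0x80u) >> 31) & 1; }
    // SF as the narrowed "testb $0x80, %al" computes it: bit 7 of eax.
    static bool sf8(uint32_t eax)  { return ((eax & 0x80u) >> 7) & 1; }
    // Unsigned conditions (sete, ja, jb, ...) never read SF/OF, so the
    // whitelist above admits them; anything signed could observe the
    // difference whenever the mask's top bit is set -- hence the
    // "sign bit clear || HasNoSignedComparisonUses" guards below.
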
SDNode *X86DAGToDAGISel::Select(SDValue N) {
SDNode *Node = N.getNode();
- MVT NVT = Node->getValueType(0);
+ EVT NVT = Node->getValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
DebugLoc dl = Node->getDebugLoc();
#ifndef NDEBUG
- DOUT << std::string(Indent, ' ') << "Selecting: ";
- DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent, ' ') << "Selecting: ";
+ Node->dump(CurDAG);
+ errs() << '\n';
+ });
Indent += 2;
#endif
if (Node->isMachineOpcode()) {
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "== ";
- DEBUG(Node->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "== ";
+ Node->dump(CurDAG);
+ errs() << '\n';
+ });
Indent -= 2;
#endif
return NULL; // Already selected.
}
switch (Opcode) {
- default: break;
- case X86ISD::GlobalBaseReg:
- return getGlobalBaseReg();
-
- case X86ISD::ATOMOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMOR6432);
- case X86ISD::ATOMXOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMXOR6432);
- case X86ISD::ATOMADD64_DAG:
- return SelectAtomic64(Node, X86::ATOMADD6432);
- case X86ISD::ATOMSUB64_DAG:
- return SelectAtomic64(Node, X86::ATOMSUB6432);
- case X86ISD::ATOMNAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMNAND6432);
- case X86ISD::ATOMAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMAND6432);
- case X86ISD::ATOMSWAP64_DAG:
- return SelectAtomic64(Node, X86::ATOMSWAP6432);
-
- case ISD::SMUL_LOHI:
- case ISD::UMUL_LOHI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
-
- bool isSigned = Opcode == ISD::SMUL_LOHI;
- if (!isSigned)
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
- case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
- case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
- case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
- }
- else
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
- case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
- case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
- case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
- }
+ default: break;
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case X86ISD::ATOMOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMOR6432);
+ case X86ISD::ATOMXOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMXOR6432);
+ case X86ISD::ATOMADD64_DAG:
+ return SelectAtomic64(Node, X86::ATOMADD6432);
+ case X86ISD::ATOMSUB64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSUB6432);
+ case X86ISD::ATOMNAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMNAND6432);
+ case X86ISD::ATOMAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMAND6432);
+ case X86ISD::ATOMSWAP64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSWAP6432);
+
+ case ISD::ATOMIC_LOAD_ADD: {
+ SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
- unsigned LoReg, HiReg;
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
- case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
- case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
- case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+ case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
}
-
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
- // multiplty is commmutative
- if (!foldedLoad) {
- foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
- if (foldedLoad)
- std::swap(N0, N1);
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
+ }
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
- N0, SDValue()).getValue(1);
-
- if (foldedLoad) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
- InFlag };
- SDNode *CNode =
- CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
- array_lengthof(Ops));
- InFlag = SDValue(CNode, 1);
- // Update the chain.
- ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
- } else {
- InFlag =
- SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
- }
+ unsigned LoReg, HiReg;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
+ case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
+ case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+ case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ }
- // Copy the low half of the result, if it is needed.
- if (!N.getValue(0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- LoReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(N.getValue(0), Result);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commutative.
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the low half of the result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
- DEBUG(Result.getNode()->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "=> ";
+ Result.getNode()->dump(CurDAG);
+ errs() << '\n';
+ });
#endif
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)), 0);
+ // Then truncate it down to i8.
+ Result = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+ MVT::i8, Result);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
}
- // Copy the high half of the result, if it is needed.
- if (!N.getValue(1).use_empty()) {
- SDValue Result;
- if (HiReg == X86::AH && Subtarget->is64Bit()) {
- // Prevent use of AH in a REX instruction by referencing AX instead.
- // Shift it down 8 bits.
- Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::AX, MVT::i16, InFlag);
- InFlag = Result.getValue(2);
- Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
- Result,
- CurDAG->getTargetConstant(8, MVT::i8)), 0);
- // Then truncate it down to i8.
- SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
- Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
- MVT::i8, Result, SRIdx), 0);
- } else {
- Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- HiReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- }
- ReplaceUses(N.getValue(1), Result);
+ ReplaceUses(N.getValue(1), Result);
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
- DEBUG(Result.getNode()->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "=> ";
+ Result.getNode()->dump(CurDAG);
+ errs() << '\n';
+ });
#endif
- }
+ }
#ifndef NDEBUG
- Indent -= 2;
+ Indent -= 2;
#endif
- return NULL;
- }
-
- case ISD::SDIVREM:
- case ISD::UDIVREM: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
-
- bool isSigned = Opcode == ISD::SDIVREM;
- if (!isSigned)
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
- case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
- case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
- case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
- }
- else
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
- case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
- case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
- case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
- }
+ return NULL;
+ }
- unsigned LoReg, HiReg;
- unsigned ClrOpcode, SExtOpcode;
- switch (NVT.getSimpleVT()) {
- default: assert(0 && "Unsupported VT!");
- case MVT::i8:
- LoReg = X86::AL; HiReg = X86::AH;
- ClrOpcode = 0;
- SExtOpcode = X86::CBW;
- break;
- case MVT::i16:
- LoReg = X86::AX; HiReg = X86::DX;
- ClrOpcode = X86::MOV16r0;
- SExtOpcode = X86::CWD;
- break;
- case MVT::i32:
- LoReg = X86::EAX; HiReg = X86::EDX;
- ClrOpcode = X86::MOV32r0;
- SExtOpcode = X86::CDQ;
- break;
- case MVT::i64:
- LoReg = X86::RAX; HiReg = X86::RDX;
- ClrOpcode = X86::MOV64r0;
- SExtOpcode = X86::CQO;
- break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
- bool signBitIsZero = CurDAG->SignBitIsZero(N0);
-
- SDValue InFlag;
- if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
- // Special case for div8, just use a move with zero extension to AX to
- // clear the upper 8 bits (AH).
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
- if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
- Move =
- SDValue(CurDAG->getTargetNode(X86::MOVZX16rm8, dl, MVT::i16,
- MVT::Other, Ops,
- array_lengthof(Ops)), 0);
- Chain = Move.getValue(1);
- ReplaceUses(N0.getValue(1), Chain);
- } else {
- Move =
- SDValue(CurDAG->getTargetNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
- Chain = CurDAG->getEntryNode();
- }
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
- InFlag = Chain.getValue(1);
+ unsigned LoReg, HiReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; HiReg = X86::RDX;
+ ClrOpcode = ~0U; // NOT USED.
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+ MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
} else {
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
InFlag =
- CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
- LoReg, N0, SDValue()).getValue(1);
- if (isSigned && !signBitIsZero) {
- // Sign extend the low part into the high part.
- InFlag =
- SDValue(CurDAG->getTargetNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode;
+
+ if (NVT.getSimpleVT() == MVT::i64) {
+ ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, MVT::i32),
+ 0);
+ // We just did a 32-bit clear, insert it into a 64-bit register to
+ // clear the whole 64-bit reg.
+ SDValue Undef =
+ SDValue(CurDAG->getMachineNode(TargetInstrInfo::IMPLICIT_DEF,
+ dl, MVT::i64), 0);
+ SDValue SubRegNo =
+ CurDAG->getTargetConstant(X86::SUBREG_32BIT, MVT::i32);
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(TargetInstrInfo::INSERT_SUBREG, dl,
+ MVT::i64, Undef, ClrNode, SubRegNo),
+ 0);
} else {
- // Zero out the high part, effectively zero extending the input.
- SDValue ClrNode = SDValue(CurDAG->getTargetNode(ClrOpcode, dl, NVT),
- 0);
- InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, HiReg,
- ClrNode, InFlag).getValue(1);
+ ClrNode = SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
}
- }
- if (foldedLoad) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
- InFlag };
- SDNode *CNode =
- CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
- array_lengthof(Ops));
- InFlag = SDValue(CNode, 1);
- // Update the chain.
- ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
- } else {
- InFlag =
- SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, HiReg,
+ ClrNode, InFlag).getValue(1);
}
+ }
- // Copy the division (low) result, if it is needed.
- if (!N.getValue(0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- LoReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(N.getValue(0), Result);
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the division (low) result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
- DEBUG(Result.getNode()->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "=> ";
+ Result.getNode()->dump(CurDAG);
+ errs() << '\n';
+ });
#endif
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ Result = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+ MVT::i8, Result);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
}
- // Copy the remainder (high) result, if it is needed.
- if (!N.getValue(1).use_empty()) {
- SDValue Result;
- if (HiReg == X86::AH && Subtarget->is64Bit()) {
- // Prevent use of AH in a REX instruction by referencing AX instead.
- // Shift it down 8 bits.
- Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::AX, MVT::i16, InFlag);
- InFlag = Result.getValue(2);
- Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
- Result,
- CurDAG->getTargetConstant(8, MVT::i8)),
- 0);
- // Then truncate it down to i8.
- SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
- Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
- MVT::i8, Result, SRIdx), 0);
- } else {
- Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- HiReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- }
- ReplaceUses(N.getValue(1), Result);
+ ReplaceUses(N.getValue(1), Result);
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
- DEBUG(Result.getNode()->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "=> ";
+ Result.getNode()->dump(CurDAG);
+ errs() << '\n';
+ });
#endif
- }
+ }
#ifndef NDEBUG
- Indent -= 2;
+ Indent -= 2;
#endif
- return NULL;
- }
+ return NULL;
+ }
- case ISD::DECLARE: {
- // Handle DECLARE nodes here because the second operand may have been
- // wrapped in X86ISD::Wrapper.
- SDValue Chain = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
-
- // FIXME: We need to handle this for VLAs.
- if (!FINode) {
- ReplaceUses(N.getValue(0), Chain);
- return NULL;
+ case X86ISD::CMP: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8 &&
+ X86::isZeroNode(N1)) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ if (!C) break;
+
+ // For example, convert "testl %eax, $8" to "testb %al, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+ (!(C->getZExtValue() & 0x80) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // On x86-32, only the ABCD registers have 8-bit subregisters.
+ if (!Subtarget->is64Bit()) {
+ TargetRegisterClass *TRC = 0;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+ }
+
+ // Extract the l-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb.
+ return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm);
}
-
- if (N2.getOpcode() == ISD::ADD &&
- N2.getOperand(0).getOpcode() == X86ISD::GlobalBaseReg)
- N2 = N2.getOperand(1);
-
- // If N2 is not Wrapper(decriptor) then the llvm.declare is mangled
- // somehow, just ignore it.
- if (N2.getOpcode() != X86ISD::Wrapper &&
- N2.getOpcode() != X86ISD::WrapperRIP) {
- ReplaceUses(N.getValue(0), Chain);
- return NULL;
+
+ // For example, "testl %eax, $2048" to "testb %ah, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ // Shift the immediate right by 8 bits.
+ SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+ MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Put the value in an ABCD register.
+ TargetRegisterClass *TRC = 0;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+
+ // Extract the h-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT_HI, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb. No special NOREX tricks are needed since there's
+ // only one GPR operand!
+ return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, ShiftedImm);
}
- GlobalAddressSDNode *GVNode =
- dyn_cast<GlobalAddressSDNode>(N2.getOperand(0));
- if (GVNode == 0) {
- ReplaceUses(N.getValue(0), Chain);
- return NULL;
+
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+ N0.getValueType() != MVT::i16 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 16-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_16BIT, dl,
+ MVT::i16, Reg);
+
+ // Emit a testw.
+ return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm);
+ }
+
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+ N0.getValueType() == MVT::i64 &&
+ (!(C->getZExtValue() & 0x80000000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 32-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_32BIT, dl,
+ MVT::i32, Reg);
+
+ // Emit a testl.
+ return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm);
}
- SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
- TLI.getPointerTy());
- SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GVNode->getGlobal(),
- TLI.getPointerTy());
- SDValue Ops[] = { Tmp1, Tmp2, Chain };
- return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
- MVT::Other, Ops,
- array_lengthof(Ops));
}
+ break;
+ }
}
SDNode *ResNode = SelectCode(N);
#ifndef NDEBUG
- DOUT << std::string(Indent-2, ' ') << "=> ";
- if (ResNode == NULL || ResNode == N.getNode())
- DEBUG(N.getNode()->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DOUT << "\n";
+ DEBUG({
+ errs() << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == N.getNode())
+ N.getNode()->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ errs() << '\n';
+ });
Indent -= 2;
#endif
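
The X86ISD::CMP case added above cascades through four width reductions of "test reg, imm" when the AND has one use and the compare is against zero; each sign-bit case additionally requires HasNoSignedComparisonUses. A runnable mirror of the mask classification (simplified: it drops the operand-type constraints the real code also checks):

    #include <cstdint>
    enum Width { W8LO, W8HI, W16, W32, NONE };
    // Which narrowed test the mask C permits, smallest first:
    static Width narrowTest(uint64_t C) {
      if ((C & ~UINT64_C(0xff))       == 0) return W8LO; // testb %al, imm8
      if ((C & ~UINT64_C(0xff00))     == 0) return W8HI; // testb %ah, imm8
      if ((C & ~UINT64_C(0xffff))     == 0) return W16;  // testw %ax, imm16
      if ((C & ~UINT64_C(0xffffffff)) == 0) return W32;  // testl %eax, imm32
      return NONE;
    }
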
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5a6294a..fadc818 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,13 +16,16 @@
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
+#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -33,21 +36,48 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+// Disable16Bit - 16-bit operations typically have a larger encoding than
+// corresponding 32-bit instructions, and 16-bit code is slow on some
+// processors. This is an experimental flag to disable 16-bit operations
+// (which forces them to be Legalized to 32-bit operations).
+static cl::opt<bool>
+Disable16Bit("disable-16bit", cl::Hidden,
+ cl::desc("Disable use of 16-bit instructions"));
+
// Forward declarations.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2);
+static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
+ switch (TM.getSubtarget<X86Subtarget>().TargetType) {
+ default: llvm_unreachable("unknown subtarget type");
+ case X86Subtarget::isDarwin:
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return new X8664_MachoTargetObjectFile();
+ return new X8632_MachoTargetObjectFile();
+ case X86Subtarget::isELF:
+ return new TargetLoweringObjectFileELF();
+ case X86Subtarget::isMingw:
+ case X86Subtarget::isCygwin:
+ case X86Subtarget::isWindows:
+ return new TargetLoweringObjectFileCOFF();
+ }
+}
+
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
- : TargetLowering(TM) {
+ : TargetLowering(TM, createTLOF(TM)) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
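
Disable16Bit is a hidden, off-by-default switch (accepted but not advertised by -help), motivated by the fact that 16-bit x86 instructions carry the 0x66 operand-size prefix and tend to encode larger and decode slower than their 32-bit forms. Its assumed effect, for illustration:

    // With i16 no longer a legal type, IR like
    //   %s = add i16 %a, %b
    // should be promoted and lowered as a 32-bit operation:
    //   addl %esi, %edi        ; instead of: addw %si, %di
    // Exercised (assumed invocation): llc -disable-16bit foo.bc
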
@@ -62,7 +92,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setShiftAmountType(MVT::i8);
setBooleanContents(ZeroOrOneBooleanContent);
setSchedulingPreference(SchedulingForRegPressure);
- setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
setStackPointerRegisterToSaveRestore(X86StackPtr);
if (Subtarget->isTargetDarwin()) {
@@ -80,7 +109,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Set up the register classes.
addRegisterClass(MVT::i8, X86::GR8RegisterClass);
- addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+ if (!Disable16Bit)
+ addRegisterClass(MVT::i16, X86::GR16RegisterClass);
addRegisterClass(MVT::i32, X86::GR32RegisterClass);
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, X86::GR64RegisterClass);
@@ -89,9 +119,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
- setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ if (!Disable16Bit)
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
- setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ if (!Disable16Bit)
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
@@ -242,8 +274,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
- setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ if (Disable16Bit) {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Expand);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ }
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
@@ -257,16 +294,22 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
- setOperationAction(ISD::SELECT , MVT::i1 , Promote);
- setOperationAction(ISD::SELECT , MVT::i8 , Promote);
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
- setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom);
+ if (Disable16Bit)
+ setOperationAction(ISD::SELECT , MVT::i16 , Expand);
+ else
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
setOperationAction(ISD::SELECT , MVT::i32 , Custom);
setOperationAction(ISD::SELECT , MVT::f32 , Custom);
setOperationAction(ISD::SELECT , MVT::f64 , Custom);
setOperationAction(ISD::SELECT , MVT::f80 , Custom);
setOperationAction(ISD::SETCC , MVT::i8 , Custom);
- setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ if (Disable16Bit)
+ setOperationAction(ISD::SETCC , MVT::i16 , Expand);
+ else
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
setOperationAction(ISD::SETCC , MVT::i32 , Custom);
setOperationAction(ISD::SETCC , MVT::f32 , Custom);
setOperationAction(ISD::SETCC , MVT::f64 , Custom);
@@ -275,8 +318,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SELECT , MVT::i64 , Custom);
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
}
- // X86 ret instruction may pop stack.
- setOperationAction(ISD::RET , MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// Darwin ABI issue.
@@ -330,7 +371,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
}
- // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
+ // Use the default ISD::DBG_STOPPOINT.
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
// FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() &&
@@ -637,6 +678,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
}
if (!UseSoftFloat && Subtarget->hasSSE1()) {
@@ -696,16 +740,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ EVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
continue;
// Do not attempt to custom lower non-128-bit vectors
if (!VT.is128BitVector())
continue;
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR,
+ VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,
+ VT.getSimpleVT().SimpleTy, Custom);
}
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
@@ -722,22 +769,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
- MVT VT = (MVT::SimpleValueType)i;
+ MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+ EVT VT = SVT;
// Do not attempt to promote non-128-bit vectors
if (!VT.is128BitVector()) {
continue;
}
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v2i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v2i64);
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
- setOperationAction(ISD::SELECT, VT, Promote);
- AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+ setOperationAction(ISD::AND, SVT, Promote);
+ AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
+ setOperationAction(ISD::OR, SVT, Promote);
+ AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
+ setOperationAction(ISD::XOR, SVT, Promote);
+ AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, SVT, Promote);
+ AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, SVT, Promote);
+ AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
}
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -847,7 +895,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
// This includes 256-bit vectors
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ EVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
@@ -861,7 +909,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
- }
+ }
#endif
#if 0
@@ -871,7 +919,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
// Including 256-bit vectors
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
- MVT VT = (MVT::SimpleValueType)i;
+ EVT VT = (MVT::SimpleValueType)i;
if (!VT.is256BitVector()) {
continue;
@@ -933,13 +981,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
- allowUnalignedMemoryAccesses = true; // x86 supports it!
setPrefLoopAlignment(16);
benefitFromCodePlacementOpt = true;
}
-MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i8;
}
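The i8 result type above matches the hardware: the x86 SETcc family writes a single byte register. A hedged illustration at the C level (the assembly in the comment is what a typical x86 compiler emits, not output from this patch):

#include <cassert>

// On x86 this typically becomes: cmp edi, esi / setl al
// The boolean is materialized in the 8-bit AL register, which is
// exactly the MVT::i8 result type reported above.
static bool lessThan(int a, int b) { return a < b; }

int main() {
  assert(lessThan(1, 2));
  assert(!lessThan(2, 1));
  assert(!lessThan(3, 3));
  return 0;
}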
@@ -993,7 +1040,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
-MVT
+EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
bool isSrcConst, bool isSrcStr,
SelectionDAG &DAG) const {
@@ -1019,7 +1066,7 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (usesGlobalOffsetTable())
return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
- if (!Subtarget->isPICStyleRIPRel())
+ if (!Subtarget->is64Bit())
// This doesn't have DebugLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
@@ -1029,7 +1076,7 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
- return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 4;
+ return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}
//===----------------------------------------------------------------------===//
@@ -1038,16 +1085,16 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
#include "X86GenCallingConv.inc"
-/// LowerRET - Lower an ISD::RET node.
-SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
- assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
@@ -1056,49 +1103,19 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
if (RVLocs[i].isRegLoc())
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- SDValue Chain = Op.getOperand(0);
-
- // Handle tail call return.
- Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
- if (Chain.getOpcode() == X86ISD::TAILCALL) {
- SDValue TailCall = Chain;
- SDValue TargetAddress = TailCall.getOperand(1);
- SDValue StackAdjustment = TailCall.getOperand(2);
- assert(((TargetAddress.getOpcode() == ISD::Register &&
- (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
- cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R11)) ||
- TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
- TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
- "Expecting an global address, external symbol, or register");
- assert(StackAdjustment.getOpcode() == ISD::Constant &&
- "Expecting a const value");
-
- SmallVector<SDValue,8> Operands;
- Operands.push_back(Chain.getOperand(0));
- Operands.push_back(TargetAddress);
- Operands.push_back(StackAdjustment);
- // Copy registers used by the call. Last operand is a flag so it is not
- // copied.
- for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
- Operands.push_back(Chain.getOperand(i));
- }
- return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
- Operands.size());
- }
- // Regular return.
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
- RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
+ RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));
// Copy the result values into the output registers.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue ValToCopy = Op.getOperand(i*2+1);
+ SDValue ValToCopy = Outs[i].Val;
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -1116,7 +1133,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget->is64Bit()) {
- MVT ValVT = ValToCopy.getValueType();
+ EVT ValVT = ValToCopy.getValueType();
if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
@@ -1145,6 +1162,9 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
Flag = Chain.getValue(1);
+
+ // RAX now acts like a return value.
+ MF.getRegInfo().addLiveOut(X86::RAX);
}
RetOps[0] = Chain; // Update chain.
@@ -1157,36 +1177,32 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
MVT::Other, &RetOps[0], RetOps.size());
}
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. The returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode *X86TargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG) {
-
- DebugLoc dl = TheCall->getDebugLoc();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool isVarArg = TheCall->isVarArg();
bool Is64Bit = Subtarget->is64Bit();
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
- CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
-
- SmallVector<SDValue, 8> ResultVals;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
- MVT CopyVT = VA.getValVT();
+ EVT CopyVT = VA.getValVT();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
- ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
- cerr << "SSE register return with SSE disabled\n";
- exit(1);
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ llvm_report_error("SSE register return with SSE disabled");
}
// If this is a call to a function that returns an fp value on the floating
@@ -1206,7 +1222,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
MVT::v2i64, InFlag).getValue(1);
Val = Chain.getValue(0);
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
- Val, DAG.getConstant(0, MVT::i64));
+ Val, DAG.getConstant(0, MVT::i64));
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
MVT::i64, InFlag).getValue(1);
@@ -1228,13 +1244,10 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
DAG.getIntPtrConstant(1));
}
- ResultVals.push_back(Val);
+ InVals.push_back(Val);
}
- // Merge everything together with a MERGE_VALUES node.
- ResultVals.push_back(Chain);
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
@@ -1248,30 +1261,28 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
-/// CallIsStructReturn - Determines whether a CALL node uses struct return
+/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
-static bool CallIsStructReturn(CallSDNode *TheCall) {
- unsigned NumOps = TheCall->getNumArgs();
- if (!NumOps)
+static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ if (Outs.empty())
return false;
- return TheCall->getArgFlags(0).isSRet();
+ return Outs[0].Flags.isSRet();
}
-/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
+/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
-static bool ArgsAreStructReturn(SDValue Op) {
- unsigned NumArgs = Op.getNode()->getNumValues() - 1;
- if (!NumArgs)
+static bool
+ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+ if (Ins.empty())
return false;
- return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
+ return Ins[0].Flags.isSRet();
}
-/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
-/// the callee to pop its own arguments. Callee pop is necessary to support tail
-/// calls.
-bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
+/// IsCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
if (IsVarArg)
return false;
@@ -1289,7 +1300,7 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
-CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
if (Subtarget->is64Bit()) {
if (Subtarget->isTargetWin64())
return CC_X86_Win64_C;
@@ -1305,36 +1316,18 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
return CC_X86_32_C;
}
-/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
-/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
+/// NameDecorationForCallConv - Selects the appropriate decoration to
+/// apply to a MachineFunction containing a given calling convention.
NameDecorationStyle
-X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
- unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- if (CC == CallingConv::X86_FastCall)
+X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
+ if (CallConv == CallingConv::X86_FastCall)
return FastCall;
- else if (CC == CallingConv::X86_StdCall)
+ else if (CallConv == CallingConv::X86_StdCall)
return StdCall;
return None;
}
-/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer
-/// in a register before calling.
-bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
- return !IsTailCall && !Is64Bit &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT();
-}
-
-/// CallRequiresFnAddressInReg - Check whether the call requires the function
-/// address to be loaded in a register.
-bool
-X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
- return !Is64Bit && IsTailCall &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT();
-}
-
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
@@ -1348,35 +1341,52 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
/*AlwaysInline=*/true, NULL, 0, NULL, 0);
}
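For readers unfamiliar with byval semantics, a small self-contained analogue of what the memcpy above achieves (illustrative source, not from this patch): the callee sees only a private copy of the aggregate.

#include <cassert>

struct Pair { int a, b; };

// The callee mutates only its private copy; with byval, the
// caller's aggregate is copied into the argument area first.
static void callee(Pair p) { p.a = 99; }

int main() {
  Pair p = {1, 2};
  callee(p);        // a copy of p is passed
  assert(p.a == 1); // the caller's object is untouched
  return 0;
}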
-SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
- const CCValAssign &VA,
- MachineFrameInfo *MFI,
- unsigned CC,
- SDValue Root, unsigned i) {
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned i) {
+
// Create the nodes corresponding to a load from this parameter slot.
- ISD::ArgFlagsTy Flags =
- cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
- bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+
+ // If the value is passed by pointer, we have the address passed instead of
+ // the value itself.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization, mark all arguments mutable, since they
// could be overwritten by the lowering of arguments in case of a tail call.
- int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
if (Flags.isByVal())
return FIN;
- return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
+ return DAG.getLoad(ValVT, dl, Chain, FIN,
PseudoSourceValue::getFixedStack(FI), 0);
}
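The CCValAssign::Indirect case handled above corresponds, at the source level, to passing the address of a spilled value rather than the value itself. A minimal sketch under that interpretation (names are illustrative):

#include <cassert>

struct Big { int data[32]; };

// The callee receives an address and loads through it -- the
// source-level picture of CCValAssign::Indirect.
static int calleeIndirect(const Big *arg) {
  return arg->data[0];
}

static int caller() {
  Big spill = {};      // caller-side spill slot
  spill.data[0] = 42;  // store the argument value
  return calleeIndirect(&spill); // pass its address
}

int main() {
  assert(caller() == 42);
  return 0;
}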
SDValue
-X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- DebugLoc dl = Op.getDebugLoc();
const Function* Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
@@ -1385,25 +1395,23 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
FuncInfo->setForceFramePointer(true);
// Decorate the function name.
- FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
+ FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
MachineFrameInfo *MFI = MF.getFrameInfo();
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = MF.getFunction()->getCallingConv();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
- assert(!(isVarArg && CC == CallingConv::Fast) &&
+ assert(!(isVarArg && CallConv == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
- SmallVector<SDValue, 8> ArgValues;
unsigned LastVal = ~0U;
+ SDValue ArgValue;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// TODO: If an arg is passed in two places (e.g. reg and stack), skip later
@@ -1413,7 +1421,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
- MVT RegVT = VA.getLocVT();
+ EVT RegVT = VA.getLocVT();
TargetRegisterClass *RC = NULL;
if (RegVT == MVT::i32)
RC = X86::GR32RegisterClass;
@@ -1425,27 +1433,13 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
RC = X86::FR64RegisterClass;
else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
RC = X86::VR128RegisterClass;
- else if (RegVT.isVector()) {
- assert(RegVT.getSizeInBits() == 64);
- if (!Is64Bit)
- RC = X86::VR64RegisterClass; // MMX values are passed in MMXs.
- else {
- // Darwin calling convention passes MMX values in either GPRs or
- // XMMs in x86-64. Other targets pass them in memory.
- if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
- RC = X86::VR128RegisterClass; // MMX values are passed in XMMs.
- RegVT = MVT::v2i64;
- } else {
- RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs.
- RegVT = MVT::i64;
- }
- }
- } else {
- assert(0 && "Unknown argument type!");
- }
+ else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+ RC = X86::VR64RegisterClass;
+ else
+ llvm_unreachable("Unknown argument type!");
- unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
@@ -1456,52 +1450,53 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
- if (VA.getLocInfo() != CCValAssign::Full)
- ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
-
- // Handle MMX values passed in GPRs.
- if (Is64Bit && RegVT != VA.getLocVT()) {
- if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
- ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
- else if (RC == X86::VR128RegisterClass) {
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector()) {
ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
ArgValue, DAG.getConstant(0, MVT::i64));
- ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
- }
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+ } else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
-
- ArgValues.push_back(ArgValue);
} else {
assert(VA.isMemLoc());
- ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
+ ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
}
+
+ // If the value is passed via a pointer, do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
+
+ InVals.push_back(ArgValue);
}
// The x86-64 ABI for returning structs by value requires that we copy
// the sret argument into %rax for the return. Save the argument into
// a virtual register so that we can access it from the return points.
- if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
- MachineFunction &MF = DAG.getMachineFunction();
+ if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
FuncInfo->setSRetReturnReg(Reg);
}
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align the stack specially for tail calls.
- if (PerformTailCallOpt && CC == CallingConv::Fast)
+ if (PerformTailCallOpt && CallConv == CallingConv::Fast)
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
- if (Is64Bit || CC != CallingConv::X86_FastCall) {
+ if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
}
if (Is64Bit) {
@@ -1558,75 +1553,81 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(VarArgsGPOffset));
+ unsigned Offset = VarArgsGPOffset;
for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
X86::GR64RegisterClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
- PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+ Offset);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
- DAG.getIntPtrConstant(8));
+ Offset += 8;
}
- // Now store the XMM (fp + vector) parameter registers.
- FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(VarArgsFPOffset));
- for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
- unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
- X86::VR128RegisterClass);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32);
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
- MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
- DAG.getIntPtrConstant(16));
+ if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 11> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+
+ unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+ SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+ SaveXMMOps.push_back(ALVal);
+
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
+
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
+ X86::VR128RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+ SaveXMMOps.push_back(Val);
+ }
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other,
+ &SaveXMMOps[0], SaveXMMOps.size()));
}
+
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
}
}
- ArgValues.push_back(Root);
-
// Some CCs need callee pop.
- if (IsCalleePop(isVarArg, CC)) {
+ if (IsCalleePop(isVarArg, CallConv)) {
BytesToPopOnReturn = StackSize; // Callee pops everything.
BytesCallerReserves = 0;
} else {
BytesToPopOnReturn = 0; // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
+ if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
BytesToPopOnReturn = 4;
BytesCallerReserves = StackSize;
}
if (!Is64Bit) {
RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
- if (CC == CallingConv::X86_FastCall)
+ if (CallConv == CallingConv::X86_FastCall)
VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
}
FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
- // Return the new list of results.
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
- &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+ return Chain;
}
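A compact mirror of the callee-pop bookkeeping above, with hypothetical names (bytesToPopOnReturn is ours, not an LLVM API); the count must fit the ret imm16 encoding, which is why LowerReturn builds it as an i16 constant:

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the logic above: callee-pop
// conventions pop the whole argument area; otherwise only an
// sret function pops the 4-byte hidden pointer on x86-32.
static uint16_t bytesToPopOnReturn(bool calleePop, unsigned stackSize,
                                   bool hasSRet) {
  if (calleePop)
    return static_cast<uint16_t>(stackSize); // ret imm16 pops it all
  return hasSRet ? 4 : 0;                    // hidden struct pointer
}

int main() {
  assert(bytesToPopOnReturn(true, 12, false) == 12); // stdcall-like
  assert(bytesToPopOnReturn(false, 12, true) == 4);  // C with sret
  assert(bytesToPopOnReturn(false, 12, false) == 0); // plain cdecl
  return 0;
}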
SDValue
-X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr,
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
- SDValue Chain,
- SDValue Arg, ISD::ArgFlagsTy Flags) {
- DebugLoc dl = TheCall->getDebugLoc();
- unsigned LocMemOffset = VA.getLocMemOffset();
+ ISD::ArgFlagsTy Flags) {
+ const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
+ unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
if (Flags.isByVal()) {
@@ -1649,7 +1650,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
if (!IsTailCall || FPDiff==0) return Chain;
// Adjust the Return address stack slot.
- MVT VT = getPointerTy();
+ EVT VT = getPointerTy();
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
@@ -1669,41 +1670,45 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
- MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+ EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
return Chain;
}
-SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+SDValue
+X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
MachineFunction &MF = DAG.getMachineFunction();
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- unsigned CC = TheCall->getCallingConv();
- bool isVarArg = TheCall->isVarArg();
- bool IsTailCall = TheCall->isTailCall() &&
- CC == CallingConv::Fast && PerformTailCallOpt;
- SDValue Callee = TheCall->getCallee();
bool Is64Bit = Subtarget->is64Bit();
- bool IsStructRet = CallIsStructReturn(TheCall);
- DebugLoc dl = TheCall->getDebugLoc();
+ bool IsStructRet = CallIsStructReturn(Outs);
- assert(!(isVarArg && CC == CallingConv::Fast) &&
+ assert((!isTailCall ||
+ (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
+ "IsEligibleForTailCallOptimization missed a case!");
+ assert(!(isVarArg && CallConv == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- if (PerformTailCallOpt && CC == CallingConv::Fast)
+ if (PerformTailCallOpt && CallConv == CallingConv::Fast)
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (IsTailCall) {
+ if (isTailCall) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
@@ -1719,7 +1724,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
SDValue RetAddrFrIdx;
// Load the return address for tail calls.
- Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -1730,57 +1735,54 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// of tail call optimization arguments are handled later.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = TheCall->getArg(i);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
- Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
- Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
- Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+ PseudoSourceValue::getFixedStack(FI), 0);
+ Arg = SpillSlot;
+ break;
+ }
}
if (VA.isRegLoc()) {
- if (Is64Bit) {
- MVT RegVT = VA.getLocVT();
- if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
- switch (VA.getLocReg()) {
- default:
- break;
- case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
- case X86::R8: {
- // Special case: passing MMX values in GPR registers.
- Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
- break;
- }
- case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
- case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
- // Special case: passing MMX values in XMM registers.
- Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
- Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
- Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
- break;
- }
- }
- }
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
- if (!IsTailCall || (IsTailCall && isByVal)) {
+ if (!isTailCall || (isTailCall && isByVal)) {
assert(VA.isMemLoc());
if (StackPtr.getNode() == 0)
StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, Arg, Flags));
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
}
}
}
@@ -1794,37 +1796,41 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
SDValue InFlag;
// Tail call byval lowering might overwrite argument registers, so in the case
// of tail call optimization the copies to registers are lowered later.
- if (!IsTailCall)
+ if (!isTailCall)
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
- // ELF / PIC requires GOT in the EBX register before function calls via PLT
- // GOT pointer.
- if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
- Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc::getUnknownLoc(),
- getPointerTy()),
- InFlag);
- InFlag = Chain.getValue(1);
- }
- // If we are tail calling and generating PIC/GOT style code load the address
- // of the callee into ecx. The value in ecx is used as target of the tail
- // jump. This is done to circumvent the ebx/callee-saved problem for tail
- // calls on PIC/GOT architectures. Normally we would just put the address of
- // GOT into ebx and then call target@PLT. But for tail callss ebx would be
- // restored (since ebx is callee saved) before jumping to the target@PLT.
- if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
- // Note: The actual moving to ecx is done further down.
- GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (G && !G->getGlobal()->hasHiddenVisibility() &&
- !G->getGlobal()->hasProtectedVisibility())
- Callee = LowerGlobalAddress(Callee, DAG);
- else if (isa<ExternalSymbolSDNode>(Callee))
- Callee = LowerExternalSymbol(Callee,DAG);
+
+ if (Subtarget->isPICStyleGOT()) {
+ // ELF / PIC requires the GOT pointer to be in the EBX register before
+ // making function calls via the PLT.
+ if (!isTailCall) {
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ // If we are tail calling and generating PIC/GOT style code load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
}
if (Is64Bit && isVarArg) {
@@ -1853,7 +1859,15 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// For tail calls lower the arguments to the 'real' stack slot.
- if (IsTailCall) {
+ if (isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
@@ -1863,8 +1877,8 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc()) {
assert(VA.isMemLoc());
- SDValue Arg = TheCall->getArg(i);
- ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
@@ -1879,12 +1893,13 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
getPointerTy());
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
- MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(
- DAG.getStore(Chain, dl, Arg, FIN,
+ DAG.getStore(ArgChain, dl, Arg, FIN,
PseudoSourceValue::getFixedStack(FI), 0));
}
}
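A concrete instance of the aliasing hazard the comment above describes, as an illustrative source-level example (not from this patch):

#include <cassert>

static int g(int a, int b) { return a - b; }

// If f tail-calls g, f's outgoing argument slots reuse the stack
// memory of its own incoming slots. Because the arguments are
// swapped, storing the new first argument could clobber the old
// second one -- hence the token factor ordering the loads first.
static int f(int a, int b) { return g(b, a); }

int main() {
  assert(f(10, 3) == -7); // g(3, 10) == 3 - 10
  return 0;
}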
@@ -1912,13 +1927,49 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
// We should use an extra load for direct calls to dllimported functions in
// non-JIT mode.
- if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
- getTargetMachine(), true))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
- G->getOffset());
+ GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportLinkage()) {
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+ // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the Leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
+ G->getOffset(), OpFlags);
+ }
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
- } else if (IsTailCall) {
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
+ // symbols should go through the PLT.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the Leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+ OpFlags);
+ } else if (isTailCall) {
unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl,
@@ -1926,27 +1977,23 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
Callee,InFlag);
Callee = DAG.getRegister(Opc, getPointerTy());
// Add register as live out.
- DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
+ MF.getRegInfo().addLiveOut(Opc);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
SmallVector<SDValue, 8> Ops;
- if (IsTailCall) {
+ if (isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(0, true), InFlag);
InFlag = Chain.getValue(1);
-
- // Returns a chain & a flag for retval copy to use.
- NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
- Ops.clear();
}
Ops.push_back(Chain);
Ops.push_back(Callee);
- if (IsTailCall)
+ if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
// Add argument registers to the end of the list so that they are known live
@@ -1956,9 +2003,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
RegsToPass[i].second.getValueType()));
// Add an implicit use of the GOT pointer in EBX.
- if (!IsTailCall && !Is64Bit &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- Subtarget->isPICStyleGOT())
+ if (!isTailCall && Subtarget->isPICStyleGOT())
Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
// Add an implicit use of AL for x86 vararg functions.
@@ -1968,13 +2013,28 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
if (InFlag.getNode())
Ops.push_back(InFlag);
- if (IsTailCall) {
- assert(InFlag.getNode() &&
- "Flag must be set. Depend on flag being set in LowerRET");
- Chain = DAG.getNode(X86ISD::TAILCALL, dl,
- TheCall->getVTList(), &Ops[0], Ops.size());
+ if (isTailCall) {
+ // If this is the first return lowered for this function, add the regs
+ // to the liveout set for the function.
+ if (MF.getRegInfo().liveout_empty()) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ assert(((Callee.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
+ cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
+ Callee.getOpcode() == ISD::TargetExternalSymbol ||
+ Callee.getOpcode() == ISD::TargetGlobalAddress) &&
+ "Expecting an global address, external symbol, or register");
- return SDValue(Chain.getNode(), Op.getResNo());
+ return DAG.getNode(X86ISD::TC_RETURN, dl,
+ NodeTys, &Ops[0], Ops.size());
}
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
@@ -1982,9 +2042,9 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush;
- if (IsCalleePop(isVarArg, CC))
+ if (IsCalleePop(isVarArg, CallConv))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
- else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
+ else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
@@ -2002,8 +2062,8 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
- Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
}
@@ -2060,36 +2120,18 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
return Offset;
}
-/// IsEligibleForTailCallElimination - Check to see whether the next instruction
-/// following the call is a return. A function is eligible if caller/callee
-/// calling conventions match, currently only fastcc supports tail calls, and
-/// the function CALL is immediatly followed by a RET.
-bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
- SDValue Ret,
- SelectionDAG& DAG) const {
- if (!PerformTailCallOpt)
- return false;
-
- if (CheckTailCallReturnConstraints(TheCall, Ret)) {
- MachineFunction &MF = DAG.getMachineFunction();
- unsigned CallerCC = MF.getFunction()->getCallingConv();
- unsigned CalleeCC= TheCall->getCallingConv();
- if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
- SDValue Callee = TheCall->getCallee();
- // On x86/32Bit PIC/GOT tail calls are supported.
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
- !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
- return true;
-
- // Can only do local tail calls (in same module, hidden or protected) on
- // x86_64 PIC/GOT at the moment.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- return G->getGlobal()->hasHiddenVisibility()
- || G->getGlobal()->hasProtectedVisibility();
- }
- }
-
- return false;
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+ return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
}
FastISel *
@@ -2133,6 +2175,36 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
}
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+ // The offset should fit into a 32-bit immediate field.
+ if (!isInt32(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement, there are no extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+ // For the small code model we assume that the highest object lies at least
+ // 16MB below the 2^31 boundary, so offsets up to 16MB are safe. We may also
+ // accept fairly large negative constants, knowing that all objects are in
+ // the positive half of the address space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+ // For the kernel code model we know that all objects reside in the negative
+ // half of the 32-bit address space. We must not accept negative offsets,
+ // since they may be just out of range, but we may accept fairly large
+ // positive ones.
+ if (M == CodeModel::Kernel && Offset > 0)
+ return true;
+
+ return false;
+}
+
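A standalone mirror of the offset policy above, with hypothetical names (offsetOK and Model are ours, not LLVM APIs), useful for checking the boundary cases by hand:

#include <cassert>
#include <cstdint>

enum class Model { Small, Kernel };

// Mirror of the checks above: a 32-bit immediate always, plus the
// per-model bound when a symbolic displacement is present.
static bool offsetOK(int64_t off, Model m, bool symbolic) {
  if (off != static_cast<int32_t>(off))
    return false;                       // must fit in 32 bits
  if (!symbolic)
    return true;                        // no extra restrictions
  if (m == Model::Small)
    return off < 16 * 1024 * 1024;      // below the 16MB headroom
  return off > 0;                       // Kernel: positive only
}

int main() {
  assert(offsetOK(1 << 20, Model::Small, true));
  assert(!offsetOK(32 * 1024 * 1024, Model::Small, true));
  assert(!offsetOK(-8, Model::Kernel, true));
  assert(offsetOK(INT64_C(1) << 33, Model::Small, false) == false);
  return 0;
}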
/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
@@ -2155,7 +2227,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
}
switch (SetCCOpcode) {
- default: assert(0 && "Invalid integer condition!");
+ default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
@@ -2195,7 +2267,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
- default: assert(0 && "Condcode should be pre-legalized away");
+ default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
@@ -2253,7 +2325,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
-static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -2262,68 +2334,68 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
}
bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
+ SmallVector<int, 8> M;
N->getMask(M);
return ::isPSHUFDMask(M, N->getValueType(0));
}
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
-
+
// Lower quadword copied in order or undef.
for (int i = 0; i != 4; ++i)
if (Mask[i] >= 0 && Mask[i] != i)
return false;
-
+
// Upper quadword shuffled.
for (int i = 4; i != 8; ++i)
if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
return false;
-
+
return true;
}
bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
+ SmallVector<int, 8> M;
N->getMask(M);
return ::isPSHUFHWMask(M, N->getValueType(0));
}
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
-
+
// Upper quadword copied in order.
for (int i = 4; i != 8; ++i)
if (Mask[i] >= 0 && Mask[i] != i)
return false;
-
+
// Lower quadword shuffled.
for (int i = 0; i != 4; ++i)
if (Mask[i] >= 4)
return false;
-
+
return true;
}
bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
+ SmallVector<int, 8> M;
N->getMask(M);
return ::isPSHUFLWMask(M, N->getValueType(0));
}
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4)
return false;
-
+
int Half = NumElems / 2;
for (int i = 0; i < Half; ++i)
if (!isUndefOrInRange(Mask[i], 0, NumElems))
@@ -2331,7 +2403,7 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
for (int i = Half; i < NumElems; ++i)
if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
return false;
-
+
return true;
}
@@ -2345,12 +2417,12 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
/// the reverse of what x86 shuffles want. x86 shuffles require the lower
/// half elements to come from vector 1 (which would equal the dest.) and
/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
+
+ if (NumElems != 2 && NumElems != 4)
return false;
-
+
int Half = NumElems / 2;
for (int i = 0; i < Half; ++i)
if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
@@ -2424,24 +2496,24 @@ bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
unsigned NumElems = N->getValueType(0).getVectorNumElements();
-
+
if (NumElems != 4)
return false;
-
- return isUndefOrEqual(N->getMaskElt(0), 2) &&
+
+ return isUndefOrEqual(N->getMaskElt(0), 2) &&
isUndefOrEqual(N->getMaskElt(1), 3) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
isUndefOrEqual(N->getMaskElt(3), 3);
}
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
return false;
-
+
for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -2466,12 +2538,12 @@ bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT,
+static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
return false;
-
+
for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -2497,11 +2569,11 @@ bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
return false;
-
+
for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -2522,11 +2594,11 @@ bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
return false;
-
+
for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -2547,19 +2619,19 @@ bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (VT.getVectorElementType().getSizeInBits() < 32)
return false;
int NumElts = VT.getVectorNumElements();
-
+
if (!isUndefOrEqual(Mask[0], NumElts))
return false;
-
+
for (int i = 1; i < NumElts; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
-
+
return true;
}
@@ -2572,21 +2644,21 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: the lowest element must be the lowest element of vector 2,
/// and the other elements must come from vector 1 in order.
-static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
int NumOps = VT.getVectorNumElements();
if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
return false;
-
+
if (!isUndefOrEqual(Mask[0], 0))
return false;
-
+
for (int i = 1; i < NumOps; ++i)
if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
(V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
(V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
return false;
-
+
return true;
}
@@ -2650,7 +2722,7 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
int e = N->getValueType(0).getVectorNumElements() / 2;
-
+
for (int i = 0; i < e; ++i)
if (!isUndefOrEqual(N->getMaskElt(i), i))
return false;
@@ -2714,14 +2786,23 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
return Mask;
}
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+}
+
/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0);
+ EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> MaskVec;
-
+
for (unsigned i = 0; i != NumElems; ++i) {
int idx = SVOp->getMaskElt(i);
if (idx < 0)
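The rewrite this loop performs is mechanical: once the operands swap, an index that pointed into V1 must point into the new second operand and vice versa. A standalone sketch, assuming the usual -1-for-undef encoding:

#include <vector>

// Commute a shuffle mask over two N-wide operands.
void commuteMask(std::vector<int> &Mask) {
  int N = (int)Mask.size();
  for (int &Idx : Mask) {
    if (Idx < 0)
      continue;                        // undef stays undef
    Idx = Idx < N ? Idx + N : Idx - N; // V1 index <-> V2 index
  }
}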
@@ -2737,7 +2818,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) {
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElems; ++i) {
int idx = Mask[i];
@@ -2795,7 +2876,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
return false;
unsigned NumElems = Op->getValueType(0).getVectorNumElements();
-
+
if (NumElems != 2 && NumElems != 4)
return false;
for (unsigned i = 0, e = NumElems/2; i != e; ++i)
@@ -2820,17 +2901,8 @@ static bool isSplatVector(SDNode *N) {
return true;
}
-/// isZeroNode - Returns true if Elt is a constant zero or a floating point
-/// constant +0.0.
-static inline bool isZeroNode(SDValue Elt) {
- return ((isa<ConstantSDNode>(Elt) &&
- cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
- (isa<ConstantFPSDNode>(Elt) &&
- cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
-}
-
/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
-/// to an zero vector.
+/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
SDValue V1 = N->getOperand(0);
@@ -2842,13 +2914,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
unsigned Opc = V2.getOpcode();
if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
continue;
- if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems)))
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
return false;
} else if (Idx >= 0) {
unsigned Opc = V1.getOpcode();
if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
continue;
- if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx)))
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V1.getOperand(Idx)))
return false;
}
}
@@ -2857,7 +2931,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
@@ -2879,7 +2953,7 @@ static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
/// getOnesVector - Returns a vector of specified type with all bits set.
///
-static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
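At the instruction level an all-ones register is typically materialized by comparing a register with itself rather than by loading a constant; the user-level equivalent, assuming SSE2 intrinsics:

#include <emmintrin.h>

__m128i allOnes() {
  __m128i Z = _mm_setzero_si128();
  return _mm_cmpeq_epi32(Z, Z);  // 0 == 0 in every lane -> 0xFFFFFFFF per lane
}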
@@ -2897,13 +2971,13 @@ static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
/// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0);
+ EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
-
+
bool Changed = false;
SmallVector<int, 8> MaskVec;
SVOp->getMask(MaskVec);
-
+
for (unsigned i = 0; i != NumElems; ++i) {
if (MaskVec[i] > (int)NumElems) {
MaskVec[i] = NumElems;
@@ -2918,7 +2992,7 @@ static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
/// operation of the specified width.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -2929,7 +3003,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
}
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -2941,7 +3015,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
}
/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
unsigned Half = NumElems/2;
@@ -2954,13 +3028,13 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
}
/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
bool HasSSE2) {
if (SV->getValueType(0).getVectorNumElements() <= 4)
return SDValue(SV, 0);
-
- MVT PVT = MVT::v4f32;
- MVT VT = SV->getValueType(0);
+
+ EVT PVT = MVT::v4f32;
+ EVT VT = SV->getValueType(0);
DebugLoc dl = SV->getDebugLoc();
SDValue V1 = SV->getOperand(0);
int NumElems = VT.getVectorNumElements();
@@ -2976,7 +3050,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
}
NumElems >>= 1;
}
-
+
// Perform the splat.
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
@@ -2991,7 +3065,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
bool isZero, bool HasSSE2,
SelectionDAG &DAG) {
- MVT VT = V2.getValueType();
+ EVT VT = V2.getValueType();
SDValue V1 = isZero
? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
@@ -3016,7 +3090,7 @@ unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
continue;
}
SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
- if (Elt.getNode() && isZeroNode(Elt))
+ if (Elt.getNode() && X86::isZeroNode(Elt))
++NumZeros;
else
break;
@@ -3142,11 +3216,11 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// getVShift - Return a vector logical shift node.
///
-static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
bool isMMX = VT.getSizeInBits() == 64;
- MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+ EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
@@ -3171,9 +3245,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
}
- MVT VT = Op.getValueType();
- MVT EVT = VT.getVectorElementType();
- unsigned EVTBits = EVT.getSizeInBits();
+ EVT VT = Op.getValueType();
+ EVT ExtVT = VT.getVectorElementType();
+ unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumElems = Op.getNumOperands();
unsigned NumZero = 0;
@@ -3189,7 +3263,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
if (Elt.getOpcode() != ISD::Constant &&
Elt.getOpcode() != ISD::ConstantFP)
IsAllConstants = false;
- if (isZeroNode(Elt))
+ if (X86::isZeroNode(Elt))
NumZero++;
else {
NonZeros |= (1 << i);
@@ -3212,11 +3286,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
- if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
+ if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle MMX and SSE both.
- MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+ EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
// Truncate the value (which may itself be a constant) to i32, and
@@ -3234,7 +3308,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
Item = DAG.getVectorShuffle(VecVT, dl, Item,
- DAG.getUNDEF(Item.getValueType()),
+ DAG.getUNDEF(Item.getValueType()),
&Mask[0]);
}
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
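This single-insert path is legal because movd zero-extends into the full vector register, so an i64 whose top half is known zero needs only the 32-bit move. Assuming SSE2 intrinsics:

#include <emmintrin.h>
#include <stdint.h>

// Place a 32-bit value in lane 0 with all upper lanes cleared; viewed as
// v2i64 this is exactly <(uint64_t)X, 0>.
__m128i i64WithZeroHighHalf(uint32_t X) {
  return _mm_cvtsi32_si128((int)X);
}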
@@ -3248,15 +3322,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
if (Idx == 0) {
if (NumZero == 0) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 ||
- (EVT == MVT::i64 && Subtarget->is64Bit())) {
+ } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+ (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
DAG);
- } else if (EVT == MVT::i16 || EVT == MVT::i8) {
+ } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
+ EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
@@ -3266,7 +3340,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
- isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
@@ -3374,9 +3449,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
// If we have SSE 4.1, Expand into a number of inserts unless the number of
// values to be inserted is equal to the number of elements, in which case
// use the unpack code below in the hopes of matching the consecutive elts
- // load merge pattern for shuffles.
+ // load merge pattern for shuffles.
// FIXME: We could probably just check that here directly.
- if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
+ if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
getSubtarget()->hasSSE41()) {
V[0] = DAG.getUNDEF(VT);
for (unsigned i = 0; i < NumElems; ++i)
@@ -3457,7 +3532,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
}
  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
- // of the two input vectors, shuffle them into one input vector so only a
+ // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
@@ -3481,7 +3556,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
SmallVector<int, 8> MaskV;
MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
- NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+ NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
@@ -3506,7 +3581,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
int idx = MaskVals[i];
if (idx < 0)
continue;
- idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+ idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
if ((idx != i) && idx < 4)
pshufhw = false;
if ((idx != i) && idx > 3)
@@ -3521,19 +3596,19 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
- return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+ return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
}
}
-
+
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
-
+
// If we have elements from both input vectors, set the high bit of the
- // shuffle mask element to zero out elements that come from V2 in the V1
+ // shuffle mask element to zero out elements that come from V2 in the V1
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
@@ -3548,12 +3623,12 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
}
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
- V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
-
+
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
@@ -3568,7 +3643,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
}
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
- V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
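Both shuffles above lean on the pshufb semantics: each result byte selects a source byte by index, and a mask byte with its high bit set forces that result byte to zero, which is what lets the two per-input results be OR'd together. A scalar model of the instruction:

#include <stdint.h>

void pshufbModel(const uint8_t Src[16], const uint8_t Mask[16],
                 uint8_t Dst[16]) {
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Mask[i] & 0x80) ? 0 : Src[Mask[i] & 15];  // 0x80.. zeroes the byte
}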
@@ -3597,7 +3672,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
}
-
+
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
@@ -3619,7 +3694,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
}
-
+
  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has a word
// from each of the four input quadwords, calculate the InOrder bitvector now
// before falling through to the insert/extract cleanup.
@@ -3629,7 +3704,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
if (MaskVals[i] < 0 || MaskVals[i] == i)
InOrder.set(i);
}
-
+
// The other elements are put in the right place using pextrw and pinsrw.
for (unsigned i = 0; i != 8; ++i) {
if (InOrder[i])
@@ -3660,9 +3735,9 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 16> MaskVals;
SVOp->getMask(MaskVals);
-
+
// If we have SSSE3, case 1 is generated when all result bytes come from
- // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
+ // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
bool V1Only = true;
@@ -3676,13 +3751,13 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
else
V1Only = false;
}
-
+
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
-
+
// If all result elements are from one input vector, then only translate
- // undef mask values to 0x80 (zero out result) in the pshufb mask.
+ // undef mask values to 0x80 (zero out result) in the pshufb mask.
//
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
@@ -3705,7 +3780,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return V1;
-
+
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
@@ -3722,7 +3797,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
MVT::v16i8, &pshufbMask[0], 16));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}
-
+
  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
// the 16 different words that comprise the two doublequadword input vectors.
@@ -3732,17 +3807,17 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
-
+
// This word of the result is all undef, skip it.
if (Elt0 < 0 && Elt1 < 0)
continue;
-
+
// This word of the result is already in the correct place, skip it.
if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
continue;
if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
continue;
-
+
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
SDValue InsElt;
@@ -3801,15 +3876,15 @@ static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG,
TargetLowering &TLI, DebugLoc dl) {
- MVT VT = SVOp->getValueType(0);
+ EVT VT = SVOp->getValueType(0);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
unsigned NumElems = VT.getVectorNumElements();
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
- MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
- MVT MaskEltVT = MaskVT.getVectorElementType();
- MVT NewVT = MaskVT;
- switch (VT.getSimpleVT()) {
+ EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+ EVT MaskEltVT = MaskVT.getVectorElementType();
+ EVT NewVT = MaskVT;
+ switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Unexpected!");
case MVT::v4f32: NewVT = MVT::v2f64; break;
case MVT::v4i32: NewVT = MVT::v2i64; break;
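The rewrite is only sound when each aligned pair of mask entries addresses one consecutive even/odd pair of the old elements, so the shuffle survives the halving of the element count. A sketch of that pair check (illustrative code, not the routine itself):

#include <vector>

// Returns true and fills NarrowMask if a 2N-element mask can be expressed
// as an N-element mask on double-width elements; -1 encodes undef.
bool narrowShuffleMask(const std::vector<int> &Mask,
                       std::vector<int> &NarrowMask) {
  NarrowMask.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int A = Mask[i], B = Mask[i + 1];
    if (A < 0 && B < 0) { NarrowMask.push_back(-1); continue; }
    int Lead = (A >= 0) ? A : B - 1;       // first index of the aligned pair
    if (Lead < 0 || (Lead & 1))
      return false;                        // pair must start on an even index
    if ((A >= 0 && A != Lead) || (B >= 0 && B != Lead + 1))
      return false;                        // both halves from the same pair
    NarrowMask.push_back(Lead / 2);
  }
  return true;
}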
@@ -3849,7 +3924,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
/// getVZextMovL - Return a zero-extending vector move low node.
///
-static SDValue getVZextMovL(MVT VT, MVT OpVT,
+static SDValue getVZextMovL(EVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, DebugLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -3859,11 +3934,11 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
if (!LD) {
// movssrr and movsdrr do not clear top bits. Try to use movd, movq
// instead.
- MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
- if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+ MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+ if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
- SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+ SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
// PR2108
OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
@@ -3889,8 +3964,8 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- MVT VT = SVOp->getValueType(0);
-
+ EVT VT = SVOp->getValueType(0);
+
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
@@ -3926,7 +4001,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
SmallVector<int, 8> Mask2(4U, -1);
-
+
for (unsigned i = 0; i != 4; ++i) {
if (Locs[i].first == -1)
continue;
@@ -4036,7 +4111,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
bool isMMX = VT.getSizeInBits() == 64;
@@ -4050,7 +4125,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// Promote splats to v4f32.
if (SVOp->isSplat()) {
- if (isMMX || NumElems < 4)
+ if (isMMX || NumElems < 4)
return Op;
return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
}
@@ -4079,10 +4154,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
DAG, Subtarget, dl);
}
}
-
+
if (X86::isPSHUFDMask(SVOp))
return Op;
-
+
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
@@ -4092,11 +4167,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
- MVT EVT = VT.getVectorElementType();
- ShAmt *= EVT.getSizeInBits();
+ EVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
}
-
+
if (X86::isMOVLMask(SVOp)) {
if (V1IsUndef)
return V2;
@@ -4105,7 +4180,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (!isMMX)
return Op;
}
-
+
// FIXME: fold these into legal mask.
if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
X86::isMOVSLDUPMask(SVOp) ||
@@ -4120,11 +4195,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isShift) {
// No better options. Use a vshl / vsrl.
- MVT EVT = VT.getVectorElementType();
- ShAmt *= EVT.getSizeInBits();
+ EVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
}
-
+
bool Commuted = false;
  // FIXME: Should this also accept a bitcast of a splat? Be careful not to
  // accept a 1,1,1,1 -> v8i16 splat, though.
@@ -4144,7 +4219,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
// Shuffling low element of v1 into undef, just return v1.
- if (V2IsUndef)
+ if (V2IsUndef)
return V1;
// If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
// the instruction selector will not match, so get a canonical MOVL with
@@ -4196,7 +4271,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SVOp->getMask(PermMask);
if (isShuffleMaskLegal(PermMask, VT))
return Op;
-
+
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
@@ -4209,7 +4284,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (NewOp.getNode())
return NewOp;
}
-
+
// Handle all 4 wide cases with a number of shuffles except for MMX.
if (NumElems == 4 && !isMMX)
return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
@@ -4220,7 +4295,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
@@ -4283,7 +4358,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
return Res;
}
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
@@ -4296,21 +4371,21 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
MVT::v4i32, Vec),
Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
- MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
+ EVT EltVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1);
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
Op.getOperand(0), Op.getOperand(1));
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
} else if (VT.getSizeInBits() == 32) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
if (Idx == 0)
return Op;
-
+
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { Idx, -1, -1, -1 };
- MVT VVT = Op.getOperand(0).getValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ EVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0));
@@ -4326,8 +4401,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- MVT VVT = Op.getOperand(0).getValueType();
- SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ EVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0));
@@ -4338,18 +4413,18 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
- MVT VT = Op.getValueType();
- MVT EVT = VT.getVectorElementType();
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
DebugLoc dl = Op.getDebugLoc();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
+ if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
isa<ConstantSDNode>(N2)) {
- unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
- : X86ISD::PINSRW;
+ unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
+ : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument.
if (N1.getValueType() != MVT::i32)
@@ -4357,7 +4432,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
- } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into these
// bits. For example (insert (extract, 3), 2) could be matched by putting
@@ -4367,24 +4442,25 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+ // Create this as a scalar-to-vector operation.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if (EVT == MVT::i32) {
- // InsertPS works with constant index.
- if (isa<ConstantSDNode>(N2))
- return Op;
+ } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
+ // PINSR* works with a constant index.
+ return Op;
}
return SDValue();
}
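The INSERTPS immediate the SSE4.1 path fills in has a fixed layout: bits [7:6] select the source lane, bits [5:4] the destination lane, and bits [3:0] are the zero mask — which is why the code above shifts the insertion index left by 4. A sketch of the encoding, assuming that layout:

#include <stdint.h>

uint8_t insertpsImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  return (uint8_t)(((SrcLane & 3) << 6) |  // bits [7:6]: source select
                   ((DstLane & 3) << 4) |  // bits [5:4]: destination select
                   (ZeroMask & 0xF));      // bits [3:0]: lanes forced to 0.0
}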
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
- MVT EVT = VT.getVectorElementType();
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
if (Subtarget->hasSSE41())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
- if (EVT == MVT::i8)
+ if (EltVT == MVT::i8)
return SDValue();
DebugLoc dl = Op.getDebugLoc();
@@ -4392,7 +4468,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+ if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
// as its second argument.
if (N1.getValueType() != MVT::i32)
@@ -4413,9 +4489,12 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
Op.getOperand(0))));
+ if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- MVT VT = MVT::v2i32;
- switch (Op.getValueType().getSimpleVT()) {
+ EVT VT = MVT::v2i32;
+ switch (Op.getValueType().getSimpleVT().SimpleTy) {
default: break;
case MVT::v16i8:
case MVT::v8i16:
@@ -4435,21 +4514,21 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-
+
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- if (Subtarget->isPICStyleStub())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleRIPRel() &&
- getTargetMachine().getCodeModel() == CodeModel::Small)
- WrapperKind = X86ISD::WrapperRIP;
- }
-
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
CP->getAlignment(),
CP->getOffset(), OpFlag);
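The same wrapper/flag selection now appears verbatim for constant pools, jump tables, and external symbols; condensed, the decision tree is (illustrative names and flag values, not the LLVM API):

enum PICStyle { RIPRel, GOT, StubPIC, NonPIC };
enum WrapperKindModel { Plain, RIP };

struct RefKind { WrapperKindModel Wrap; unsigned char OpFlag; };

RefKind classifyLocalReference(PICStyle Style, bool SmallOrKernelCM) {
  if (Style == RIPRel && SmallOrKernelCM)
    return {RIP, 0};        // x86-64: fold into RIP-relative addressing
  if (Style == GOT)
    return {Plain, 1};      // ELF PIC: MO_GOTOFF, offset from the GOT base
  if (Style == StubPIC)
    return {Plain, 2};      // Darwin PIC: MO_PIC_BASE_OFFSET from $g
  return {Plain, 0};        // direct reference, no flag
}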
@@ -4468,25 +4547,26 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-
+
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- if (Subtarget->isPICStyleStub())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleRIPRel())
- WrapperKind = X86ISD::WrapperRIP;
- }
-
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
OpFlag);
DebugLoc DL = JT->getDebugLoc();
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
-
+
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
@@ -4494,43 +4574,44 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
DebugLoc::getUnknownLoc(), getPointerTy()),
Result);
}
-
+
return Result;
}
SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
-
+
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- if (Subtarget->isPICStyleStub())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleRIPRel())
- WrapperKind = X86ISD::WrapperRIP;
- }
-
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
-
+
DebugLoc DL = Op.getDebugLoc();
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
-
-
+
+
// With PIC, the address is actually $g + Offset.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
- !Subtarget->isPICStyleRIPRel()) {
+ !Subtarget->is64Bit()) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
DebugLoc::getUnknownLoc(),
getPointerTy()),
Result);
}
-
+
return Result;
}
@@ -4538,53 +4619,37 @@ SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
int64_t Offset,
SelectionDAG &DAG) const {
- bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
- bool ExtraLoadRequired =
- Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);
-
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+ CodeModel::Model M = getTargetMachine().getCodeModel();
SDValue Result;
- if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ // A direct static reference to a global.
Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
Offset = 0;
} else {
- unsigned char OpFlags = 0;
-
- if (Subtarget->isPICStyleRIPRel() &&
- getTargetMachine().getRelocationModel() != Reloc::Static) {
- if (ExtraLoadRequired)
- OpFlags = X86II::MO_GOTPCREL;
- } else if (Subtarget->isPICStyleGOT() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- if (ExtraLoadRequired)
- OpFlags = X86II::MO_GOT;
- else
- OpFlags = X86II::MO_GOTOFF;
- }
-
Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
}
-
+
if (Subtarget->isPICStyleRIPRel() &&
- getTargetMachine().getCodeModel() == CodeModel::Small)
+ (M == CodeModel::Small || M == CodeModel::Kernel))
Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
else
Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
- if (IsPic && !Subtarget->isPICStyleRIPRel()) {
+ if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
Result);
}
- // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
- // load the value at address GV, not the value of GV itself. This means that
- // the GlobalAddress must be in the base or index register of the address, not
- // the GV offset field. Platform check is inside GVRequiresExtraLoad() call
- // The same applies for external symbols during PIC codegen
- if (ExtraLoadRequired)
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
PseudoSourceValue::getGOT(), 0);
@@ -4606,7 +4671,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
- SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg,
+ SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags) {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
DebugLoc dl = GA->getDebugLoc();
@@ -4628,7 +4693,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const MVT PtrVT) {
+ const EVT PtrVT) {
SDValue InFlag;
DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
@@ -4643,7 +4708,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const MVT PtrVT) {
+ const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
@@ -4651,7 +4716,7 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const MVT PtrVT, TLSModel::Model model,
+ const EVT PtrVT, TLSModel::Model model,
bool is64Bit) {
DebugLoc dl = GA->getDebugLoc();
// Get the Thread Pointer
@@ -4677,7 +4742,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
assert(model == TLSModel::InitialExec);
OperandFlags = X86II::MO_INDNTPOFF;
}
-
+
// emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
// exec)
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
@@ -4701,29 +4766,29 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
"TLS not implemented for non-ELF targets");
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
-
+
// If GV is an alias then use the aliasee for determining
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
GV = GA->resolveAliasedGlobal(false);
-
+
TLSModel::Model model = getTLSModel(GV,
getTargetMachine().getRelocationModel());
-
+
switch (model) {
case TLSModel::GeneralDynamic:
case TLSModel::LocalDynamic: // not implemented
if (Subtarget->is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
+
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
Subtarget->is64Bit());
}
-
- assert(0 && "Unreachable");
+
+ llvm_unreachable("Unreachable");
return SDValue();
}
@@ -4732,17 +4797,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- SDValue Tmp1 = isSRA ?
- DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, MVT::i8)) :
- DAG.getConstant(0, VT);
+ SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i8))
+ : DAG.getConstant(0, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
@@ -4754,9 +4818,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
}
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
- DAG.getConstant(VTBits, MVT::i8));
+ DAG.getConstant(VTBits, MVT::i8));
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
- AndNode, DAG.getConstant(0, MVT::i8));
+ AndNode, DAG.getConstant(0, MVT::i8));
SDValue Hi, Lo;
SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
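The CMP of AndNode against zero implements the amt & 32 test that x86's shld/shrd sequences need: the double-shift pair is only correct for amounts below the word size, and the conditional move picks the shift-by-32-or-more case. A C model of the SHL_PARTS expansion for 32-bit halves:

#include <stdint.h>

void shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                uint32_t &OutLo, uint32_t &OutHi) {
  unsigned A = Amt & 31;
  uint32_t T2 = (Hi << A) | (A ? Lo >> (32 - A) : 0);  // shld hi, lo, amt
  uint32_t T3 = Lo << A;                               // shl  lo, amt
  if (Amt & 32) { OutHi = T3; OutLo = 0; }             // shifted past 32 bits
  else          { OutHi = T2; OutLo = T3; }
}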
@@ -4776,7 +4840,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
- MVT SrcVT = Op.getOperand(0).getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
@@ -4808,7 +4872,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
-SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain,
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) {
// Build the FILD
@@ -4888,19 +4952,22 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
*/
DebugLoc dl = Op.getDebugLoc();
+ LLVMContext *Context = DAG.getContext();
// Build some magic constants.
std::vector<Constant*> CV0;
- CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
- CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
- CV0.push_back(ConstantInt::get(APInt(32, 0)));
- CV0.push_back(ConstantInt::get(APInt(32, 0)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
Constant *C0 = ConstantVector::get(CV0);
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
std::vector<Constant*> CV1;
- CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
- CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
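These constants are the classic bit trick for u64-to-f64: 0x43300000... is 2^52 and 0x45300000... is 2^84 as doubles, so OR'ing the low/high 32-bit halves into their mantissas yields the exact doubles 2^52+lo and 2^84+hi*2^32, and the subtract-then-add recovers the value with a single rounding. A scalar model:

#include <cstdint>
#include <cstring>

double u64ToDouble(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xFFFFFFFFULL); // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52);  // exact until the final add
}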
@@ -4965,7 +5032,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
- MVT DestVT = Op.getValueType();
+ EVT DestVT = Op.getValueType();
if (DestVT.bitsLT(MVT::f64)) {
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
@@ -4988,7 +5055,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
- MVT SrcVT = N0.getValueType();
+ EVT SrcVT = N0.getValueType();
if (SrcVT == MVT::i64) {
// We only handle SSE2 f64 target here; caller can expand the rest.
if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
@@ -5017,7 +5084,7 @@ std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
DebugLoc dl = Op.getDebugLoc();
- MVT DstTy = Op.getValueType();
+ EVT DstTy = Op.getValueType();
if (!IsSigned) {
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -5043,10 +5110,10 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-
+
unsigned Opc;
- switch (DstTy.getSimpleVT()) {
- default: assert(0 && "Invalid FP_TO_SINT to lower!");
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
@@ -5105,18 +5172,19 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
+ LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
- MVT EltVT = VT;
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT;
if (VT.isVector())
EltVT = VT.getVectorElementType();
std::vector<Constant*> CV;
if (EltVT == MVT::f64) {
- Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
CV.push_back(C);
CV.push_back(C);
} else {
- Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31))));
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
CV.push_back(C);
CV.push_back(C);
CV.push_back(C);
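The constant-pool vectors built here (and in LowerFNEG below) are plain sign-bit masks: FABS becomes an AND with ~(sign bit) and FNEG an XOR with the sign bit. A scalar f64 model of both:

#include <cstdint>
#include <cstring>

double fabsViaMask(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B &= ~(1ULL << 63);              // clear the sign bit (andpd)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}

double fnegViaMask(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B ^= 1ULL << 63;                 // flip the sign bit (xorpd)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}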
@@ -5131,21 +5199,19 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+ LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
- MVT EltVT = VT;
- unsigned EltNum = 1;
- if (VT.isVector()) {
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT;
+ if (VT.isVector())
EltVT = VT.getVectorElementType();
- EltNum = VT.getVectorNumElements();
- }
std::vector<Constant*> CV;
if (EltVT == MVT::f64) {
- Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63)));
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
CV.push_back(C);
CV.push_back(C);
} else {
- Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31)));
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
CV.push_back(C);
CV.push_back(C);
CV.push_back(C);
@@ -5168,11 +5234,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ LLVMContext *Context = DAG.getContext();
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
DebugLoc dl = Op.getDebugLoc();
- MVT VT = Op.getValueType();
- MVT SrcVT = Op1.getValueType();
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Op1.getValueType();
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
@@ -5191,13 +5258,13 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// First get the sign bit of second operand.
std::vector<Constant*> CV;
if (SrcVT == MVT::f64) {
- CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63))));
- CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
} else {
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
}
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
@@ -5220,13 +5287,13 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// Clear first operand sign bit.
CV.clear();
if (VT == MVT::f64) {
- CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))));
- CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
} else {
- CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
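FCOPYSIGN then composes the two masks: keep the magnitude bits of the first operand and OR in the sign bit of the second. The scalar equivalent:

#include <cstdint>
#include <cstring>

double copysignViaMask(double Mag, double Sgn) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sgn, sizeof(S));
  uint64_t R = (M & ~(1ULL << 63)) | (S & (1ULL << 63));
  std::memcpy(&Mag, &R, sizeof(R));
  return Mag;
}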
@@ -5299,21 +5366,48 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
Opcode = X86ISD::ADD;
NumOperands = 2;
break;
+ case ISD::AND: {
+ // If the primary result of the 'and' isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ bool NonFlagUse = false;
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::BRCOND &&
+ UI->getOpcode() != ISD::SELECT &&
+ UI->getOpcode() != ISD::SETCC) {
+ NonFlagUse = true;
+ break;
+ }
+ if (!NonFlagUse)
+ break;
+ }
+ // FALL THROUGH
case ISD::SUB:
- // Due to the ISEL shortcoming noted above, be conservative if this sub is
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
// likely to be selected as part of a load-modify-store instruction.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() == ISD::STORE)
goto default_case;
- // Otherwise use a regular EFLAGS-setting sub.
- Opcode = X86ISD::SUB;
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (Op.getNode()->getOpcode()) {
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::OR: Opcode = X86ISD::OR; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ default: llvm_unreachable("unexpected operator!");
+ }
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::INC:
case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
return SDValue(Op.getNode(), 1);
default:
default_case:
@@ -5419,14 +5513,14 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
DebugLoc dl = Op.getDebugLoc();
if (isFP) {
unsigned SSECC = 8;
- MVT VT0 = Op0.getValueType();
+ EVT VT0 = Op0.getValueType();
assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
@@ -5469,7 +5563,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
}
- assert(0 && "Illegal FP comparison");
+ llvm_unreachable("Illegal FP comparison");
}
// Handle all other FP comparisons here.
return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
@@ -5481,10 +5575,13 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
bool Swap = false, Invert = false, FlipSigns = false;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: break;
+ case MVT::v8i8:
case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::v4i16:
case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::v2i32:
case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
}
@@ -5508,7 +5605,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
- MVT EltVT = VT.getVectorElementType();
+ EVT EltVT = VT.getVectorElementType();
SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
EltVT);
std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
@@ -5538,7 +5635,10 @@ static bool isX86LogicalCmp(SDValue Op) {
Opc == X86ISD::SMUL ||
Opc == X86ISD::UMUL ||
Opc == X86ISD::INC ||
- Opc == X86ISD::DEC))
+ Opc == X86ISD::DEC ||
+ Opc == X86ISD::OR ||
+ Opc == X86ISD::XOR ||
+ Opc == X86ISD::AND))
return true;
return false;
@@ -5560,7 +5660,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
@@ -5751,8 +5851,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Flag;
- MVT IntPtr = getPointerTy();
- MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+ EVT IntPtr = getPointerTy();
+ EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
@@ -5802,8 +5902,8 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
if (const char *bzeroEntry = V &&
V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
- MVT IntPtr = getPointerTy();
- const Type *IntPtrTy = TD->getIntPtrType();
+ EVT IntPtr = getPointerTy();
+ const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
@@ -5812,8 +5912,9 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
Entry.Node = Size;
Args.push_back(Entry);
std::pair<SDValue,SDValue> CallResult =
- LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
- 0, CallingConv::C, false,
+ LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, false, /*isReturnValueUsed=*/false,
DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
return CallResult.second;
}
@@ -5824,7 +5925,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
uint64_t SizeVal = ConstantSize->getZExtValue();
SDValue InFlag(0, 0);
- MVT AVT;
+ EVT AVT;
SDValue Count;
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
unsigned BytesLeft = 0;
@@ -5893,7 +5994,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
if (TwoRepStos) {
InFlag = Chain.getValue(1);
Count = Size;
- MVT CVT = Count.getValueType();
+ EVT CVT = Count.getValueType();
SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
@@ -5909,8 +6010,8 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
} else if (BytesLeft) {
// Handle the last 1 - 7 bytes.
unsigned Offset = SizeVal - BytesLeft;
- MVT AddrVT = Dst.getValueType();
- MVT SizeVT = Size.getValueType();
+ EVT AddrVT = Dst.getValueType();
+ EVT SizeVT = Size.getValueType();
Chain = DAG.getMemset(Chain, dl,
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
@@ -5945,7 +6046,7 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
return SDValue();
// DWORD aligned
- MVT AVT = MVT::i32;
+ EVT AVT = MVT::i32;
if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
AVT = MVT::i64;
@@ -5980,9 +6081,9 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
if (BytesLeft) {
// Handle the last 1 - 7 bytes.
unsigned Offset = SizeVal - BytesLeft;
- MVT DstVT = Dst.getValueType();
- MVT SrcVT = Src.getValueType();
- MVT SizeVT = Size.getValueType();
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ EVT SizeVT = Size.getValueType();
Results.push_back(DAG.getMemcpy(Chain, dl,
DAG.getNode(ISD::ADD, dl, DstVT, Dst,
DAG.getConstant(Offset, DstVT)),
@@ -6054,8 +6155,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
SDValue SrcPtr = Op.getOperand(1);
SDValue SrcSV = Op.getOperand(2);
- assert(0 && "VAArgInst is not yet implemented for x86-64!");
- abort();
+ llvm_report_error("VAArgInst is not yet implemented for x86-64!");
return SDValue();
}
@@ -6179,6 +6279,36 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ // ptest intrinsics. The intrinsics these come from are designed to return
+ // an integer value, not just an instruction, so lower them to the ptest
+ // pattern and a setcc for the result.
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc: {
+ unsigned X86CC = 0;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_sse41_ptestz:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse41_ptestc:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse41_ptestnzc:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
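
For reference, the flag semantics the new ptest lowering relies on, as a
scalar model (illustrative; the lowering itself just builds X86ISD::PTEST
followed by X86ISD::SETCC, as above):

    #include <cstdint>

    struct PTestFlags { bool ZF, CF; };

    // PTEST sets ZF when (LHS & RHS) is all zeros and CF when
    // (~LHS & RHS) is all zeros; ptestz/ptestc/ptestnzc then read
    // COND_E (ZF), COND_B (CF), and COND_A (ZF==0 && CF==0).
    PTestFlags ptest128(const uint64_t L[2], const uint64_t R[2]) {
      PTestFlags F;
      F.ZF = ((L[0] & R[0]) | (L[1] & R[1])) == 0;
      F.CF = ((~L[0] & R[0]) | (~L[1] & R[1])) == 0;
      return F;
    }
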
@@ -6203,7 +6333,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return SDValue();
unsigned NewIntNo = 0;
- MVT ShAmtVT = MVT::v4i32;
+ EVT ShAmtVT = MVT::v4i32;
switch (IntNo) {
case Intrinsic::x86_sse2_pslli_w:
NewIntNo = Intrinsic::x86_sse2_psll_w;
@@ -6256,14 +6386,28 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_mmx_psrai_d:
NewIntNo = Intrinsic::x86_mmx_psra_d;
break;
- default: abort(); // Can't reach here.
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
break;
}
}
- MVT VT = Op.getValueType();
- ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
+
+ // The vector shift intrinsics with scalar shift amounts use 32-bit
+ // values, but the SSE2/MMX shift instructions read 64 bits. Set the
+ // upper 32 bits to zero.
+ SDValue ShOps[4];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, MVT::i32);
+ if (ShAmtVT == MVT::v4i32) {
+ ShOps[2] = DAG.getUNDEF(MVT::i32);
+ ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
+ } else {
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+ }
+
+ EVT VT = Op.getValueType();
+ ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(NewIntNo, MVT::i32),
Op.getOperand(1), ShAmt);
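
The zeroing matters because the shift units consume a wider count than the
intrinsic supplies; one lane's behavior, modeled on scalars (assuming a
logical shift):

    #include <cstdint>

    // The SSE2/MMX shift instructions read the full 64-bit count and
    // zero the lane once the count reaches the lane width, so garbage
    // in bits 63:32 would wipe the result; hence <amt, 0, ...> above.
    uint16_t shl_lane(uint16_t lane, uint64_t count) {
      return count >= 16 ? 0 : (uint16_t)(lane << count);
    }
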
@@ -6295,7 +6439,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
@@ -6401,12 +6545,12 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
- unsigned CC = Func->getCallingConv();
+ CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
@@ -6428,8 +6572,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
if (InRegCount > 2) {
- cerr << "Nest register in use - reduce number of inreg parameters!\n";
- abort();
+ llvm_report_error("Nest register in use - reduce number of inreg parameters!");
}
}
break;
@@ -6499,7 +6642,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
const TargetMachine &TM = MF.getTarget();
const TargetFrameInfo &TFI = *TM.getFrameInfo();
unsigned StackAlignment = TFI.getStackAlignment();
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
// Save FP Control Word to stack slot
@@ -6537,8 +6680,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
- MVT OpVT = VT;
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
@@ -6570,8 +6713,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
- MVT OpVT = VT;
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
@@ -6599,7 +6742,7 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType();
+ EVT VT = Op.getValueType();
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
DebugLoc dl = Op.getDebugLoc();
@@ -6656,7 +6799,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
switch (Op.getOpcode()) {
- default: assert(0 && "Unknown ovf instruction!");
+ default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
// A subtract of one will be selected as an INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
@@ -6712,11 +6855,11 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
- MVT T = Op.getValueType();
+ EVT T = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned Reg = 0;
unsigned size = 0;
- switch(T.getSimpleVT()) {
+ switch(T.getSimpleVT().SimpleTy) {
default:
assert(false && "Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
@@ -6763,7 +6906,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
DebugLoc dl = Node->getDebugLoc();
- MVT T = Node->getValueType(0);
+ EVT T = Node->getValueType(0);
SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
DAG.getConstant(0, T), Node->getOperand(2));
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
@@ -6778,7 +6921,7 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
- default: assert(0 && "Should not custom lower this!");
+ default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
@@ -6805,9 +6948,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
- case ISD::CALL: return LowerCALL(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
@@ -6836,7 +6976,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) {
- MVT T = Node->getValueType(0);
+ EVT T = Node->getValueType(0);
DebugLoc dl = Node->getDebugLoc();
assert (T == MVT::i64 && "Only know how to expand i64 atomics");
@@ -6846,12 +6986,11 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
Node->getOperand(2), DAG.getIntPtrConstant(0));
SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
Node->getOperand(2), DAG.getIntPtrConstant(1));
- // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
- // have a MemOperand. Pass the info through as a normal operand.
- SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
- SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
+ SDValue Ops[] = { Chain, In1, In2L, In2H };
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
- SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
+ SDValue Result =
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
+ cast<MemSDNode>(Node)->getMemOperand());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
Results.push_back(Result.getValue(2));
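
The expansion splits the i64 operand into 32-bit halves and pairs the two
result halves back together; as plain arithmetic, a sketch of what
EXTRACT_ELEMENT 0/1 and BUILD_PAIR compute:

    #include <cstdint>

    uint32_t lo32(uint64_t v) { return (uint32_t)v; }
    uint32_t hi32(uint64_t v) { return (uint32_t)(v >> 32); }
    // On 32-bit x86 the paired halves ultimately feed a LOCK CMPXCHG8B
    // loop, whose two 32-bit results are rejoined like this:
    uint64_t pair64(uint32_t lo, uint32_t hi) {
      return ((uint64_t)hi << 32) | lo;
    }
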
@@ -6872,7 +7011,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
FP_TO_INTHelper(SDValue(N, 0), DAG, true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
if (FIST.getNode() != 0) {
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
// Return a load from the stack slot.
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
}
@@ -6893,7 +7032,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_CMP_SWAP: {
- MVT T = N->getValueType(0);
+ EVT T = N->getValueType(0);
assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
@@ -6969,7 +7108,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::TAILCALL: return "X86ISD::TAILCALL";
case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
@@ -7027,7 +7165,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UMUL: return "X86ISD::UMUL";
case X86ISD::INC: return "X86ISD::INC";
case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::OR: return "X86ISD::OR";
+ case X86ISD::XOR: return "X86ISD::XOR";
+ case X86ISD::AND: return "X86ISD::AND";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::PTEST: return "X86ISD::PTEST";
+ case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
}
}
@@ -7036,28 +7179,28 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
const Type *Ty) const {
// X86 supports extremely general addressing modes.
+ CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
- if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
return false;
if (AM.BaseGV) {
- // We can only fold this if we don't need an extra load.
- if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+ unsigned GVFlags =
+ Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+ // If a reference to this global requires an extra load, we can't fold it.
+ if (isGlobalStubReference(GVFlags))
return false;
- // If BaseGV requires a register, we cannot also have a BaseReg.
- if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
- AM.HasBaseReg)
+
+ // If BaseGV requires a register for the PIC base, we cannot also have a
+ // BaseReg specified.
+ if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
- // X86-64 only supports addr of globals in small code model.
- if (Subtarget->is64Bit()) {
- if (getTargetMachine().getCodeModel() != CodeModel::Small)
- return false;
- // If lower 4G is not available, then we must use rip-relative addressing.
- if (AM.BaseOffs || AM.Scale > 1)
- return false;
- }
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ return false;
}
switch (AM.Scale) {
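
As context for these checks, the mode being legalized is the full x86 form
Base + Index*Scale + Disp, with Scale in {1, 2, 4, 8} and Disp a
sign-extended 32-bit immediate. A hypothetical example that folds into a
single such mode:

    // Becomes "movl 16(%rdi,%rsi,4), %eax" on x86-64: base in %rdi,
    // index in %rsi, scale 4, displacement 16.
    int load_elem(const int *base, long i) {
      return base[i + 4];
    }
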
@@ -7094,7 +7237,7 @@ bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
return Subtarget->is64Bit() || NumBits1 < 64;
}
-bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
@@ -7106,15 +7249,16 @@ bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
- return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
+ return Ty1 == Type::getInt32Ty(Ty1->getContext()) &&
+ Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit();
}
-bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}
-bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
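
Both facts here are visible in generated code. Any write to a 32-bit
register clears bits 63:32, so the i32 -> i64 zero-extension really is
free (hypothetical function):

    #include <cstdint>

    // Compiles to a lone "movl %edi, %eax"; the 32-bit move itself
    // performs the zero-extension.
    uint64_t widen(uint32_t x) {
      return x;
    }

Likewise, the i16 form of an instruction carries the 0x66 operand-size
prefix and is a byte longer than its i32 counterpart, which is why
narrowing to i16 is reported as unprofitable.
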
@@ -7124,8 +7268,8 @@ bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
-X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
- MVT VT) const {
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
// Only do shuffles on 128-bit vector types for now.
if (VT.getSizeInBits() == 64)
return false;
@@ -7146,7 +7290,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- MVT VT) const {
+ EVT VT) const {
unsigned NumElts = VT.getVectorNumElements();
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
@@ -7254,7 +7398,8 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
(*MIB).addOperand(*argOpers[i]);
MIB.addReg(t2);
assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+ (*MIB).setMemRefs(bInstr->memoperands_begin(),
+ bInstr->memoperands_end());
MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
MIB.addReg(EAXreg);
@@ -7406,7 +7551,8 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
(*MIB).addOperand(*argOpers[i]);
assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+ (*MIB).setMemRefs(bInstr->memoperands_begin(),
+ bInstr->memoperands_end());
MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
MIB.addReg(X86::EAX);
@@ -7450,7 +7596,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
F->insert(MBBIter, newMBB);
F->insert(MBBIter, nextMBB);
- // Move all successors to thisMBB to nextMBB
+ // Move all successors of thisMBB to nextMBB
nextMBB->transferSuccessors(thisMBB);
// Update thisMBB to fall through to newMBB
@@ -7510,7 +7656,8 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
(*MIB).addOperand(*argOpers[i]);
MIB.addReg(t3);
assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
+ (*MIB).setMemRefs(mInstr->memoperands_begin(),
+ mInstr->memoperands_end());
MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
MIB.addReg(X86::EAX);
@@ -7522,70 +7669,190 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
return nextMBB;
}
-
+// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8,
+// all of this code can be replaced with that in the .td file.
MachineBasicBlock *
-X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned numArgs, bool memArg) const {
+
+ MachineFunction *F = BB->getParent();
DebugLoc dl = MI->getDebugLoc();
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ unsigned Opc;
+ if (memArg)
+ Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
+ else
+ Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
+
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
+
+ for (unsigned i = 0; i < numArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i+1);
+
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+
+ BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ F->DeleteMachineInstr(MI);
+
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit code to save XMM registers to the stack. The ABI says that the
+ // number of registers to save is given in %al, so it's theoretically
+ // possible to do an indirect jump trick to avoid saving all of them;
+ // however, this code takes a simpler approach and just executes all
+ // of the stores if %al is non-zero. It's less code, it's probably
+ // easier on the hardware branch predictor, and stores aren't all that
+ // expensive anyway.
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and one block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *F = MBB->getParent();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+ MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, XMMSaveMBB);
+ F->insert(MBBIter, EndMBB);
+
+ // Set up the CFG.
+ // Move any original successors of MBB to the end block.
+ EndMBB->transferSuccessors(MBB);
+ // The original block will now fall through to the XMM save block.
+ MBB->addSuccessor(XMMSaveMBB);
+ // The XMMSaveMBB will fall through to the end block.
+ XMMSaveMBB->addSuccessor(EndMBB);
+
+ // Now add the instructions.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned CountReg = MI->getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+ if (!Subtarget->isTargetWin64()) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+ BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ }
+
+ // In the XMM save block, save all the XMM argument registers.
+ for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+ MachineMemOperand *MMO =
+ F->getMachineMemOperand(
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+ MachineMemOperand::MOStore, Offset,
+ /*Size=*/16, /*Align=*/16);
+ BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI->getOperand(i).getReg())
+ .addMemOperand(MMO);
+ }
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+
+ return EndMBB;
+}
+
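
ABI background for the %al test above: on x86-64 SysV, callers of varargs
functions set %al to an upper bound on the number of vector registers
used, so the prologue built here can branch past the XMM spills when it
is zero. A sketch of the kind of callee this serves:

    #include <cstdarg>

    // A caller passing one double emits "movb $1, %al" before the call;
    // va_arg then reads the value back out of the register-save area
    // that the inserted stores populate.
    double sum(int n, ...) {
      va_list ap;
      va_start(ap, n);
      double s = 0;
      for (int i = 0; i < n; ++i)
        s += va_arg(ap, double);
      va_end(ap);
      return s;
    }
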
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by first adding all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
+ // Next, remove all successors of the current block, and add the true
+ // and fallthrough blocks as its successors.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+
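
The diamond corresponds to a plain conditional at the source level; SSE
registers have no register-to-register cmov, so pseudos like CMOV_FR64
must expand to real control flow (illustrative):

    // thisMBB branches on the condition, copy0MBB computes FalseValue,
    // and the PHI in sinkMBB merges the two, as diagrammed above.
    double fsel(bool c, double t, double f) {
      return c ? t : f;
    }
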
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
switch (MI->getOpcode()) {
default: assert(false && "Unexpected instr type to insert");
+ case X86::CMOV_GR8:
case X86::CMOV_V1I64:
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_V4F32:
case X86::CMOV_V2F64:
- case X86::CMOV_V2I64: {
- // To "insert" a SELECT_CC instruction, we actually have to insert the
- // diamond control-flow pattern. The incoming instruction knows the
- // destination vreg to set, the condition code register to branch on, the
- // true/false values to select between, and a branch opcode to use.
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
-
- // thisMBB:
- // ...
- // TrueVal = ...
- // cmpTY ccX, r1, r2
- // bCC copy1MBB
- // fallthrough --> copy0MBB
- MachineBasicBlock *thisMBB = BB;
- MachineFunction *F = BB->getParent();
- MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- unsigned Opc =
- X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
- BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
- F->insert(It, copy0MBB);
- F->insert(It, sinkMBB);
- // Update machine-CFG edges by transferring all successors of the current
- // block to the new block which will contain the Phi node for the select.
- sinkMBB->transferSuccessors(BB);
-
- // Add the true and fallthrough blocks as its successors.
- BB->addSuccessor(copy0MBB);
- BB->addSuccessor(sinkMBB);
-
- // copy0MBB:
- // %FalseValue = ...
- // # fallthrough to sinkMBB
- BB = copy0MBB;
-
- // Update machine-CFG edges
- BB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
- // ...
- BB = sinkMBB;
- BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
-
- F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
- return BB;
- }
+ case X86::CMOV_V2I64:
+ return EmitLoweredSelect(MI, BB, EM);
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
@@ -7596,33 +7863,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
MachineFunction *F = BB->getParent();
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
- addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
// Load the old value of the high byte of the control word...
unsigned OldCW =
F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
- addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
// Set the high part to be round to zero...
- addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
.addImm(0xC7F);
// Reload the modified control word now...
- addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
// Restore the memory image of control word to original value
- addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
.addReg(OldCW);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI->getOpcode()) {
- default: assert(0 && "illegal opcode!");
+ default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
@@ -7655,15 +7925,26 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
} else {
AM.Disp = Op.getImm();
}
- addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
+ addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
.addReg(MI->getOperand(X86AddrNumOperands).getReg());
// Reload the original control word now.
- addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
return BB;
}
+ // String/text processing lowering.
+ case X86::PCMPISTRM128REG:
+ return EmitPCMP(MI, BB, 3, /*memArg=*/false);
+ case X86::PCMPISTRM128MEM:
+ return EmitPCMP(MI, BB, 3, /*memArg=*/true);
+ case X86::PCMPESTRM128REG:
+ return EmitPCMP(MI, BB, 5, /*memArg=*/false);
+ case X86::PCMPESTRM128MEM:
+ return EmitPCMP(MI, BB, 5, /*memArg=*/true);
+
+ // Atomic Lowering.
case X86::ATOMAND32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
X86::AND32ri, X86::MOV32rm,
@@ -7825,6 +8106,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::MOV32rr, X86::MOV32rr,
X86::MOV32ri, X86::MOV32ri,
false);
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
}
}
@@ -7855,6 +8138,9 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
case X86ISD::UMUL:
case X86ISD::INC:
case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
// These nodes' second result is a boolean.
if (Op.getResNo() == 0)
break;
@@ -7891,7 +8177,7 @@ static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
}
static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
- MVT EVT, LoadSDNode *&LDBase,
+ EVT EltVT, LoadSDNode *&LDBase,
unsigned &LastLoadedElt,
SelectionDAG &DAG, MachineFrameInfo *MFI,
const TargetLowering &TLI) {
@@ -7919,7 +8205,7 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
+ if (!TLI.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i, MFI))
return false;
LastLoadedElt = i;
}
@@ -7935,8 +8221,8 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
DebugLoc dl = N->getDebugLoc();
- MVT VT = N->getValueType(0);
- MVT EVT = VT.getVectorElementType();
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
unsigned NumElems = VT.getVectorNumElements();
@@ -7947,7 +8233,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
LoadSDNode *LD = NULL;
unsigned LastLoadedElt;
- if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
+ if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
MFI, TLI))
return SDValue();
@@ -7976,57 +8262,159 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
-
- // If we have SSE[12] support, try to form min/max nodes.
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions have the peculiarity that if either operand is a NaN,
+ // they choose what we call the RHS operand (and as such are not symmetric).
+ // It happens that this matches the semantics of the common C idiom
+ // x<y?x:y and related forms, so we can recognize these cases.
if (Subtarget->hasSSE2() &&
(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
Cond.getOpcode() == ISD::SETCC) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
+ // Check for x CC y ? x : y.
if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
switch (CC) {
default: break;
- case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+ case ISD::SETULT:
+ // This can be a min if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(RHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(LHS))
+ break;
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETOLE:
+ // This can be a min if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(LHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(RHS))
+ break;
+ }
+ Opcode = X86ISD::FMIN;
+ break;
case ISD::SETULE:
- case ISD::SETLE:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
+ // This can be a min, but if either operand is a NaN we need it to
+ // preserve the original LHS.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
case ISD::SETLT:
+ case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
- case ISD::SETOGT: // (X > Y) ? X : Y -> max
+ case ISD::SETOGE:
+ // This can be a max if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(LHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(RHS))
+ break;
+ }
+ Opcode = X86ISD::FMAX;
+ break;
case ISD::SETUGT:
+ // This can be a max if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(RHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(LHS))
+ break;
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGE:
+ // This can be a max, but if either operand is a NaN we need it to
+ // preserve the original LHS.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
case ISD::SETGT:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
switch (CC) {
default: break;
- case ISD::SETOGT: // (X > Y) ? Y : X -> min
+ case ISD::SETOGE:
+ // This can be a min if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(RHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(LHS))
+ break;
+ }
+ Opcode = X86ISD::FMIN;
+ break;
case ISD::SETUGT:
+ // This can be a min if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(LHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(RHS))
+ break;
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGE:
+ // This can be a min, but if either operand is a NaN we need it to
+ // preserve the original LHS.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
case ISD::SETGT:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
- case ISD::SETOLE: // (X <= Y) ? Y : X -> max
+ case ISD::SETULT:
+ // This can be a max if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(LHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(RHS))
+ break;
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // This can be a max if we can prove that at least one of the operands
+ // is not a NaN.
+ if (!FiniteOnlyFPMath()) {
+ if (DAG.isKnownNeverNaN(RHS)) {
+ // Put the potential NaN in the RHS so that SSE will preserve it.
+ std::swap(LHS, RHS);
+ } else if (!DAG.isKnownNeverNaN(LHS))
+ break;
+ }
+ Opcode = X86ISD::FMAX;
+ break;
case ISD::SETULE:
- case ISD::SETLE:
- if (!UnsafeFPMath) break;
- // FALL THROUGH.
- case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
+ // This can be a max, but if either operand is a NaN we need it to
+ // preserve the original LHS.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
case ISD::SETLT:
+ case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
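
The asymmetry being exploited: MINSS/MINSD compute the equivalent of
"lhs < rhs ? lhs : rhs", so a NaN in either operand yields the RHS. A
scalar model of the instruction's semantics:

    // NaN comparisons are false, so whenever either input is a NaN the
    // second (RHS) operand is returned -- matching the C idiom
    // "x < y ? x : y" that this combine recognizes.
    double sse_min(double lhs, double rhs) {
      return lhs < rhs ? lhs : rhs;
    }
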
@@ -8035,7 +8423,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
-
+
// If this is a select between two integer constants, try to do some
// optimizations.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
@@ -8045,7 +8433,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// If this is efficiently invertible, canonicalize the LHSC/RHSC values
// so that TrueC (the true value) is larger than FalseC.
bool NeedsCondInvert = false;
-
+
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
// Efficiently invertible.
(Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
@@ -8054,41 +8442,41 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
NeedsCondInvert = true;
std::swap(TrueC, FalseC);
}
-
+
// Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
if (FalseC->getAPIntValue() == 0 &&
TrueC->getAPIntValue().isPowerOf2()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, Cond.getValueType()));
-
+
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
-
+
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
DAG.getConstant(ShAmt, MVT::i8));
}
-
+
// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, Cond.getValueType()));
-
+
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
}
-
+
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
-
+
bool isFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
@@ -8104,13 +8492,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
break;
}
}
-
+
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, Cond.getValueType()));
-
+
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
@@ -8118,17 +8506,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, Cond.getValueType()));
-
+
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
- }
+ }
}
}
-
+
return SDValue();
}
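
In source terms, the three integer folds in this function look like the
following (hypothetical functions; the codegen noted matches the comments
above):

    #include <cstdint>

    uint32_t pow2_sel(bool c) { return c ? 8 : 0; }  // zext(c) << 3
    uint32_t succ_sel(bool c) { return c ? 5 : 4; }  // zext(c) + 4
    uint32_t lea_sel(bool c)  { return c ? 9 : 0; }  // zext(c) * 9 via LEA
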
@@ -8136,11 +8524,11 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
DebugLoc DL = N->getDebugLoc();
-
+
// If the flag operand isn't dead, don't touch this CMOV.
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
return SDValue();
-
+
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@@ -8149,12 +8537,12 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
-
+
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
}
-
+
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
@@ -8162,10 +8550,10 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Cond = N->getOperand(3);
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, MVT::i8), Cond);
-
+
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
-
+
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, MVT::i8));
@@ -8173,31 +8561,31 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
-
+
// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
SDValue Cond = N->getOperand(3);
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, MVT::i8), Cond);
-
+
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
-
+
if (N->getNumValues() == 2) // Dead flag value?
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
-
+
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
-
+
bool isFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
@@ -8213,7 +8601,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
break;
}
}
-
+
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
SDValue Cond = N->getOperand(3);
@@ -8235,7 +8623,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
- }
+ }
}
}
return SDValue();
@@ -8254,7 +8642,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
if (VT != MVT::i64)
return SDValue();
@@ -8289,17 +8677,17 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
std::swap(MulAmt1, MulAmt2);
SDValue NewMul;
- if (isPowerOf2_64(MulAmt1))
+ if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, VT));
- if (isPowerOf2_64(MulAmt2))
+ if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
- else
+ else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, VT));
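
The factorization feeding these two emissions (set up earlier in the
function, outside this hunk) splits the constant into a power of two
times one of the LEA-friendly multipliers 3, 5, or 9. For instance:

    #include <cstdint>

    // x * 40 = (x << 3) * 5: one shift plus one lea (reg + reg*4),
    // which is exactly the SHL/MUL_IMM pair built above.
    uint64_t mul40(uint64_t x) {
      return x * 40;
    }
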
@@ -8321,14 +8709,14 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
if (!Subtarget->hasSSE2())
return SDValue();
- MVT VT = N->getValueType(0);
+ EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
return SDValue();
SDValue ShAmtOp = N->getOperand(1);
- MVT EltVT = VT.getVectorElementType();
+ EVT EltVT = VT.getVectorElementType();
DebugLoc DL = N->getDebugLoc();
- SDValue BaseShAmt;
+ SDValue BaseShAmt = SDValue();
if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
unsigned NumElts = VT.getVectorNumElements();
unsigned i = 0;
@@ -8347,21 +8735,40 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
}
} else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
- DAG.getIntPtrConstant(0));
+ SDValue InVec = ShAmtOp.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = InVec.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+ if (BaseShAmt.getNode() == 0)
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
+ DAG.getIntPtrConstant(0));
} else
return SDValue();
+ // The shift amount is an i32.
if (EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
// The shift amount is identical so we can do a vector shift.
SDValue ValOp = N->getOperand(0);
switch (N->getOpcode()) {
default:
- assert(0 && "Unknown shift opcode!");
+ llvm_unreachable("Unknown shift opcode!");
break;
case ISD::SHL:
if (VT == MVT::v2i64)
@@ -8415,13 +8822,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
StoreSDNode *St = cast<StoreSDNode>(N);
- MVT VT = St->getValue().getValueType();
+ EVT VT = St->getValue().getValueType();
if (VT.getSizeInBits() != 64)
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
- bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
+ bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
@@ -8464,7 +8871,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget->is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
Ld->getBasePtr(), Ld->getSrcValue(),
Ld->getSrcValueOffset(), Ld->isVolatile(),
@@ -8568,9 +8975,9 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Op = N->getOperand(0);
if (Op.getOpcode() == ISD::BIT_CONVERT)
Op = Op.getOperand(0);
- MVT VT = N->getValueType(0), OpVT = Op.getValueType();
+ EVT VT = N->getValueType(0), OpVT = Op.getValueType();
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
- VT.getVectorElementType().getSizeInBits() ==
+ VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
}
@@ -8580,7 +8987,7 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and the
-// are not buffered), so we can fold away the common pattern of
+// are not buffered), so we can fold away the common pattern of
+// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
SDValue atomic = N->getOperand(0);
@@ -8601,11 +9008,11 @@ static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
default:
return SDValue();
}
-
+
SDValue fence = atomic.getOperand(0);
if (fence.getOpcode() != ISD::MEMBARRIER)
return SDValue();
-
+
switch (atomic.getOpcode()) {
case ISD::ATOMIC_CMP_SWAP:
return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
@@ -8657,6 +9064,101 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
+static bool LowerToBSwap(CallInst *CI) {
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical ops
+ // instead of emitting the bswap asm. For now, we don't support 486 or lower
+ // so don't worry about this.
+
+ // Verify this is a simple bswap.
+ if (CI->getNumOperands() != 2 ||
+ CI->getType() != CI->getOperand(1)->getType() ||
+ !CI->getType()->isInteger())
+ return false;
+
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ const Type *Tys[] = { Ty };
+ Module *M = CI->getParent()->getParent()->getParent();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+
+ Value *Op = CI->getOperand(1);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+
+ std::string AsmStr = IA->getAsmString();
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ std::vector<std::string> AsmPieces;
+ SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
+
+ // bswap $0
+ if (AsmPieces.size() == 2 &&
+ (AsmPieces[0] == "bswap" ||
+ AsmPieces[0] == "bswapq" ||
+ AsmPieces[0] == "bswapl") &&
+ (AsmPieces[1] == "$0" ||
+ AsmPieces[1] == "${0:q}")) {
+ // No need to check constraints; nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return LowerToBSwap(CI);
+ }
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
+ AsmPieces.size() == 3 &&
+ AsmPieces[0] == "rorw" &&
+ AsmPieces[1] == "$$8," &&
+ AsmPieces[2] == "${0:w}" &&
+ IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
+ return LowerToBSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
+ Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ std::vector<std::string> Words;
+ SplitString(AsmPieces[0], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+ Words.clear();
+ SplitString(AsmPieces[1], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+ Words.clear();
+ SplitString(AsmPieces[2], Words, " \t,");
+ if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+ Words[2] == "%edx") {
+ return LowerToBSwap(CI);
+ }
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+
+
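
The kind of user code this recognizes, at the C level (hypothetical): the
asm below reaches the backend as the string "bswapl $0" with constraint
"=r,0", which ExpandInlineAsm rewrites into the intrinsic:

    // Replaced by a call to llvm.bswap.i32, letting the optimizer see
    // through what was previously an opaque asm blob.
    static unsigned swap32(unsigned x) {
      __asm__("bswapl %0" : "=r"(x) : "0"(x));
      return x;
    }
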
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
@@ -8689,7 +9191,7 @@ X86TargetLowering::getConstraintType(const std::string &Constraint) const {
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
-LowerXConstraint(MVT ConstraintVT) const {
+LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
@@ -8749,7 +9251,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
const ConstantInt *CI = C->getConstantIntValue();
- if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
+ if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
break;
@@ -8763,7 +9266,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
const ConstantInt *CI = C->getConstantIntValue();
- if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
+ if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
break;
}
@@ -8803,16 +9307,22 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
continue;
}
}
-
+
// Otherwise, this isn't something we can handle, reject it.
return;
}
+ GlobalValue *GV = GA->getGlobal();
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
+ getTargetMachine())))
+ return;
+
if (hasMemory)
- Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), Offset, DAG);
+ Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
else
- Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
- Offset);
+ Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
Result = Op;
break;
}
@@ -8828,12 +9338,42 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
if (Constraint.size() == 1) {
// FIXME: not handling fp-stack yet!
switch (Constraint[0]) { // GCC X86 Constraint Letters
default: break; // Unknown constraint letter
- case 'q': // Q_REGS (GENERAL_REGS in 64-bit mode)
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ if (Subtarget->is64Bit()) {
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
+ X86::ESI, X86::EDI, X86::R8D, X86::R9D,
+ X86::R10D,X86::R11D,X86::R12D,
+ X86::R13D,X86::R14D,X86::R15D,
+ X86::EBP, X86::ESP, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
+ X86::SI, X86::DI, X86::R8W,X86::R9W,
+ X86::R10W,X86::R11W,X86::R12W,
+ X86::R13W,X86::R14W,X86::R15W,
+ X86::BP, X86::SP, 0);
+ else if (VT == MVT::i8)
+ return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
+ X86::SIL, X86::DIL, X86::R8B,X86::R9B,
+ X86::R10B,X86::R11B,X86::R12B,
+ X86::R13B,X86::R14B,X86::R15B,
+ X86::BPL, X86::SPL, 0);
+
+ else if (VT == MVT::i64)
+ return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
+ X86::RSI, X86::RDI, X86::R8, X86::R9,
+ X86::R10, X86::R11, X86::R12,
+ X86::R13, X86::R14, X86::R15,
+ X86::RBP, X86::RSP, 0);
+
+ break;
+ }
+ // 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32)
return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
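
A hypothetical use of the "q" constraint: in 32-bit mode only
%eax/%ebx/%ecx/%edx have addressable low bytes, while in 64-bit mode any
GPR does (via REX), which is what the expanded register lists encode:

    // "%b1" selects the byte subregister of operand 1; "q" obliges the
    // allocator to pick a byte-addressable register for it.
    static unsigned char low_byte(unsigned x) {
      unsigned char b;
      __asm__("movb %b1, %0" : "=q"(b) : "q"(x));
      return b;
    }
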
@@ -8852,7 +9392,7 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const {
+ EVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
@@ -8860,7 +9400,6 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
switch (Constraint[0]) {
default: break;
case 'r': // GENERAL_REGS
- case 'R': // LEGACY_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8)
return std::make_pair(0U, X86::GR8RegisterClass);
@@ -8869,6 +9408,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
if (VT == MVT::i32 || !Subtarget->is64Bit())
return std::make_pair(0U, X86::GR32RegisterClass);
return std::make_pair(0U, X86::GR64RegisterClass);
+ case 'R': // LEGACY_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
+ return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
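For the 'R' (LEGACY_REGS) case added above, a brief sketch of the constraint's meaning (GCC inline-asm semantics; illustrative only): the operand is confined to the eight legacy integer registers, excluding R8-R15, which is exactly what the new GR*_NOREX register classes express.

    static long legacy_only(long x) {
      asm("" : "+R"(x)); // the allocator must keep x out of R8-R15
      return x;
    }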
@@ -8886,7 +9433,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'x': // SSE_REGS if SSE1 allowed
if (!Subtarget->hasSSE1()) break;
- switch (VT.getSimpleVT()) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
@@ -8915,15 +9462,39 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Not found as a standard register?
if (Res.second == 0) {
- // GCC calls "st(0)" just plain "st".
+ // Map the st(0) .. st(7) constraint strings to the ST0 .. ST7 registers.
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' &&
+ tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' &&
+ Constraint[6] == '}') {
+
+ Res.first = X86::ST0+Constraint[4]-'0';
+ Res.second = X86::RFP80RegisterClass;
+ return Res;
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
if (StringsEqualNoCase("{st}", Constraint)) {
Res.first = X86::ST0;
Res.second = X86::RFP80RegisterClass;
+ return Res;
+ }
+
+ // flags -> EFLAGS
+ if (StringsEqualNoCase("{flags}", Constraint)) {
+ Res.first = X86::EFLAGS;
+ Res.second = X86::CCRRegisterClass;
+ return Res;
}
+
// 'A' means EAX + EDX.
if (Constraint == "A") {
Res.first = X86::EAX;
- Res.second = X86::GRADRegisterClass;
+ Res.second = X86::GR32_ADRegisterClass;
+ return Res;
}
return Res;
}
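A usage sketch for the explicit-register constraints handled above (GCC inline-asm, 32-bit mode; illustrative only): the "A" constraint names the EDX:EAX pair, which this lowering now resolves to X86::EAX with the GR32_AD register class.

    static unsigned long long read_tsc(void) {
      unsigned long long v;
      asm volatile("rdtsc" : "=A"(v)); // result arrives in EDX:EAX
      return v;
    }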
@@ -9015,7 +9586,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.
-MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
+EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
assert(VT.isVector());
if (isTypeLegal(VT))
return VT;
@@ -9024,7 +9595,7 @@ MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
// type based on element type. This would speed up our search (though
// it may not be worth it since the size of the list is relatively
// small).
- MVT EltVT = VT.getVectorElementType();
+ EVT EltVT = VT.getVectorElementType();
unsigned NElts = VT.getVectorNumElements();
// On X86, it makes sense to widen any vector wider than 1
@@ -9033,7 +9604,7 @@ MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
- MVT SVT = (MVT::SimpleValueType)nVT;
+ EVT SVT = (MVT::SimpleValueType)nVT;
if (isTypeLegal(SVT) &&
SVT.getVectorElementType() == EltVT &&
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index ffed46c..2f7b8ba 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -85,7 +85,7 @@ namespace llvm {
/// as.
FST,
- /// CALL/TAILCALL - These operations represent an abstract X86 call
+ /// CALL - These operations represent an abstract X86 call
/// instruction, which includes a bunch of information. In particular the
/// operands of these node are:
///
@@ -102,12 +102,8 @@ namespace llvm {
/// #1 - The first register result value (optional)
/// #2 - The second register result value (optional)
///
- /// The CALL vs TAILCALL distinction boils down to whether the callee is
- /// known not to modify the caller's stack frame, as is standard with
- /// LLVM.
CALL,
- TAILCALL,
-
+
/// RDTSC_DAG - This operation implements the lowering for
/// readcyclecounter
RDTSC_DAG,
@@ -208,17 +204,6 @@ namespace llvm {
LCMPXCHG_DAG,
LCMPXCHG8_DAG,
- // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
- // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
- // Atomic 64-bit binary operations.
- ATOMADD64_DAG,
- ATOMSUB64_DAG,
- ATOMOR64_DAG,
- ATOMXOR64_DAG,
- ATOMAND64_DAG,
- ATOMNAND64_DAG,
- ATOMSWAP64_DAG,
-
// FNSTCW16m - Store FP control world into i16 memory.
FNSTCW16m,
@@ -241,10 +226,29 @@ namespace llvm {
// ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results.
ADD, SUB, SMUL, UMUL,
- INC, DEC,
+ INC, DEC, OR, XOR, AND,
// MUL_IMM - X86 specific multiply by immediate.
- MUL_IMM
+ MUL_IMM,
+
+ // PTEST - Vector bitwise comparisons
+ PTEST,
+
+ // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+ // according to %al. An operator is needed so that this can be expanded
+ // with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+ // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+ // Atomic 64-bit binary operations.
+ ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMSWAP64_DAG
};
}
@@ -333,6 +337,15 @@ namespace llvm {
/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
/// instructions.
unsigned getShufflePSHUFLWImmediate(SDNode *N);
+
+ /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+ /// constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+ /// isOffsetSuitableForCodeModel - Returns true if the given offset can
+ /// fit into the displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement = true);
}
//===--------------------------------------------------------------------===//
@@ -374,12 +387,17 @@ namespace llvm {
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
- /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+ /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for
/// determining it.
- virtual
- MVT getOptimalMemOpType(uint64_t Size, unsigned Align,
- bool isSrcConst, bool isSrcStr,
- SelectionDAG &DAG) const;
+ virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+ bool isSrcConst, bool isSrcStr,
+ SelectionDAG &DAG) const;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses of the specified type.
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const {
+ return true;
+ }
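Why returning true unconditionally is safe here (a sketch of the hardware behavior, not a performance claim): x86 supports unaligned scalar loads and stores directly, so a deliberately misaligned access like the one below can still compile to a single mov.

    #include <stdint.h>
    #include <string.h>
    static uint32_t load_u32(const uint8_t *p) {
      uint32_t v;
      memcpy(&v, p + 1, sizeof v); // misaligned source; still one load on x86
      return v;
    }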
/// LowerOperation - Provide custom lowering hooks for some operations.
///
@@ -395,7 +413,8 @@ namespace llvm {
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
/// getTargetNodeName - This method returns the name of a target specific
@@ -403,7 +422,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual MVT getSetCCResultType(MVT VT) const;
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
/// computeMaskedBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
@@ -420,13 +439,15 @@ namespace llvm {
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+
ConstraintType getConstraintType(const std::string &Constraint) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
- virtual const char *LowerXConstraint(MVT ConstraintVT) const;
+ virtual const char *LowerXConstraint(EVT ConstraintVT) const;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
@@ -444,7 +465,7 @@ namespace llvm {
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
@@ -454,7 +475,7 @@ namespace llvm {
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
/// register EAX to i16 by referencing its sub-register AX.
virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
- virtual bool isTruncateFree(MVT VT1, MVT VT2) const;
+ virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
/// isZExtFree - Return true if any actual instruction that defines a
/// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
@@ -465,31 +486,31 @@ namespace llvm {
/// all instructions that define 32-bit values implicitly zero-extend the
/// result out to 64 bits.
virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
- virtual bool isZExtFree(MVT VT1, MVT VT2) const;
+ virtual bool isZExtFree(EVT VT1, EVT VT2) const;
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
- virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;
+ virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask
/// values are assumed to be legal.
virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
- MVT VT) const;
+ EVT VT) const;
/// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
/// use this to indicate if there is a suitable
/// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
/// pool entry.
virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- MVT VT) const;
+ EVT VT) const;
/// ShouldShrinkFPConstant - If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
- virtual bool ShouldShrinkFPConstant(MVT VT) const {
+ virtual bool ShouldShrinkFPConstant(EVT VT) const {
// Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
@@ -497,11 +518,14 @@ namespace llvm {
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
- /// for tail call optimization. Target which want to do tail call
+ /// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
- virtual bool IsEligibleForTailCallOptimization(CallSDNode *TheCall,
- SDValue Ret,
- SelectionDAG &DAG) const;
+ virtual bool
+ IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
virtual const X86Subtarget* getSubtarget() {
return Subtarget;
@@ -509,17 +533,17 @@ namespace llvm {
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
- bool isScalarFPTypeInSSEReg(MVT VT) const {
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 needs SSE2
(VT == MVT::f32 && X86ScalarSSEf32); // f32 needs SSE1
}
/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
- /// If there is no vector type that we want to widen to, returns MVT::Other
+ /// If there is no vector type that we want to widen to, returns EVT::Other
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.
- virtual MVT getWidenVectorType(MVT VT) const;
+ virtual EVT getWidenVectorType(EVT VT) const;
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
@@ -554,28 +578,30 @@ namespace llvm {
bool X86ScalarSSEf32;
bool X86ScalarSSEf64;
- SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
-
- SDValue LowerMemArgument(SDValue Op, SelectionDAG &DAG,
- const CCValAssign &VA, MachineFrameInfo *MFI,
- unsigned CC, SDValue Root, unsigned i);
-
- SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
- const SDValue &StackPtr,
- const CCValAssign &VA, SDValue Chain,
- SDValue Arg, ISD::ArgFlagsTy Flags);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned i);
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags);
// Call lowering helpers.
- bool IsCalleePop(bool isVarArg, unsigned CallingConv);
- bool CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall);
- bool CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall);
+ bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv);
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall, bool Is64Bit,
int FPDiff, DebugLoc dl);
- CCAssignFn *CCAssignFnForNode(unsigned CallingConv) const;
- NameDecorationStyle NameDecorationForFORMAL_ARGUMENTS(SDValue Op);
+ CCAssignFn *CCAssignFnForNode(CallingConv::ID CallConv) const;
+ NameDecorationStyle NameDecorationForCallConv(CallingConv::ID CallConv);
unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
@@ -595,7 +621,7 @@ namespace llvm {
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
- SDValue BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG);
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG);
@@ -612,10 +638,7 @@ namespace llvm {
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG);
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG);
@@ -635,6 +658,26 @@ namespace llvm {
SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG);
SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG);
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
+
void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned NewOp);
@@ -651,9 +694,17 @@ namespace llvm {
const Value *DstSV, uint64_t DstSVOff,
const Value *SrcSV, uint64_t SrcSVOff);
+ /// Utility function to emit the SSE4.2 string-processing instructions
+ /// that return their result in xmm0.
+ /// This takes the instruction to expand, the associated machine basic
+ /// block, the number of args, and whether or not the second arg is
+ /// in memory.
+ MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
+ unsigned argNum, bool inMem) const;
+
/// Utility function to emit atomic bitwise operations (and, or, xor).
- // It takes the bitwise instruction to expand, the associated machine basic
- // block, and the associated X86 opcodes for reg/reg and reg/imm.
+ /// It takes the bitwise instruction to expand, the associated machine basic
+ /// block, and the associated X86 opcodes for reg/reg and reg/imm.
MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
MachineInstr *BInstr,
MachineBasicBlock *BB,
@@ -683,6 +734,15 @@ namespace llvm {
MachineBasicBlock *BB,
unsigned cmovOpc) const;
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG);
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index 472ba4c..ef19823 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -28,26 +28,29 @@ def i64i32imm_pcrel : Operand<i64> {
// 64-bits but only 8 bits are significant.
-def i64i8imm : Operand<i64>;
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExt8AsmOperand;
+}
def lea64mem : Operand<i64> {
let PrintMethod = "printlea64mem";
- let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
def lea64_32mem : Operand<i32> {
let PrintMethod = "printlea64_32mem";
let AsmOperandLowerMethod = "lower_lea64_32mem";
- let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+ let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
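A brief note on why the index operands above switch to the _NOSP register classes (AT&T syntax; illustrative lines, not from the patch): the x86-64 SIB byte reserves the RSP index encoding to mean "no index", so RSP may serve as a base but never as an index.

    leaq (%rsp,%rax,2), %rcx   # encodable: RSP is the base
    leaq (%rax,%rsp,2), %rcx   # not encodable: RSP cannot be the index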
//===----------------------------------------------------------------------===//
// Complex Pattern Definitions.
//
def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
- [add, mul, X86mul_imm, shl, or, frameindex, X86Wrapper,
- X86WrapperRIP],
- []>;
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
@@ -129,13 +132,40 @@ let isCall = 1 in
def CALL64pcrel32 : Ii32<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst, variable_ops),
"call\t$dst", []>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode, NotWin64]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
- "call\t{*}$dst", [(X86call GR64:$dst)]>;
+ "call\t{*}$dst", [(X86call GR64:$dst)]>,
+ Requires<[NotWin64]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
- "call\t{*}$dst", [(X86call (loadi64 addr:$dst))]>;
+ "call\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
+ Requires<[NotWin64]>;
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", []>;
}
+ // FIXME: We need to teach codegen about a single list of call-clobbered registers.
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. RSP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
+ Uses = [RSP] in {
+ def WINCALL64pcrel32 : I<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ "call\t$dst", []>,
+ Requires<[IsWin64]>;
+ def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call\t{*}$dst",
+ [(X86call GR64:$dst)]>, Requires<[IsWin64]>;
+ def WINCALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ "call\t{*}$dst",
+ [(X86call (loadi64 addr:$dst))]>, Requires<[IsWin64]>;
+ }
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
@@ -162,6 +192,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
[(brind GR64:$dst)]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
[(brind (loadi64 addr:$dst))]>;
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+ "ljmp{q}\t{*}$dst", []>;
}
//===----------------------------------------------------------------------===//
@@ -182,12 +214,18 @@ let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
def LEAVE64 : I<0xC9, RawFrm,
(outs), (ins), "leave", []>;
let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
-let mayLoad = 1 in
+let mayLoad = 1 in {
def POP64r : I<0x58, AddRegFrm,
(outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
-let mayStore = 1 in
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>;
+}
+let mayStore = 1 in {
def PUSH64r : I<0x50, AddRegFrm,
(outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>;
+}
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
@@ -246,6 +284,14 @@ let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI] in
def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
[(X86rep_stos i64)]>, REP;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scas{q}", []>;
+
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmps{q}", []>;
+
+// Fast system-call instructions
+def SYSEXIT64 : RI<0x35, RawFrm,
+ (outs), (ins), "sysexit", []>, TB;
+
//===----------------------------------------------------------------------===//
// Move Instructions...
//
@@ -275,6 +321,25 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store i64immSExt32:$src, addr:$dst)]>;
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins i8imm:$src),
+ "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64o32a : RIi32<0xA1, RawFrm, (outs), (ins i32imm:$src),
+ "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs i8imm:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+def MOV64ao32 : RIi32<0xA3, RawFrm, (outs i32imm:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+
+// Moves to and from segment registers
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+
// Sign/Zero extenders
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
@@ -332,13 +397,15 @@ def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
[(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
// Any instruction that defines a 32-bit result zeroes the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
-// be copying from a truncate, but any other 32-bit operation will zero-extend
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. And x86's cmov doesn't do anything if the
+// condition is false. But any other 32-bit operation will zero-extend
// up to 64 bits.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetInstrInfo::EXTRACT_SUBREG &&
- N->getOpcode() != ISD::CopyFromReg;
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != X86ISD::CMOV;
}]>;
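An illustration of the rule def32 encodes (AT&T syntax, behavioral sketch): an executed 32-bit def clears bits 63:32 of the full register,

    movl %esi, %eax    # bits 63:32 of RAX become zero

so a later i64 zero-extension of such a value is free. The node kinds excluded above are those that need not correspond to a fresh 32-bit write: truncates and copies may merely re-view an existing 64-bit value, and, per the comment above, a cmov whose condition is false leaves its destination untouched.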
// In the case of a 32-bit def that is known to implicitly zero-extend,
@@ -361,6 +428,10 @@ let neverHasSideEffects = 1 in {
//
let Defs = [EFLAGS] in {
+
+def ADD64i32 : RI<0x05, RawFrm, (outs), (ins i32imm:$src),
+ "add{q}\t{$src, %rax|%rax, $src}", []>;
+
let isTwoAddress = 1 in {
let isConvertibleToThreeAddress = 1 in {
let isCommutable = 1 in
@@ -386,6 +457,12 @@ def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:
"add{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (add GR64:$src1, (load addr:$src2))),
(implicit EFLAGS)]>;
+
+// Register-Register Addition - Equivalent to the normal rr form (ADD64rr), but
+// differently encoded.
+def ADD64mrmrr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>;
+
} // isTwoAddress
// Memory-Register Addition
@@ -403,6 +480,10 @@ def ADD64mi32 : RIi32<0x81, MRM0m, (outs), (ins i64mem:$dst, i64i32imm :$src2),
(implicit EFLAGS)]>;
let Uses = [EFLAGS] in {
+
+def ADC64i32 : RI<0x15, RawFrm, (outs), (ins i32imm:$src),
+ "adc{q}\t{$src, %rax|%rax, $src}", []>;
+
let isTwoAddress = 1 in {
let isCommutable = 1 in
def ADC64rr : RI<0x11, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
@@ -458,6 +539,9 @@ def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst),
(implicit EFLAGS)]>;
} // isTwoAddress
+def SUB64i32 : RI<0x2D, RawFrm, (outs), (ins i32imm:$src),
+ "sub{q}\t{$src, %rax|%rax, $src}", []>;
+
// Memory-Register Subtraction
def SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"sub{q}\t{$src2, $dst|$dst, $src2}",
@@ -494,6 +578,9 @@ def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:
[(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
} // isTwoAddress
+def SBB64i32 : RI<0x1D, RawFrm, (outs), (ins i32imm:$src),
+ "sbb{q}\t{$src, %rax|%rax, $src}", []>;
+
def SBB64mr : RI<0x19, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"sbb{q}\t{$src2, $dst|$dst, $src2}",
[(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
@@ -665,8 +752,10 @@ let isConvertibleToThreeAddress = 1 in // Can transform into LEA.
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
-// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
-// cheaper.
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", []>;
} // isTwoAddress
let Uses = [CL] in
@@ -729,6 +818,39 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
[(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
// Rotate instructions
+
+let isTwoAddress = 1 in {
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src),
+ "rcl{q}\t{1, $dst|$dst, 1}", []>;
+def RCL64m1 : RI<0xD1, MRM2m, (outs i64mem:$dst), (ins i64mem:$src),
+ "rcl{q}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+def RCL64mCL : RI<0xD3, MRM2m, (outs i64mem:$dst), (ins i64mem:$src),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs i64mem:$dst), (ins i64mem:$src, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src),
+ "rcr{q}\t{1, $dst|$dst, 1}", []>;
+def RCR64m1 : RI<0xD1, MRM3m, (outs i64mem:$dst), (ins i64mem:$src),
+ "rcr{q}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+def RCR64mCL : RI<0xD3, MRM3m, (outs i64mem:$dst), (ins i64mem:$src),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs i64mem:$dst), (ins i64mem:$src, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+}
+
let isTwoAddress = 1 in {
let Uses = [CL] in
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src),
@@ -839,6 +961,9 @@ def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
[(store (not (loadi64 addr:$dst)), addr:$dst)]>;
let Defs = [EFLAGS] in {
+def AND64i32 : RI<0x25, RawFrm, (outs), (ins i32imm:$src),
+ "and{q}\t{$src, %rax|%rax, $src}", []>;
+
let isTwoAddress = 1 in {
let isCommutable = 1 in
def AND64rr : RI<0x21, MRMDestReg,
@@ -912,6 +1037,9 @@ def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src),
[(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
(implicit EFLAGS)]>;
+def OR64i32 : RIi32<0x0D, RawFrm, (outs), (ins i32imm:$src),
+ "or{q}\t{$src, %rax|%rax, $src}", []>;
+
let isTwoAddress = 1 in {
let isCommutable = 1 in
def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
@@ -945,6 +1073,10 @@ def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"xor{q}\t{$src, $dst|$dst, $src}",
[(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
(implicit EFLAGS)]>;
+
+def XOR64i32 : RIi32<0x35, RawFrm, (outs), (ins i32imm:$src),
+ "xor{q}\t{$src, %rax|%rax, $src}", []>;
+
} // Defs = [EFLAGS]
//===----------------------------------------------------------------------===//
@@ -953,6 +1085,8 @@ def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
// Integer comparison
let Defs = [EFLAGS] in {
+def TEST64i32 : RI<0xA9, RawFrm, (outs), (ins i32imm:$src),
+ "test{q}\t{$src, %rax|%rax, $src}", []>;
let isCommutable = 1 in
def TEST64rr : RI<0x85, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"test{q}\t{$src2, $src1|$src1, $src2}",
@@ -973,10 +1107,15 @@ def TEST64mi32 : RIi32<0xF7, MRM0m, (outs),
[(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0),
(implicit EFLAGS)]>;
+
+def CMP64i32 : RI<0x3D, RawFrm, (outs), (ins i32imm:$src),
+ "cmp{q}\t{$src, %rax|%rax, $src}", []>;
def CMP64rr : RI<0x39, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"cmp{q}\t{$src2, $src1|$src1, $src2}",
[(X86cmp GR64:$src1, GR64:$src2),
(implicit EFLAGS)]>;
+def CMP64mrmrr : RI<0x3B, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}", []>;
def CMP64mr : RI<0x39, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"cmp{q}\t{$src2, $src1|$src1, $src2}",
[(X86cmp (loadi64 addr:$src1), GR64:$src2),
@@ -1306,14 +1445,12 @@ def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src)
// Alias instructions that map movr0 to xor. Use xorl instead of xorq; it's
// equivalent due to implicit zero-extending, and it sometimes has a smaller
// encoding.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
-// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
// when we have a better way to specify isel priority.
-let Defs = [EFLAGS], AddedComplexity = 1,
- isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins),
- "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
- [(set GR64:$dst, 0)]>;
+let AddedComplexity = 1 in
+def : Pat<(i64 0),
+ (SUBREG_TO_REG (i64 0), (MOV32r0), x86_subreg_32bit)>;
+
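Resulting codegen sketch for the (i64 0) pattern above (AT&T syntax): the 32-bit xor idiom suffices because the 32-bit write implicitly clears the upper half of the register.

    xorl %eax, %eax    # MOV32r0; all 64 bits of RAX are now zero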
// Materialize i64 constant where top 32-bits are zero.
let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
@@ -1343,12 +1480,12 @@ def TLS_addr64 : I<0, Pseudo, (outs), (ins lea64mem:$sym),
[(X86tlsaddr tls64addr:$sym)]>,
Requires<[In64BitMode]>;
-let AddedComplexity = 5 in
+let AddedComplexity = 5, isCodeGenOnly = 1 in
def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movq\t%gs:$src, $dst",
[(set GR64:$dst, (gsload addr:$src))]>, SegGS;
-let AddedComplexity = 5 in
+let AddedComplexity = 5, isCodeGenOnly = 1 in
def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movq\t%fs:$src, $dst",
[(set GR64:$dst, (fsload addr:$src))]>, SegFS;
@@ -1371,11 +1508,43 @@ def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
"xadd\t$val, $ptr",
[(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
TB, LOCK;
+
def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
"xchg\t$val, $ptr",
[(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
}
+// Optimized codegen when the non-memory output is not used.
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+def LOCK_ADD64mr : RI<0x03, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs),
+ (ins i64mem:$dst, i64i8imm :$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs),
+ (ins i64mem:$dst, i64i32imm :$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs),
+ (ins i64mem:$dst, i64i8imm :$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs),
+ (ins i64mem:$dst, i64i32imm:$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
+ "lock\n\t"
+ "inc{q}\t$dst", []>, LOCK;
+def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
+ "lock\n\t"
+ "dec{q}\t$dst", []>, LOCK;
+
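A hedged illustration of the situation these defs target (the __sync builtin below is the GCC/Clang form; illustrative only): an atomic add whose fetched result is unused can select a single lock-prefixed RMW instead of the LXADD64 exchange form.

    static void bump(long long *p) {
      __sync_fetch_and_add(p, 1); // result ignored; may emit: lock incq (%rdi)
    }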
// Atomic exchange, and, or, xor
let Constraints = "$val = $dst", Defs = [EFLAGS],
usesCustomDAGSchedInserter = 1 in {
@@ -1405,78 +1574,88 @@ def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
[(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
}
+// Segmentation support instructions
+
+// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// String manipulation instructions
+
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
-// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable references,
+// when not in the small code model, should use 'movabs'. FIXME: This is
+// really a hack; the 'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+
+// In static codegen with the small code model, we can get the address of a
+// label into a register with 'movl'. FIXME: This is a hack; the 'imm'
+// predicate of MOV64ri64i32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri tconstpool :$dst)>, Requires<[NotSmallCode]>;
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri tjumptable :$dst)>, Requires<[NotSmallCode]>;
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
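A compact summary of the three materializations selected above (AT&T syntax; the register choice is illustrative):

    movabsq $sym, %rax   # FarData: full 64-bit immediate (MOV64ri)
    movl    $sym, %eax   # small/static: 32-bit, zero-extended (MOV64ri64i32)
    movq    $sym, %rax   # kernel: 32-bit, sign-extended (MOV64ri32)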
// If we have small model and -static mode, it is safe to store global addresses
// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
-// should handle this sort of thing.
+// for MOV64mi32 should handle this sort of thing.
def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tconstpool:$src)>,
- Requires<[SmallCode, IsStatic]>;
+ Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tjumptable:$src)>,
- Requires<[SmallCode, IsStatic]>;
+ Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tglobaladdr:$src)>,
- Requires<[SmallCode, IsStatic]>;
+ Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
(MOV64mi32 addr:$dst, texternalsym:$src)>,
- Requires<[SmallCode, IsStatic]>;
-
-// If we have small model and -static mode, it is safe to store global addresses
-// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
-// should handle this sort of thing.
-def : Pat<(store (i64 (X86WrapperRIP tconstpool:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tconstpool:$src)>,
- Requires<[SmallCode, IsStatic]>;
-def : Pat<(store (i64 (X86WrapperRIP tjumptable:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tjumptable:$src)>,
- Requires<[SmallCode, IsStatic]>;
-def : Pat<(store (i64 (X86WrapperRIP tglobaladdr:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
- Requires<[SmallCode, IsStatic]>;
-def : Pat<(store (i64 (X86WrapperRIP texternalsym:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, texternalsym:$src)>,
- Requires<[SmallCode, IsStatic]>;
-
+ Requires<[NearData, IsStatic]>;
// Calls
// Direct PC relative function call for small code model. 32-bit displacement
// sign extended to 64-bit.
def : Pat<(X86call (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>;
+ (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>;
def : Pat<(X86call (i64 texternalsym:$dst)),
- (CALL64pcrel32 texternalsym:$dst)>;
-
-def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>;
-def : Pat<(X86tailcall (i64 texternalsym:$dst)),
- (CALL64pcrel32 texternalsym:$dst)>;
-
-def : Pat<(X86tailcall GR64:$dst),
- (CALL64r GR64:$dst)>;
+ (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>;
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>;
// tailcall stuff
-def : Pat<(X86tailcall GR32:$dst),
- (TAILCALL)>;
-def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
- (TAILCALL)>;
-def : Pat<(X86tailcall (i64 texternalsym:$dst)),
- (TAILCALL)>;
-
def : Pat<(X86tcret GR64:$dst, imm:$off),
(TCRETURNri64 GR64:$dst, imm:$off)>;
@@ -1540,30 +1719,15 @@ def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
def : Pat<(extloadi64i32 addr:$src),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV32rm addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
x86_subreg_32bit)>;
-def : Pat<(extloadi16i1 addr:$src),
- (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
- x86_subreg_8bit)>,
- Requires<[In64BitMode]>;
-def : Pat<(extloadi16i8 addr:$src),
- (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
- x86_subreg_8bit)>,
- Requires<[In64BitMode]>;
-
-// anyext
-def : Pat<(i64 (anyext GR8:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>;
-def : Pat<(i64 (anyext GR16:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
-def : Pat<(i64 (anyext GR32:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, x86_subreg_32bit)>;
-def : Pat<(i16 (anyext GR8:$src)),
- (INSERT_SUBREG (i16 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
- Requires<[In64BitMode]>;
-def : Pat<(i32 (anyext GR8:$src)),
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
- Requires<[In64BitMode]>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
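A sketch of the partial-register concern these lowerings avoid (AT&T syntax): each chosen form writes a full 32- or 64-bit register, e.g. for (i64 (anyext GR16:$src)):

    movzwq %si, %rax   # MOVZX64rr16: no stale upper bits, no merge stall

whereas a 16-bit-only write would have to merge with, and thus depend on, the old contents of RAX.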
//===----------------------------------------------------------------------===//
// Some peepholes
@@ -1661,6 +1825,11 @@ def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
x86_subreg_8bit_hi))>,
Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
@@ -1668,6 +1837,13 @@ def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
x86_subreg_8bit_hi)),
x86_subreg_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
// h-register extract and store.
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
@@ -1906,6 +2082,102 @@ def : Pat<(parallel (store (i64 (X86dec_flag (loadi64 addr:$dst))), addr:$dst),
(implicit EFLAGS)),
(DEC64m addr:$dst)>;
+// Register-Register Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (OR64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical Or with EFLAGS result
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical XOr with EFLAGS result
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (AND64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical And with EFLAGS result
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND64mi32 addr:$dst, i64immSExt32:$src2)>;
+
//===----------------------------------------------------------------------===//
// X86-64 SSE Instructions
//===----------------------------------------------------------------------===//
@@ -1977,3 +2249,15 @@ let isTwoAddress = 1 in {
}
defm PINSRQ : SS41I_insert64<0x22, "pinsrq">;
+
+// -disable-16bit support.
+def : Pat<(truncstorei16 (i64 imm:$src), addr:$dst),
+ (MOV16mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei16 GR64:$src, addr:$dst),
+ (MOV16mr addr:$dst, (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+def : Pat<(i64 (sextloadi16 addr:$dst)),
+ (MOVSX64rm16 addr:$dst)>;
+def : Pat<(i64 (zextloadi16 addr:$dst)),
+ (MOVZX64rm16 addr:$dst)>;
+def : Pat<(i64 (extloadi16 addr:$dst)),
+ (MOVZX64rm16 addr:$dst)>;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index 6359542..c475b56 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
namespace llvm {
@@ -47,7 +48,7 @@ struct X86AddressMode {
unsigned Scale;
unsigned IndexReg;
- unsigned Disp;
+ int Disp;
GlobalValue *GV;
unsigned GVOpFlags;
@@ -61,20 +62,20 @@ struct X86AddressMode {
/// current instruction -- that is, a dereference of an address in a register,
/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
///
-inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
- unsigned Reg) {
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
// Because memory references are always represented with four
// values, this adds: Reg, [1, NoReg, 0] to the instruction.
return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
}
-inline const MachineInstrBuilder &addLeaOffset(const MachineInstrBuilder &MIB,
- int Offset) {
+static inline const MachineInstrBuilder &
+addLeaOffset(const MachineInstrBuilder &MIB, int Offset) {
return MIB.addImm(1).addReg(0).addImm(Offset);
}
-inline const MachineInstrBuilder &addOffset(const MachineInstrBuilder &MIB,
- int Offset) {
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
return addLeaOffset(MIB, Offset).addReg(0);
}
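A hedged usage sketch of the helpers above (MBB, MI, DL, and TII are assumed to be in scope, as in typical target code): emit a "movl $0, (%eax)"-style instruction by letting addDirectMem append the base/scale/index/disp quad before the stored immediate.

    addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV32mi)), X86::EAX)
        .addImm(0);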
@@ -82,29 +83,29 @@ inline const MachineInstrBuilder &addOffset(const MachineInstrBuilder &MIB,
/// [Reg + Offset], i.e., one with no scale or index, but with a
/// displacement. An example is: DWORD PTR [EAX + 4].
///
-inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
- unsigned Reg, bool isKill,
- int Offset) {
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
}
-inline const MachineInstrBuilder &addLeaRegOffset(const MachineInstrBuilder &MIB,
- unsigned Reg, bool isKill,
- int Offset) {
+static inline const MachineInstrBuilder &
+addLeaRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
return addLeaOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
}
/// addRegReg - This function is used to add a memory reference of the form:
/// [Reg + Reg].
-inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
unsigned Reg1, bool isKill1,
unsigned Reg2, bool isKill2) {
return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
.addReg(Reg2, getKillRegState(isKill2)).addImm(0);
}
-inline const MachineInstrBuilder &addLeaAddress(const MachineInstrBuilder &MIB,
- const X86AddressMode &AM) {
+static inline const MachineInstrBuilder &
+addLeaAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) {
assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
if (AM.BaseType == X86AddressMode::RegBase)
@@ -120,8 +121,9 @@ inline const MachineInstrBuilder &addLeaAddress(const MachineInstrBuilder &MIB,
return MIB.addImm(AM.Disp);
}
-inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
- const X86AddressMode &AM) {
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
return addLeaAddress(MIB, AM).addReg(0);
}
@@ -130,7 +132,7 @@ inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
/// reference has base register as the FrameIndex offset until it is resolved.
/// This allows a constant offset to be specified as well...
///
-inline const MachineInstrBuilder &
+static inline const MachineInstrBuilder &
addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
MachineInstr *MI = MIB;
MachineFunction &MF = *MI->getParent()->getParent();
@@ -141,11 +143,11 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
Flags |= MachineMemOperand::MOLoad;
if (TID.mayStore())
Flags |= MachineMemOperand::MOStore;
- MachineMemOperand MMO(PseudoSourceValue::getFixedStack(FI),
- Flags,
- MFI.getObjectOffset(FI) + Offset,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ Flags, Offset,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
return addOffset(MIB.addFrameIndex(FI), Offset)
.addMemOperand(MMO);
}
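A usage sketch under the same assumptions (DestReg and stack slot FI are placeholders): reload a register from a frame index; the helper appends the frame-index address and attaches the fixed-stack MachineMemOperand built above.

    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV32rm), DestReg), FI);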
@@ -157,7 +159,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
/// the GlobalBaseReg parameter can be used to make this a
/// GlobalBaseReg-relative reference.
///
-inline const MachineInstrBuilder &
+static inline const MachineInstrBuilder &
addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
unsigned GlobalBaseReg, unsigned char OpFlags) {
//FIXME: factor this
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index bc7def4..7e37373 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -303,6 +303,31 @@ def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
}
def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+// Versions of FP instructions that take a single memory operand. Added for the
+// disassembler; remove them once they are covered by patterns elsewhere.
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp\t$src">;
+
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp\t$src">;
+
+def FISTTP32m: FPI<0xDD, MRM1m, (outs i32mem:$dst), (ins), "fisttp{l}\t$dst">;
+def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fsave\t$dst">;
+def FSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{w}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{w}\t$src">;
+
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
// Floating point cmovs.
multiclass FPCMov<PatLeaf cc> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index eeed5bd..abdb313 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -79,6 +79,7 @@ class XD { bits<4> Prefix = 11; }
class XS { bits<4> Prefix = 12; }
class T8 { bits<4> Prefix = 13; }
class TA { bits<4> Prefix = 14; }
+class TF { bits<4> Prefix = 15; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr>
@@ -142,6 +143,24 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
let Pattern = pattern;
}
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
// SSE1 Instruction Templates:
//
// SSI - SSE1 instructions with XS prefix.
@@ -229,6 +248,16 @@ class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
: I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>;
+// SS42FI - SSE 4.2 instructions with TF prefix.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TF, Requires<[HasSSE42]>;
+
+// SS42AI - SSE 4.2 instructions with TA prefix.
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE42]>;
+
// X86-64 Instruction templates...
//
@@ -282,4 +311,3 @@ class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> patter
: Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
-
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index e5d84c5..e8a39d1 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -18,8 +18,8 @@
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
-#include "llvm/GlobalVariable.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -27,24 +27,24 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
-namespace {
- cl::opt<bool>
- NoFusing("disable-spill-fusing",
- cl::desc("Disable fusing of spill code into instructions"));
- cl::opt<bool>
- PrintFailedFusing("print-failed-fuse-candidates",
- cl::desc("Print instructions that the allocator wants to"
- " fuse, but the X86 backend currently can't"),
- cl::Hidden);
- cl::opt<bool>
- ReMatPICStubLoad("remat-pic-stub-load",
- cl::desc("Re-materialize load from stub in PIC mode"),
- cl::init(false), cl::Hidden);
-}
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
: TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
@@ -212,9 +212,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
unsigned RegOp = OpTbl2Addr[i][0];
unsigned MemOp = OpTbl2Addr[i][1];
if (!RegOp2MemOpTable2Addr.insert(std::make_pair((unsigned*)RegOp,
- MemOp)).second)
+ std::make_pair(MemOp,0))).second)
assert(false && "Duplicated entries?");
- unsigned AuxInfo = 0 | (1 << 4) | (1 << 5); // Index 0,folded load and store
+ // Index 0, folded load and store, no alignment requirement.
+ unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
std::make_pair(RegOp,
AuxInfo))).second)
@@ -222,93 +223,94 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
}
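+  // A sketch of how the packed AuxInfo word is decoded again on the unfold
+  // side (bit layout as encoded above; variable names are illustrative):
+  //   unsigned OpNum       = AuxInfo & 0xf;      // operand index (0 here)
+  //   bool     FoldedLoad  = AuxInfo & (1 << 4); // a load was folded
+  //   bool     FoldedStore = AuxInfo & (1 << 5); // a store was folded
+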
  // If the third value is 1, then it's folding either a load or a store.
+  // The fourth value is the minimum required alignment of the memory operand.
- static const unsigned OpTbl0[][3] = {
- { X86::BT16ri8, X86::BT16mi8, 1 },
- { X86::BT32ri8, X86::BT32mi8, 1 },
- { X86::BT64ri8, X86::BT64mi8, 1 },
- { X86::CALL32r, X86::CALL32m, 1 },
- { X86::CALL64r, X86::CALL64m, 1 },
- { X86::CMP16ri, X86::CMP16mi, 1 },
- { X86::CMP16ri8, X86::CMP16mi8, 1 },
- { X86::CMP16rr, X86::CMP16mr, 1 },
- { X86::CMP32ri, X86::CMP32mi, 1 },
- { X86::CMP32ri8, X86::CMP32mi8, 1 },
- { X86::CMP32rr, X86::CMP32mr, 1 },
- { X86::CMP64ri32, X86::CMP64mi32, 1 },
- { X86::CMP64ri8, X86::CMP64mi8, 1 },
- { X86::CMP64rr, X86::CMP64mr, 1 },
- { X86::CMP8ri, X86::CMP8mi, 1 },
- { X86::CMP8rr, X86::CMP8mr, 1 },
- { X86::DIV16r, X86::DIV16m, 1 },
- { X86::DIV32r, X86::DIV32m, 1 },
- { X86::DIV64r, X86::DIV64m, 1 },
- { X86::DIV8r, X86::DIV8m, 1 },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0 },
- { X86::FsMOVAPDrr, X86::MOVSDmr, 0 },
- { X86::FsMOVAPSrr, X86::MOVSSmr, 0 },
- { X86::IDIV16r, X86::IDIV16m, 1 },
- { X86::IDIV32r, X86::IDIV32m, 1 },
- { X86::IDIV64r, X86::IDIV64m, 1 },
- { X86::IDIV8r, X86::IDIV8m, 1 },
- { X86::IMUL16r, X86::IMUL16m, 1 },
- { X86::IMUL32r, X86::IMUL32m, 1 },
- { X86::IMUL64r, X86::IMUL64m, 1 },
- { X86::IMUL8r, X86::IMUL8m, 1 },
- { X86::JMP32r, X86::JMP32m, 1 },
- { X86::JMP64r, X86::JMP64m, 1 },
- { X86::MOV16ri, X86::MOV16mi, 0 },
- { X86::MOV16rr, X86::MOV16mr, 0 },
- { X86::MOV32ri, X86::MOV32mi, 0 },
- { X86::MOV32rr, X86::MOV32mr, 0 },
- { X86::MOV64ri32, X86::MOV64mi32, 0 },
- { X86::MOV64rr, X86::MOV64mr, 0 },
- { X86::MOV8ri, X86::MOV8mi, 0 },
- { X86::MOV8rr, X86::MOV8mr, 0 },
- { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0 },
- { X86::MOVAPDrr, X86::MOVAPDmr, 0 },
- { X86::MOVAPSrr, X86::MOVAPSmr, 0 },
- { X86::MOVDQArr, X86::MOVDQAmr, 0 },
- { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0 },
- { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0 },
- { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0 },
- { X86::MOVSDrr, X86::MOVSDmr, 0 },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, 0 },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0 },
- { X86::MOVSSrr, X86::MOVSSmr, 0 },
- { X86::MOVUPDrr, X86::MOVUPDmr, 0 },
- { X86::MOVUPSrr, X86::MOVUPSmr, 0 },
- { X86::MUL16r, X86::MUL16m, 1 },
- { X86::MUL32r, X86::MUL32m, 1 },
- { X86::MUL64r, X86::MUL64m, 1 },
- { X86::MUL8r, X86::MUL8m, 1 },
- { X86::SETAEr, X86::SETAEm, 0 },
- { X86::SETAr, X86::SETAm, 0 },
- { X86::SETBEr, X86::SETBEm, 0 },
- { X86::SETBr, X86::SETBm, 0 },
- { X86::SETEr, X86::SETEm, 0 },
- { X86::SETGEr, X86::SETGEm, 0 },
- { X86::SETGr, X86::SETGm, 0 },
- { X86::SETLEr, X86::SETLEm, 0 },
- { X86::SETLr, X86::SETLm, 0 },
- { X86::SETNEr, X86::SETNEm, 0 },
- { X86::SETNOr, X86::SETNOm, 0 },
- { X86::SETNPr, X86::SETNPm, 0 },
- { X86::SETNSr, X86::SETNSm, 0 },
- { X86::SETOr, X86::SETOm, 0 },
- { X86::SETPr, X86::SETPm, 0 },
- { X86::SETSr, X86::SETSm, 0 },
- { X86::TAILJMPr, X86::TAILJMPm, 1 },
- { X86::TEST16ri, X86::TEST16mi, 1 },
- { X86::TEST32ri, X86::TEST32mi, 1 },
- { X86::TEST64ri32, X86::TEST64mi32, 1 },
- { X86::TEST8ri, X86::TEST8mi, 1 }
+ static const unsigned OpTbl0[][4] = {
+ { X86::BT16ri8, X86::BT16mi8, 1, 0 },
+ { X86::BT32ri8, X86::BT32mi8, 1, 0 },
+ { X86::BT64ri8, X86::BT64mi8, 1, 0 },
+ { X86::CALL32r, X86::CALL32m, 1, 0 },
+ { X86::CALL64r, X86::CALL64m, 1, 0 },
+ { X86::CMP16ri, X86::CMP16mi, 1, 0 },
+ { X86::CMP16ri8, X86::CMP16mi8, 1, 0 },
+ { X86::CMP16rr, X86::CMP16mr, 1, 0 },
+ { X86::CMP32ri, X86::CMP32mi, 1, 0 },
+ { X86::CMP32ri8, X86::CMP32mi8, 1, 0 },
+ { X86::CMP32rr, X86::CMP32mr, 1, 0 },
+ { X86::CMP64ri32, X86::CMP64mi32, 1, 0 },
+ { X86::CMP64ri8, X86::CMP64mi8, 1, 0 },
+ { X86::CMP64rr, X86::CMP64mr, 1, 0 },
+ { X86::CMP8ri, X86::CMP8mi, 1, 0 },
+ { X86::CMP8rr, X86::CMP8mr, 1, 0 },
+ { X86::DIV16r, X86::DIV16m, 1, 0 },
+ { X86::DIV32r, X86::DIV32m, 1, 0 },
+ { X86::DIV64r, X86::DIV64m, 1, 0 },
+ { X86::DIV8r, X86::DIV8m, 1, 0 },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0, 16 },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, 0, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, 0, 0 },
+ { X86::IDIV16r, X86::IDIV16m, 1, 0 },
+ { X86::IDIV32r, X86::IDIV32m, 1, 0 },
+ { X86::IDIV64r, X86::IDIV64m, 1, 0 },
+ { X86::IDIV8r, X86::IDIV8m, 1, 0 },
+ { X86::IMUL16r, X86::IMUL16m, 1, 0 },
+ { X86::IMUL32r, X86::IMUL32m, 1, 0 },
+ { X86::IMUL64r, X86::IMUL64m, 1, 0 },
+ { X86::IMUL8r, X86::IMUL8m, 1, 0 },
+ { X86::JMP32r, X86::JMP32m, 1, 0 },
+ { X86::JMP64r, X86::JMP64m, 1, 0 },
+ { X86::MOV16ri, X86::MOV16mi, 0, 0 },
+ { X86::MOV16rr, X86::MOV16mr, 0, 0 },
+ { X86::MOV32ri, X86::MOV32mi, 0, 0 },
+ { X86::MOV32rr, X86::MOV32mr, 0, 0 },
+ { X86::MOV64ri32, X86::MOV64mi32, 0, 0 },
+ { X86::MOV64rr, X86::MOV64mr, 0, 0 },
+ { X86::MOV8ri, X86::MOV8mi, 0, 0 },
+ { X86::MOV8rr, X86::MOV8mr, 0, 0 },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDmr, 0, 16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, 0, 16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
+ { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0, 0 },
+ { X86::MOVSDrr, X86::MOVSDmr, 0, 0 },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
+ { X86::MOVSSrr, X86::MOVSSmr, 0, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
+ { X86::MUL16r, X86::MUL16m, 1, 0 },
+ { X86::MUL32r, X86::MUL32m, 1, 0 },
+ { X86::MUL64r, X86::MUL64m, 1, 0 },
+ { X86::MUL8r, X86::MUL8m, 1, 0 },
+ { X86::SETAEr, X86::SETAEm, 0, 0 },
+ { X86::SETAr, X86::SETAm, 0, 0 },
+ { X86::SETBEr, X86::SETBEm, 0, 0 },
+ { X86::SETBr, X86::SETBm, 0, 0 },
+ { X86::SETEr, X86::SETEm, 0, 0 },
+ { X86::SETGEr, X86::SETGEm, 0, 0 },
+ { X86::SETGr, X86::SETGm, 0, 0 },
+ { X86::SETLEr, X86::SETLEm, 0, 0 },
+ { X86::SETLr, X86::SETLm, 0, 0 },
+ { X86::SETNEr, X86::SETNEm, 0, 0 },
+ { X86::SETNOr, X86::SETNOm, 0, 0 },
+ { X86::SETNPr, X86::SETNPm, 0, 0 },
+ { X86::SETNSr, X86::SETNSm, 0, 0 },
+ { X86::SETOr, X86::SETOm, 0, 0 },
+ { X86::SETPr, X86::SETPm, 0, 0 },
+ { X86::SETSr, X86::SETSm, 0, 0 },
+ { X86::TAILJMPr, X86::TAILJMPm, 1, 0 },
+ { X86::TEST16ri, X86::TEST16mi, 1, 0 },
+ { X86::TEST32ri, X86::TEST32mi, 1, 0 },
+ { X86::TEST64ri32, X86::TEST64mi32, 1, 0 },
+ { X86::TEST8ri, X86::TEST8mi, 1, 0 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
unsigned RegOp = OpTbl0[i][0];
unsigned MemOp = OpTbl0[i][1];
+ unsigned Align = OpTbl0[i][3];
if (!RegOp2MemOpTable0.insert(std::make_pair((unsigned*)RegOp,
- MemOp)).second)
+ std::make_pair(MemOp,Align))).second)
assert(false && "Duplicated entries?");
unsigned FoldedLoad = OpTbl0[i][2];
// Index 0, folded load or store.
@@ -319,338 +321,342 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
AmbEntries.push_back(MemOp);
}
- static const unsigned OpTbl1[][2] = {
- { X86::CMP16rr, X86::CMP16rm },
- { X86::CMP32rr, X86::CMP32rm },
- { X86::CMP64rr, X86::CMP64rm },
- { X86::CMP8rr, X86::CMP8rm },
- { X86::CVTSD2SSrr, X86::CVTSD2SSrm },
- { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm },
- { X86::CVTSI2SDrr, X86::CVTSI2SDrm },
- { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm },
- { X86::CVTSI2SSrr, X86::CVTSI2SSrm },
- { X86::CVTSS2SDrr, X86::CVTSS2SDrm },
- { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm },
- { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm },
- { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm },
- { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm },
- { X86::FsMOVAPDrr, X86::MOVSDrm },
- { X86::FsMOVAPSrr, X86::MOVSSrm },
- { X86::IMUL16rri, X86::IMUL16rmi },
- { X86::IMUL16rri8, X86::IMUL16rmi8 },
- { X86::IMUL32rri, X86::IMUL32rmi },
- { X86::IMUL32rri8, X86::IMUL32rmi8 },
- { X86::IMUL64rri32, X86::IMUL64rmi32 },
- { X86::IMUL64rri8, X86::IMUL64rmi8 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm },
- { X86::Int_COMISDrr, X86::Int_COMISDrm },
- { X86::Int_COMISSrr, X86::Int_COMISSrm },
- { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm },
- { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm },
- { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm },
- { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm },
- { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm },
- { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm },
- { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
- { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm },
- { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
- { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm },
- { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
- { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm },
- { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
- { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm },
- { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
- { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
- { X86::MOV16rr, X86::MOV16rm },
- { X86::MOV32rr, X86::MOV32rm },
- { X86::MOV64rr, X86::MOV64rm },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm },
- { X86::MOV64toSDrr, X86::MOV64toSDrm },
- { X86::MOV8rr, X86::MOV8rm },
- { X86::MOVAPDrr, X86::MOVAPDrm },
- { X86::MOVAPSrr, X86::MOVAPSrm },
- { X86::MOVDDUPrr, X86::MOVDDUPrm },
- { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm },
- { X86::MOVDQArr, X86::MOVDQArm },
- { X86::MOVSD2PDrr, X86::MOVSD2PDrm },
- { X86::MOVSDrr, X86::MOVSDrm },
- { X86::MOVSHDUPrr, X86::MOVSHDUPrm },
- { X86::MOVSLDUPrr, X86::MOVSLDUPrm },
- { X86::MOVSS2PSrr, X86::MOVSS2PSrm },
- { X86::MOVSSrr, X86::MOVSSrm },
- { X86::MOVSX16rr8, X86::MOVSX16rm8 },
- { X86::MOVSX32rr16, X86::MOVSX32rm16 },
- { X86::MOVSX32rr8, X86::MOVSX32rm8 },
- { X86::MOVSX64rr16, X86::MOVSX64rm16 },
- { X86::MOVSX64rr32, X86::MOVSX64rm32 },
- { X86::MOVSX64rr8, X86::MOVSX64rm8 },
- { X86::MOVUPDrr, X86::MOVUPDrm },
- { X86::MOVUPSrr, X86::MOVUPSrm },
- { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm },
- { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm },
- { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm },
- { X86::MOVZX16rr8, X86::MOVZX16rm8 },
- { X86::MOVZX32rr16, X86::MOVZX32rm16 },
- { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8 },
- { X86::MOVZX32rr8, X86::MOVZX32rm8 },
- { X86::MOVZX64rr16, X86::MOVZX64rm16 },
- { X86::MOVZX64rr32, X86::MOVZX64rm32 },
- { X86::MOVZX64rr8, X86::MOVZX64rm8 },
- { X86::PSHUFDri, X86::PSHUFDmi },
- { X86::PSHUFHWri, X86::PSHUFHWmi },
- { X86::PSHUFLWri, X86::PSHUFLWmi },
- { X86::RCPPSr, X86::RCPPSm },
- { X86::RCPPSr_Int, X86::RCPPSm_Int },
- { X86::RSQRTPSr, X86::RSQRTPSm },
- { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int },
- { X86::RSQRTSSr, X86::RSQRTSSm },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int },
- { X86::SQRTPDr, X86::SQRTPDm },
- { X86::SQRTPDr_Int, X86::SQRTPDm_Int },
- { X86::SQRTPSr, X86::SQRTPSm },
- { X86::SQRTPSr_Int, X86::SQRTPSm_Int },
- { X86::SQRTSDr, X86::SQRTSDm },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int },
- { X86::SQRTSSr, X86::SQRTSSm },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int },
- { X86::TEST16rr, X86::TEST16rm },
- { X86::TEST32rr, X86::TEST32rm },
- { X86::TEST64rr, X86::TEST64rm },
- { X86::TEST8rr, X86::TEST8rm },
+ static const unsigned OpTbl1[][3] = {
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDrm, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSrm, 0 },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, 16 },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, 16 },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, 16 },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, 16 },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, 16 },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
+ { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm, 0 },
+ { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm, 0 },
+ { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm, 0 },
+ { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm, 16 },
+ { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm, 16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, 16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, 16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, 16 },
+ { X86::MOVSD2PDrr, X86::MOVSD2PDrm, 0 },
+ { X86::MOVSDrr, X86::MOVSDrm, 0 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
+ { X86::MOVSS2PSrr, X86::MOVSS2PSrm, 0 },
+ { X86::MOVSSrr, X86::MOVSSrm, 0 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 16 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 16 },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, 16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, 16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, 16 },
+ { X86::RCPPSr, X86::RCPPSm, 16 },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int, 16 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, 16 },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, 16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, 16 },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int, 16 },
+ { X86::SQRTPSr, X86::SQRTPSm, 16 },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int, 16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
- { X86::UCOMISDrr, X86::UCOMISDrm },
- { X86::UCOMISSrr, X86::UCOMISSrm }
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
unsigned RegOp = OpTbl1[i][0];
unsigned MemOp = OpTbl1[i][1];
+ unsigned Align = OpTbl1[i][2];
if (!RegOp2MemOpTable1.insert(std::make_pair((unsigned*)RegOp,
- MemOp)).second)
+ std::make_pair(MemOp,Align))).second)
assert(false && "Duplicated entries?");
- unsigned AuxInfo = 1 | (1 << 4); // Index 1, folded load
+ // Index 1, folded load
+ unsigned AuxInfo = 1 | (1 << 4);
if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
std::make_pair(RegOp, AuxInfo))).second)
AmbEntries.push_back(MemOp);
}
- static const unsigned OpTbl2[][2] = {
- { X86::ADC32rr, X86::ADC32rm },
- { X86::ADC64rr, X86::ADC64rm },
- { X86::ADD16rr, X86::ADD16rm },
- { X86::ADD32rr, X86::ADD32rm },
- { X86::ADD64rr, X86::ADD64rm },
- { X86::ADD8rr, X86::ADD8rm },
- { X86::ADDPDrr, X86::ADDPDrm },
- { X86::ADDPSrr, X86::ADDPSrm },
- { X86::ADDSDrr, X86::ADDSDrm },
- { X86::ADDSSrr, X86::ADDSSrm },
- { X86::ADDSUBPDrr, X86::ADDSUBPDrm },
- { X86::ADDSUBPSrr, X86::ADDSUBPSrm },
- { X86::AND16rr, X86::AND16rm },
- { X86::AND32rr, X86::AND32rm },
- { X86::AND64rr, X86::AND64rm },
- { X86::AND8rr, X86::AND8rm },
- { X86::ANDNPDrr, X86::ANDNPDrm },
- { X86::ANDNPSrr, X86::ANDNPSrm },
- { X86::ANDPDrr, X86::ANDPDrm },
- { X86::ANDPSrr, X86::ANDPSrm },
- { X86::CMOVA16rr, X86::CMOVA16rm },
- { X86::CMOVA32rr, X86::CMOVA32rm },
- { X86::CMOVA64rr, X86::CMOVA64rm },
- { X86::CMOVAE16rr, X86::CMOVAE16rm },
- { X86::CMOVAE32rr, X86::CMOVAE32rm },
- { X86::CMOVAE64rr, X86::CMOVAE64rm },
- { X86::CMOVB16rr, X86::CMOVB16rm },
- { X86::CMOVB32rr, X86::CMOVB32rm },
- { X86::CMOVB64rr, X86::CMOVB64rm },
- { X86::CMOVBE16rr, X86::CMOVBE16rm },
- { X86::CMOVBE32rr, X86::CMOVBE32rm },
- { X86::CMOVBE64rr, X86::CMOVBE64rm },
- { X86::CMOVE16rr, X86::CMOVE16rm },
- { X86::CMOVE32rr, X86::CMOVE32rm },
- { X86::CMOVE64rr, X86::CMOVE64rm },
- { X86::CMOVG16rr, X86::CMOVG16rm },
- { X86::CMOVG32rr, X86::CMOVG32rm },
- { X86::CMOVG64rr, X86::CMOVG64rm },
- { X86::CMOVGE16rr, X86::CMOVGE16rm },
- { X86::CMOVGE32rr, X86::CMOVGE32rm },
- { X86::CMOVGE64rr, X86::CMOVGE64rm },
- { X86::CMOVL16rr, X86::CMOVL16rm },
- { X86::CMOVL32rr, X86::CMOVL32rm },
- { X86::CMOVL64rr, X86::CMOVL64rm },
- { X86::CMOVLE16rr, X86::CMOVLE16rm },
- { X86::CMOVLE32rr, X86::CMOVLE32rm },
- { X86::CMOVLE64rr, X86::CMOVLE64rm },
- { X86::CMOVNE16rr, X86::CMOVNE16rm },
- { X86::CMOVNE32rr, X86::CMOVNE32rm },
- { X86::CMOVNE64rr, X86::CMOVNE64rm },
- { X86::CMOVNO16rr, X86::CMOVNO16rm },
- { X86::CMOVNO32rr, X86::CMOVNO32rm },
- { X86::CMOVNO64rr, X86::CMOVNO64rm },
- { X86::CMOVNP16rr, X86::CMOVNP16rm },
- { X86::CMOVNP32rr, X86::CMOVNP32rm },
- { X86::CMOVNP64rr, X86::CMOVNP64rm },
- { X86::CMOVNS16rr, X86::CMOVNS16rm },
- { X86::CMOVNS32rr, X86::CMOVNS32rm },
- { X86::CMOVNS64rr, X86::CMOVNS64rm },
- { X86::CMOVO16rr, X86::CMOVO16rm },
- { X86::CMOVO32rr, X86::CMOVO32rm },
- { X86::CMOVO64rr, X86::CMOVO64rm },
- { X86::CMOVP16rr, X86::CMOVP16rm },
- { X86::CMOVP32rr, X86::CMOVP32rm },
- { X86::CMOVP64rr, X86::CMOVP64rm },
- { X86::CMOVS16rr, X86::CMOVS16rm },
- { X86::CMOVS32rr, X86::CMOVS32rm },
- { X86::CMOVS64rr, X86::CMOVS64rm },
- { X86::CMPPDrri, X86::CMPPDrmi },
- { X86::CMPPSrri, X86::CMPPSrmi },
- { X86::CMPSDrr, X86::CMPSDrm },
- { X86::CMPSSrr, X86::CMPSSrm },
- { X86::DIVPDrr, X86::DIVPDrm },
- { X86::DIVPSrr, X86::DIVPSrm },
- { X86::DIVSDrr, X86::DIVSDrm },
- { X86::DIVSSrr, X86::DIVSSrm },
- { X86::FsANDNPDrr, X86::FsANDNPDrm },
- { X86::FsANDNPSrr, X86::FsANDNPSrm },
- { X86::FsANDPDrr, X86::FsANDPDrm },
- { X86::FsANDPSrr, X86::FsANDPSrm },
- { X86::FsORPDrr, X86::FsORPDrm },
- { X86::FsORPSrr, X86::FsORPSrm },
- { X86::FsXORPDrr, X86::FsXORPDrm },
- { X86::FsXORPSrr, X86::FsXORPSrm },
- { X86::HADDPDrr, X86::HADDPDrm },
- { X86::HADDPSrr, X86::HADDPSrm },
- { X86::HSUBPDrr, X86::HSUBPDrm },
- { X86::HSUBPSrr, X86::HSUBPSrm },
- { X86::IMUL16rr, X86::IMUL16rm },
- { X86::IMUL32rr, X86::IMUL32rm },
- { X86::IMUL64rr, X86::IMUL64rm },
- { X86::MAXPDrr, X86::MAXPDrm },
- { X86::MAXPDrr_Int, X86::MAXPDrm_Int },
- { X86::MAXPSrr, X86::MAXPSrm },
- { X86::MAXPSrr_Int, X86::MAXPSrm_Int },
- { X86::MAXSDrr, X86::MAXSDrm },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int },
- { X86::MAXSSrr, X86::MAXSSrm },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int },
- { X86::MINPDrr, X86::MINPDrm },
- { X86::MINPDrr_Int, X86::MINPDrm_Int },
- { X86::MINPSrr, X86::MINPSrm },
- { X86::MINPSrr_Int, X86::MINPSrm_Int },
- { X86::MINSDrr, X86::MINSDrm },
- { X86::MINSDrr_Int, X86::MINSDrm_Int },
- { X86::MINSSrr, X86::MINSSrm },
- { X86::MINSSrr_Int, X86::MINSSrm_Int },
- { X86::MULPDrr, X86::MULPDrm },
- { X86::MULPSrr, X86::MULPSrm },
- { X86::MULSDrr, X86::MULSDrm },
- { X86::MULSSrr, X86::MULSSrm },
- { X86::OR16rr, X86::OR16rm },
- { X86::OR32rr, X86::OR32rm },
- { X86::OR64rr, X86::OR64rm },
- { X86::OR8rr, X86::OR8rm },
- { X86::ORPDrr, X86::ORPDrm },
- { X86::ORPSrr, X86::ORPSrm },
- { X86::PACKSSDWrr, X86::PACKSSDWrm },
- { X86::PACKSSWBrr, X86::PACKSSWBrm },
- { X86::PACKUSWBrr, X86::PACKUSWBrm },
- { X86::PADDBrr, X86::PADDBrm },
- { X86::PADDDrr, X86::PADDDrm },
- { X86::PADDQrr, X86::PADDQrm },
- { X86::PADDSBrr, X86::PADDSBrm },
- { X86::PADDSWrr, X86::PADDSWrm },
- { X86::PADDWrr, X86::PADDWrm },
- { X86::PANDNrr, X86::PANDNrm },
- { X86::PANDrr, X86::PANDrm },
- { X86::PAVGBrr, X86::PAVGBrm },
- { X86::PAVGWrr, X86::PAVGWrm },
- { X86::PCMPEQBrr, X86::PCMPEQBrm },
- { X86::PCMPEQDrr, X86::PCMPEQDrm },
- { X86::PCMPEQWrr, X86::PCMPEQWrm },
- { X86::PCMPGTBrr, X86::PCMPGTBrm },
- { X86::PCMPGTDrr, X86::PCMPGTDrm },
- { X86::PCMPGTWrr, X86::PCMPGTWrm },
- { X86::PINSRWrri, X86::PINSRWrmi },
- { X86::PMADDWDrr, X86::PMADDWDrm },
- { X86::PMAXSWrr, X86::PMAXSWrm },
- { X86::PMAXUBrr, X86::PMAXUBrm },
- { X86::PMINSWrr, X86::PMINSWrm },
- { X86::PMINUBrr, X86::PMINUBrm },
- { X86::PMULDQrr, X86::PMULDQrm },
- { X86::PMULHUWrr, X86::PMULHUWrm },
- { X86::PMULHWrr, X86::PMULHWrm },
- { X86::PMULLDrr, X86::PMULLDrm },
- { X86::PMULLDrr_int, X86::PMULLDrm_int },
- { X86::PMULLWrr, X86::PMULLWrm },
- { X86::PMULUDQrr, X86::PMULUDQrm },
- { X86::PORrr, X86::PORrm },
- { X86::PSADBWrr, X86::PSADBWrm },
- { X86::PSLLDrr, X86::PSLLDrm },
- { X86::PSLLQrr, X86::PSLLQrm },
- { X86::PSLLWrr, X86::PSLLWrm },
- { X86::PSRADrr, X86::PSRADrm },
- { X86::PSRAWrr, X86::PSRAWrm },
- { X86::PSRLDrr, X86::PSRLDrm },
- { X86::PSRLQrr, X86::PSRLQrm },
- { X86::PSRLWrr, X86::PSRLWrm },
- { X86::PSUBBrr, X86::PSUBBrm },
- { X86::PSUBDrr, X86::PSUBDrm },
- { X86::PSUBSBrr, X86::PSUBSBrm },
- { X86::PSUBSWrr, X86::PSUBSWrm },
- { X86::PSUBWrr, X86::PSUBWrm },
- { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm },
- { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm },
- { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm },
- { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm },
- { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm },
- { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm },
- { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm },
- { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm },
- { X86::PXORrr, X86::PXORrm },
- { X86::SBB32rr, X86::SBB32rm },
- { X86::SBB64rr, X86::SBB64rm },
- { X86::SHUFPDrri, X86::SHUFPDrmi },
- { X86::SHUFPSrri, X86::SHUFPSrmi },
- { X86::SUB16rr, X86::SUB16rm },
- { X86::SUB32rr, X86::SUB32rm },
- { X86::SUB64rr, X86::SUB64rm },
- { X86::SUB8rr, X86::SUB8rm },
- { X86::SUBPDrr, X86::SUBPDrm },
- { X86::SUBPSrr, X86::SUBPSrm },
- { X86::SUBSDrr, X86::SUBSDrm },
- { X86::SUBSSrr, X86::SUBSSrm },
+ static const unsigned OpTbl2[][3] = {
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, 16 },
+ { X86::ADDPSrr, X86::ADDPSrm, 16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, 16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, 16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, 16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, 16 },
+ { X86::ANDPDrr, X86::ANDPDrm, 16 },
+ { X86::ANDPSrr, X86::ANDPSrm, 16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, 16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, 16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, 16 },
+ { X86::DIVPSrr, X86::DIVPSrm, 16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm, 16 },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm, 16 },
+ { X86::FsANDPDrr, X86::FsANDPDrm, 16 },
+ { X86::FsANDPSrr, X86::FsANDPSrm, 16 },
+ { X86::FsORPDrr, X86::FsORPDrm, 16 },
+ { X86::FsORPSrr, X86::FsORPSrm, 16 },
+ { X86::FsXORPDrr, X86::FsXORPDrm, 16 },
+ { X86::FsXORPSrr, X86::FsXORPSrm, 16 },
+ { X86::HADDPDrr, X86::HADDPDrm, 16 },
+ { X86::HADDPSrr, X86::HADDPSrm, 16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, 16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, 16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, 16 },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int, 16 },
+ { X86::MAXPSrr, X86::MAXPSrm, 16 },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int, 16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MINPDrr, X86::MINPDrm, 16 },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int, 16 },
+ { X86::MINPSrr, X86::MINPSrm, 16 },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int, 16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MULPDrr, X86::MULPDrm, 16 },
+ { X86::MULPSrr, X86::MULPSrm, 16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, 16 },
+ { X86::ORPSrr, X86::ORPSrm, 16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, 16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, 16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, 16 },
+ { X86::PADDBrr, X86::PADDBrm, 16 },
+ { X86::PADDDrr, X86::PADDDrm, 16 },
+ { X86::PADDQrr, X86::PADDQrm, 16 },
+ { X86::PADDSBrr, X86::PADDSBrm, 16 },
+ { X86::PADDSWrr, X86::PADDSWrm, 16 },
+ { X86::PADDWrr, X86::PADDWrm, 16 },
+ { X86::PANDNrr, X86::PANDNrm, 16 },
+ { X86::PANDrr, X86::PANDrm, 16 },
+ { X86::PAVGBrr, X86::PAVGBrm, 16 },
+ { X86::PAVGWrr, X86::PAVGWrm, 16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, 16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, 16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, 16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, 16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, 16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, 16 },
+ { X86::PINSRWrri, X86::PINSRWrmi, 16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, 16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, 16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, 16 },
+ { X86::PMINSWrr, X86::PMINSWrm, 16 },
+ { X86::PMINUBrr, X86::PMINUBrm, 16 },
+ { X86::PMULDQrr, X86::PMULDQrm, 16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, 16 },
+ { X86::PMULHWrr, X86::PMULHWrm, 16 },
+ { X86::PMULLDrr, X86::PMULLDrm, 16 },
+ { X86::PMULLDrr_int, X86::PMULLDrm_int, 16 },
+ { X86::PMULLWrr, X86::PMULLWrm, 16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, 16 },
+ { X86::PORrr, X86::PORrm, 16 },
+ { X86::PSADBWrr, X86::PSADBWrm, 16 },
+ { X86::PSLLDrr, X86::PSLLDrm, 16 },
+ { X86::PSLLQrr, X86::PSLLQrm, 16 },
+ { X86::PSLLWrr, X86::PSLLWrm, 16 },
+ { X86::PSRADrr, X86::PSRADrm, 16 },
+ { X86::PSRAWrr, X86::PSRAWrm, 16 },
+ { X86::PSRLDrr, X86::PSRLDrm, 16 },
+ { X86::PSRLQrr, X86::PSRLQrm, 16 },
+ { X86::PSRLWrr, X86::PSRLWrm, 16 },
+ { X86::PSUBBrr, X86::PSUBBrm, 16 },
+ { X86::PSUBDrr, X86::PSUBDrm, 16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, 16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, 16 },
+ { X86::PSUBWrr, X86::PSUBWrm, 16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, 16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, 16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, 16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, 16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, 16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, 16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, 16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, 16 },
+ { X86::PXORrr, X86::PXORrm, 16 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, 16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, 16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, 16 },
+ { X86::SUBPSrr, X86::SUBPSrm, 16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
- { X86::UNPCKHPDrr, X86::UNPCKHPDrm },
- { X86::UNPCKHPSrr, X86::UNPCKHPSrm },
- { X86::UNPCKLPDrr, X86::UNPCKLPDrm },
- { X86::UNPCKLPSrr, X86::UNPCKLPSrm },
- { X86::XOR16rr, X86::XOR16rm },
- { X86::XOR32rr, X86::XOR32rm },
- { X86::XOR64rr, X86::XOR64rm },
- { X86::XOR8rr, X86::XOR8rm },
- { X86::XORPDrr, X86::XORPDrm },
- { X86::XORPSrr, X86::XORPSrm }
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, 16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, 16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, 16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, 16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, 16 },
+ { X86::XORPSrr, X86::XORPSrm, 16 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
unsigned RegOp = OpTbl2[i][0];
unsigned MemOp = OpTbl2[i][1];
+ unsigned Align = OpTbl2[i][2];
if (!RegOp2MemOpTable2.insert(std::make_pair((unsigned*)RegOp,
- MemOp)).second)
+ std::make_pair(MemOp,Align))).second)
assert(false && "Duplicated entries?");
- unsigned AuxInfo = 2 | (1 << 4); // Index 2, folded load
+ // Index 2, folded load
+ unsigned AuxInfo = 2 | (1 << 4);
if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
std::make_pair(RegOp, AuxInfo))).second)
AmbEntries.push_back(MemOp);
@@ -760,7 +766,6 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
-
/// regIsPICBase - Return true if the register is a PIC base (i.e., defined by
/// X86::MOVPC32r).
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
@@ -776,37 +781,9 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-/// isGVStub - Return true if the GV requires an extra load to get the
-/// real address.
-static inline bool isGVStub(GlobalValue *GV, X86TargetMachine &TM) {
- return TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
-}
-
-/// CanRematLoadWithDispOperand - Return true if a load with the specified
-/// operand is a candidate for remat: for this to be true we need to know that
-/// the load will always return the same value, even if moved.
-static bool CanRematLoadWithDispOperand(const MachineOperand &MO,
- X86TargetMachine &TM) {
- // Loads from constant pool entries can be remat'd.
- if (MO.isCPI()) return true;
-
- // We can remat globals in some cases.
- if (MO.isGlobal()) {
- // If this is a load of a stub, not of the global, we can remat it. This
- // access will always return the address of the global.
- if (isGVStub(MO.getGlobal(), TM))
- return true;
-
- // If the global itself is constant, we can remat the load.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal()))
- if (GV->isConstant())
- return true;
- }
- return false;
-}
-
bool
-X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const {
switch (MI->getOpcode()) {
default: break;
case X86::MOV8rm:
@@ -825,7 +802,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
if (MI->getOperand(1).isReg() &&
MI->getOperand(2).isImm() &&
MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
- CanRematLoadWithDispOperand(MI->getOperand(4), TM)) {
+ MI->isInvariantLoad(AA)) {
unsigned BaseReg = MI->getOperand(1).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -876,7 +853,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
/// would clobber the EFLAGS condition register. Note the result may be
/// conservative. If it cannot definitely determine the safety after visiting
-/// two instructions it assumes it's not safe.
+/// a few instructions in each direction, it assumes it's not safe.
static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
// It's always safe to clobber EFLAGS at the end of a block.
@@ -884,11 +861,13 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
return true;
  // For compile-time considerations, if we are not able to determine the
- // safety after visiting 2 instructions, we will assume it's not safe.
- for (unsigned i = 0; i < 2; ++i) {
+ // safety after visiting 4 instructions in each direction, we will assume
+ // it's not safe.
+ MachineBasicBlock::iterator Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
bool SeenDef = false;
- for (unsigned j = 0, e = I->getNumOperands(); j != e; ++j) {
- MachineOperand &MO = I->getOperand(j);
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
if (!MO.isReg())
continue;
if (MO.getReg() == X86::EFLAGS) {
@@ -901,10 +880,33 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
if (SeenDef)
// This instruction defines EFLAGS, no need to look any further.
return true;
- ++I;
+ ++Iter;
// If we make it to the end of the block, it's safe to clobber EFLAGS.
- if (I == MBB.end())
+ if (Iter == MBB.end())
+ return true;
+ }
+
+ Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ // If we make it to the beginning of the block, it's safe to clobber
+ // EFLAGS iff EFLAGS is not live-in.
+ if (Iter == MBB.begin())
+ return !MBB.isLiveIn(X86::EFLAGS);
+
+ --Iter;
+ bool SawKill = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+ if (MO.isDef()) return MO.isDead();
+ if (MO.isKill()) SawKill = true;
+ }
+ }
+
+ if (SawKill)
+ // This instruction kills EFLAGS and doesn't redefine it, so
+ // there's no need to look further.
return true;
}
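+  // Illustrative example: these scans make it safe to insert an
+  // EFLAGS-clobbering "xor reg, reg" (e.g. a rematerialized MOV32r0)
+  // immediately after an instruction whose EFLAGS def is dead, while still
+  // rejecting an insertion point between a CMP and the Jcc consuming it.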
@@ -914,14 +916,11 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg,
+ unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig) const {
DebugLoc DL = DebugLoc::getUnknownLoc();
if (I != MBB.end()) DL = I->getDebugLoc();
- unsigned SubIdx = Orig->getOperand(0).isReg()
- ? Orig->getOperand(0).getSubReg() : 0;
- bool ChangeSubIdx = SubIdx != 0;
if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
DestReg = RI.getSubReg(DestReg, SubIdx);
SubIdx = 0;
@@ -929,76 +928,36 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
// MOV32r0 etc. are implemented with xor which clobbers condition code.
// Re-materialize them as movri instructions to avoid side effects.
- bool Emitted = false;
- switch (Orig->getOpcode()) {
+ bool Clone = true;
+ unsigned Opc = Orig->getOpcode();
+ switch (Opc) {
default: break;
case X86::MOV8r0:
case X86::MOV16r0:
- case X86::MOV32r0:
- case X86::MOV64r0: {
+ case X86::MOV32r0: {
if (!isSafeToClobberEFLAGS(MBB, I)) {
- unsigned Opc = 0;
- switch (Orig->getOpcode()) {
+ switch (Opc) {
default: break;
case X86::MOV8r0: Opc = X86::MOV8ri; break;
case X86::MOV16r0: Opc = X86::MOV16ri; break;
case X86::MOV32r0: Opc = X86::MOV32ri; break;
- case X86::MOV64r0: Opc = X86::MOV64ri32; break;
}
- BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
- Emitted = true;
+ Clone = false;
}
break;
}
}
- if (!Emitted) {
+ if (Clone) {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
MI->getOperand(0).setReg(DestReg);
MBB.insert(I, MI);
+ } else {
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
}
- if (ChangeSubIdx) {
- MachineInstr *NewMI = prior(I);
- NewMI->getOperand(0).setSubReg(SubIdx);
- }
-}
-
-/// isInvariantLoad - Return true if the specified instruction (which is marked
-/// mayLoad) is loading from a location whose value is invariant across the
-/// function. For example, loading a value from the constant pool or from
-/// from the argument area of a function if it does not change. This should
-/// only return true of *all* loads the instruction does are invariant (if it
-/// does multiple loads).
-bool X86InstrInfo::isInvariantLoad(const MachineInstr *MI) const {
- // This code cares about loads from three cases: constant pool entries,
- // invariant argument slots, and global stubs. In order to handle these cases
- // for all of the myriad of X86 instructions, we just scan for a CP/FI/GV
- // operand and base our analysis on it. This is safe because the address of
- // none of these three cases is ever used as anything other than a load base
- // and X86 doesn't have any instructions that load from multiple places.
-
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- // Loads from constant pools are trivially invariant.
- if (MO.isCPI())
- return true;
-
- if (MO.isGlobal())
- return isGVStub(MO.getGlobal(), TM);
-
- // If this is a load from an invariant stack slot, the load is a constant.
- if (MO.isFI()) {
- const MachineFrameInfo &MFI =
- *MI->getParent()->getParent()->getFrameInfo();
- int Idx = MO.getIndex();
- return MFI.isFixedObjectIndex(Idx) && MFI.isImmutableObjectIndex(Idx);
- }
- }
-
- // All other instances of these instructions are presumed to have other
- // issues.
- return false;
+ MachineInstr *NewMI = prior(I);
+ NewMI->getOperand(0).setSubReg(SubIdx);
}
/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
@@ -1304,7 +1263,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
unsigned Opc;
unsigned Size;
switch (MI->getOpcode()) {
- default: assert(0 && "Unreachable!");
+ default: llvm_unreachable("Unreachable!");
case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
@@ -1459,7 +1418,7 @@ static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case X86::COND_E: return X86::JE;
case X86::COND_NE: return X86::JNE;
case X86::COND_L: return X86::JL;
@@ -1483,7 +1442,7 @@ unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
/// e.g. turning COND_E to COND_NE.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case X86::COND_E: return X86::COND_NE;
case X86::COND_NE: return X86::COND_E;
case X86::COND_L: return X86::COND_GE;
@@ -1699,14 +1658,26 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
/* Source and destination have the same register class. */;
else if (CommonRC->hasSuperClass(SrcRC))
CommonRC = SrcRC;
- else if (!DestRC->hasSubClass(SrcRC))
- CommonRC = 0;
+ else if (!DestRC->hasSubClass(SrcRC)) {
+  // Neither GR64_NOREX nor GR64_NOSP is a superclass of the other, but we
+  // want to copy them as GR64. Similarly, for GR32_NOREX and GR32_NOSP,
+  // copy as GR32.
+ if (SrcRC->hasSuperClass(&X86::GR64RegClass) &&
+ DestRC->hasSuperClass(&X86::GR64RegClass))
+ CommonRC = &X86::GR64RegClass;
+ else if (SrcRC->hasSuperClass(&X86::GR32RegClass) &&
+ DestRC->hasSuperClass(&X86::GR32RegClass))
+ CommonRC = &X86::GR32RegClass;
+ else
+ CommonRC = 0;
+ }
if (CommonRC) {
unsigned Opc;
- if (CommonRC == &X86::GR64RegClass) {
+ if (CommonRC == &X86::GR64RegClass || CommonRC == &X86::GR64_NOSPRegClass) {
Opc = X86::MOV64rr;
- } else if (CommonRC == &X86::GR32RegClass) {
+ } else if (CommonRC == &X86::GR32RegClass ||
+ CommonRC == &X86::GR32_NOSPRegClass) {
Opc = X86::MOV32rr;
} else if (CommonRC == &X86::GR16RegClass) {
Opc = X86::MOV16rr;
@@ -1731,7 +1702,8 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
Opc = X86::MOV8rr_NOREX;
else
Opc = X86::MOV8rr;
- } else if (CommonRC == &X86::GR64_NOREXRegClass) {
+ } else if (CommonRC == &X86::GR64_NOREXRegClass ||
+ CommonRC == &X86::GR64_NOREX_NOSPRegClass) {
Opc = X86::MOV64rr;
} else if (CommonRC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32rr;
@@ -1759,16 +1731,17 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg);
return true;
}
-
+
// Moving EFLAGS to / from another register requires a push and a pop.
if (SrcRC == &X86::CCRRegClass) {
if (SrcReg != X86::EFLAGS)
return false;
- if (DestRC == &X86::GR64RegClass) {
+ if (DestRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) {
BuildMI(MBB, MI, DL, get(X86::PUSHFQ));
BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
return true;
- } else if (DestRC == &X86::GR32RegClass) {
+ } else if (DestRC == &X86::GR32RegClass ||
+ DestRC == &X86::GR32_NOSPRegClass) {
BuildMI(MBB, MI, DL, get(X86::PUSHFD));
BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
return true;
@@ -1776,11 +1749,12 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
} else if (DestRC == &X86::CCRRegClass) {
if (DestReg != X86::EFLAGS)
return false;
- if (SrcRC == &X86::GR64RegClass) {
+    if (SrcRC == &X86::GR64RegClass || SrcRC == &X86::GR64_NOSPRegClass) {
BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
BuildMI(MBB, MI, DL, get(X86::POPFQ));
return true;
- } else if (SrcRC == &X86::GR32RegClass) {
+ } else if (SrcRC == &X86::GR32RegClass ||
+               SrcRC == &X86::GR32_NOSPRegClass) {
BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
BuildMI(MBB, MI, DL, get(X86::POPFD));
return true;
@@ -1838,9 +1812,9 @@ static unsigned getStoreRegOpcode(unsigned SrcReg,
bool isStackAligned,
TargetMachine &TM) {
unsigned Opc = 0;
- if (RC == &X86::GR64RegClass) {
+ if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
Opc = X86::MOV64mr;
- } else if (RC == &X86::GR32RegClass) {
+ } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
Opc = X86::MOV32mr;
} else if (RC == &X86::GR16RegClass) {
Opc = X86::MOV16mr;
@@ -1865,7 +1839,8 @@ static unsigned getStoreRegOpcode(unsigned SrcReg,
Opc = X86::MOV8mr_NOREX;
else
Opc = X86::MOV8mr;
- } else if (RC == &X86::GR64_NOREXRegClass) {
+ } else if (RC == &X86::GR64_NOREXRegClass ||
+ RC == &X86::GR64_NOREX_NOSPRegClass) {
Opc = X86::MOV64mr;
} else if (RC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32mr;
@@ -1889,8 +1864,7 @@ static unsigned getStoreRegOpcode(unsigned SrcReg,
} else if (RC == &X86::VR64RegClass) {
Opc = X86::MMX_MOVQ64mr;
} else {
- assert(0 && "Unknown regclass");
- abort();
+ llvm_unreachable("Unknown regclass");
}
return Opc;
@@ -1914,6 +1888,8 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
bool isKill,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
bool isAligned = (RI.getStackAlignment() >= 16) ||
RI.needsStackRealignment(MF);
@@ -1923,6 +1899,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
MIB.addOperand(Addr[i]);
MIB.addReg(SrcReg, getKillRegState(isKill));
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
@@ -1931,9 +1908,9 @@ static unsigned getLoadRegOpcode(unsigned DestReg,
bool isStackAligned,
const TargetMachine &TM) {
unsigned Opc = 0;
- if (RC == &X86::GR64RegClass) {
+ if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
Opc = X86::MOV64rm;
- } else if (RC == &X86::GR32RegClass) {
+ } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
Opc = X86::MOV32rm;
} else if (RC == &X86::GR16RegClass) {
Opc = X86::MOV16rm;
@@ -1958,7 +1935,8 @@ static unsigned getLoadRegOpcode(unsigned DestReg,
Opc = X86::MOV8rm_NOREX;
else
Opc = X86::MOV8rm;
- } else if (RC == &X86::GR64_NOREXRegClass) {
+ } else if (RC == &X86::GR64_NOREXRegClass ||
+ RC == &X86::GR64_NOREX_NOSPRegClass) {
Opc = X86::MOV64rm;
} else if (RC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32rm;
@@ -1982,8 +1960,7 @@ static unsigned getLoadRegOpcode(unsigned DestReg,
} else if (RC == &X86::VR64RegClass) {
Opc = X86::MMX_MOVQ64rm;
} else {
- assert(0 && "Unknown regclass");
- abort();
+ llvm_unreachable("Unknown regclass");
}
return Opc;
@@ -2005,6 +1982,8 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
bool isAligned = (RI.getStackAlignment() >= 16) ||
RI.needsStackRealignment(MF);
@@ -2013,6 +1992,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
MIB.addOperand(Addr[i]);
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
@@ -2026,9 +2006,11 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
if (MI != MBB.end()) DL = MI->getDebugLoc();
bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64();
unsigned SlotSize = is64Bit ? 8 : 4;
MachineFunction &MF = *MBB.getParent();
+ unsigned FPReg = RI.getFrameRegister(MF);
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CalleeFrameSize = 0;
@@ -2038,10 +2020,12 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const TargetRegisterClass *RegClass = CSI[i-1].getRegClass();
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
- if (RegClass != &X86::VR128RegClass) {
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitPrologue will handle spilling of frame register.
+ continue;
+ if (RegClass != &X86::VR128RegClass && !isWin64) {
CalleeFrameSize += SlotSize;
- BuildMI(MBB, MI, DL, get(Opc))
- .addReg(Reg, RegState::Kill);
+ BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill);
} else {
storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), RegClass);
}
@@ -2060,13 +2044,18 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
DebugLoc DL = DebugLoc::getUnknownLoc();
if (MI != MBB.end()) DL = MI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FPReg = RI.getFrameRegister(MF);
bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
-
+ bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64();
unsigned Opc = is64Bit ? X86::POP64r : X86::POP32r;
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
+ continue;
const TargetRegisterClass *RegClass = CSI[i].getRegClass();
- if (RegClass != &X86::VR128RegClass) {
+ if (RegClass != &X86::VR128RegClass && !isWin64) {
BuildMI(MBB, MI, DL, get(Opc), Reg);
} else {
loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass);
@@ -2143,8 +2132,9 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
MachineInstr*
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI, unsigned i,
- const SmallVectorImpl<MachineOperand> &MOs) const{
- const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Align) const {
+ const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
bool isTwoAddrFold = false;
unsigned NumOps = MI->getDesc().getNumOperands();
bool isTwoAddr = NumOps > 1 &&
@@ -2165,8 +2155,6 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
else if (MI->getOpcode() == X86::MOV32r0)
NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
- else if (MI->getOpcode() == X86::MOV64r0)
- NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
else if (MI->getOpcode() == X86::MOV8r0)
NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
if (NewMI)
@@ -2182,60 +2170,82 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// If table selected...
if (OpcodeTablePtr) {
// Find the Opcode to fuse
- DenseMap<unsigned*, unsigned>::iterator I =
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
OpcodeTablePtr->find((unsigned*)MI->getOpcode());
if (I != OpcodeTablePtr->end()) {
+ unsigned Opcode = I->second.first;
+ unsigned MinAlign = I->second.second;
+ if (Align < MinAlign)
+ return NULL;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ unsigned RCSize = MI->getDesc().OpInfo[i].getRegClass(&RI)->getSize();
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return NULL;
+        // If this is a 64-bit load, but the spill slot is only 32 bits wide,
+        // we can do a 32-bit load, which is implicitly zero-extended. This is
+        // likely due to live-interval analysis remat'ing a load from a stack
+        // slot.
+ if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+ return NULL;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
+
if (isTwoAddrFold)
- NewMI = FuseTwoAddrInst(MF, I->second, MOs, MI, *this);
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
else
- NewMI = FuseInst(MF, I->second, i, MOs, MI, *this);
+ NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+ if (NarrowToMOV32rm) {
+      // This is the special case where a MOV32rm is used to load a 32-bit
+      // value and zero-extend the top bits. Change the destination register
+      // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+ 4/*x86_subreg_32bit*/));
+ else
+ NewMI->getOperand(0).setSubReg(4/*x86_subreg_32bit*/);
+ }
return NewMI;
}
}
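+  // Background on the MOV64rm -> MOV32rm narrowing above: writing a 32-bit
+  // register on x86-64 implicitly zeroes bits 63:32 of the full register,
+  // e.g.
+  //   mov eax, dword ptr [rsp + 8]   ; also clears RAX[63:32]
+  // so reloading a 4-byte spill slot with MOV32rm yields a well-defined
+  // 64-bit value where MOV64rm would read past the end of the object.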
// No fusion
if (PrintFailedFusing)
- cerr << "We failed to fuse operand " << i << " in " << *MI;
+ errs() << "We failed to fuse operand " << i << " in " << *MI;
return NULL;
}
MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
+ const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const {
// Check switch flag
if (NoFusing) return NULL;
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Size = MFI->getObjectSize(FrameIndex);
unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
- // FIXME: Move alignment requirement into tables?
- if (Alignment < 16) {
- switch (MI->getOpcode()) {
- default: break;
- // Not always safe to fold movsd into these instructions since their load
- // folding variants expects the address to be 16 byte aligned.
- case X86::FsANDNPDrr:
- case X86::FsANDNPSrr:
- case X86::FsANDPDrr:
- case X86::FsANDPSrr:
- case X86::FsORPDrr:
- case X86::FsORPSrr:
- case X86::FsXORPDrr:
- case X86::FsXORPSrr:
- return NULL;
- }
- }
-
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
+ unsigned RCSize = 0;
switch (MI->getOpcode()) {
default: return NULL;
- case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
- case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
- case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
- case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri; RCSize = 2; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri; RCSize = 4; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri32; RCSize = 8; break;
}
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Size < RCSize)
+ return NULL;
// Change to CMPXXri r, 0 first.
MI->setDesc(get(NewOpc));
MI->getOperand(1).ChangeToImmediate(0);
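The rewrite above works because TEST r,r with identical operands is just a zero test, so once the register is replaced by its spill slot the same EFLAGS result can be produced by CMP [slot], 0. A minimal equivalence check (assumed helper names):

    #include <cassert>
    #include <cstdint>

    static bool testRegReg(uint32_t R) { return (R & R) == 0; }         // test r,r
    static bool cmpMemZero(const uint32_t *Slot) { return *Slot == 0; } // cmp [mem],0

    int main() {
      uint32_t V = 42, Z = 0;
      assert(testRegReg(V) == cmpMemZero(&V));
      assert(testRegReg(Z) == cmpMemZero(&Z));
    }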
@@ -2244,12 +2254,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
SmallVector<MachineOperand,4> MOs;
MOs.push_back(MachineOperand::CreateFI(FrameIndex));
- return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
}
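This hunk is what lets the hard-coded 16-byte alignment switch go away: the fold tables now pair each memory-form opcode with the minimum operand alignment required to use it. A standalone sketch of the table shape (std::map and the opcodes are illustrative stand-ins, not the real DenseMap tables):

    #include <map>
    #include <utility>

    enum Opc { ANDPSrr = 1, ANDPSrm, MOVSSrr, MOVSSrm };

    // register-form opcode -> (memory-form opcode, minimum alignment in bytes)
    static const std::map<unsigned, std::pair<unsigned, unsigned>> FoldTable = {
      {ANDPSrr, {ANDPSrm, 16}}, // SSE packed-op loads must be 16-byte aligned
      {MOVSSrr, {MOVSSrm, 0}},  // scalar moves fold at any alignment
    };

    // Returns 0 when the fold is illegal at this alignment.
    static unsigned memOpcodeFor(unsigned RegOpc, unsigned Align) {
      auto It = FoldTable.find(RegOpc);
      if (It == FoldTable.end() || Align < It->second.second)
        return 0;
      return It->second.first;
    }

    int main() { return memOpcodeFor(ANDPSrr, 8) == 0 ? 0 : 1; }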
MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
+ const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const {
// Check switch flag
if (NoFusing) return NULL;
@@ -2257,26 +2267,22 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Determine the alignment of the load.
unsigned Alignment = 0;
if (LoadMI->hasOneMemOperand())
- Alignment = LoadMI->memoperands_begin()->getAlignment();
-
- // FIXME: Move alignment requirement into tables?
- if (Alignment < 16) {
- switch (MI->getOpcode()) {
- default: break;
- // Not always safe to fold movsd into these instructions since their load
- // folding variants expects the address to be 16 byte aligned.
- case X86::FsANDNPDrr:
- case X86::FsANDNPSrr:
- case X86::FsANDPDrr:
- case X86::FsANDPSrr:
- case X86::FsORPDrr:
- case X86::FsORPSrr:
- case X86::FsXORPDrr:
- case X86::FsXORPSrr:
- return NULL;
+ Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+ else
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ Alignment = 16;
+ break;
+ case X86::FsFLD0SD:
+ Alignment = 8;
+ break;
+ case X86::FsFLD0SS:
+ Alignment = 4;
+ break;
+ default:
+ llvm_unreachable("Don't know how to fold this instruction!");
}
- }
-
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
switch (MI->getOpcode()) {
@@ -2293,28 +2299,40 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
SmallVector<MachineOperand,X86AddrNumOperands> MOs;
- if (LoadMI->getOpcode() == X86::V_SET0 ||
- LoadMI->getOpcode() == X86::V_SETALLONES) {
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- if (TM.getRelocationModel() == Reloc::PIC_ &&
- !TM.getSubtarget<X86Subtarget>().is64Bit())
- // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
- // This doesn't work for several reasons.
- // 1. GlobalBaseReg may have been spilled.
- // 2. It may not be live at MI.
- return false;
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return NULL;
+ }
- // Create a v4i32 constant-pool entry.
+ // Create a constant-pool entry.
MachineConstantPool &MCP = *MF.getConstantPool();
- const VectorType *Ty = VectorType::get(Type::Int32Ty, 4);
- Constant *C = LoadMI->getOpcode() == X86::V_SET0 ?
- ConstantVector::getNullValue(Ty) :
- ConstantVector::getAllOnesValue(Ty);
- unsigned CPI = MCP.getConstantPoolIndex(C, 16);
+ const Type *Ty;
+ if (LoadMI->getOpcode() == X86::FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ else if (LoadMI->getOpcode() == X86::FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ else
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+ Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
+ Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
// Create operands to load from the constant pool entry.
MOs.push_back(MachineOperand::CreateReg(PICBase, false));
@@ -2322,13 +2340,17 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MOs.push_back(MachineOperand::CreateReg(0, false));
MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
MOs.push_back(MachineOperand::CreateReg(0, false));
- } else {
+ break;
+ }
+ default: {
// Folding a normal load. Just copy the load's address operands.
unsigned NumOps = LoadMI->getDesc().getNumOperands();
for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
MOs.push_back(LoadMI->getOperand(i));
+ break;
+ }
}
- return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
}
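The five operands pushed into MOs above spell out the standard X86 memory reference [Base + Scale*Index + Disp] plus a segment slot; for these constant pseudos the base is the PIC base (RIP in 64-bit PIC mode) and the displacement names the constant-pool entry. A standalone sketch of that layout (struct and field names are illustrative):

    #include <cstdio>

    struct X86Address {
      const char *Base;   // PIC base: "RIP" for 64-bit PIC, else a vreg/noreg
      int Scale;          // always 1 for a plain constant-pool load
      const char *Index;  // no index register
      int CPI;            // displacement refers to the constant-pool entry
      const char *Segment;
    };

    int main() {
      X86Address A = {"RIP", 1, "noreg", /*CPI=*/0, "noreg"};
      std::printf("movss [%s + %d*%s + CP#%d]\n", A.Base, A.Scale, A.Index, A.CPI);
    }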
@@ -2360,15 +2382,14 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
- const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
} else if (OpNum == 0) { // If operand 0
switch (Opc) {
+ case X86::MOV8r0:
case X86::MOV16r0:
case X86::MOV32r0:
- case X86::MOV64r0:
- case X86::MOV8r0:
return true;
default: break;
}
@@ -2381,7 +2402,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
if (OpcodeTablePtr) {
// Find the Opcode to fuse
- DenseMap<unsigned*, unsigned>::iterator I =
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
OpcodeTablePtr->find((unsigned*)Opc);
if (I != OpcodeTablePtr->end())
return true;
@@ -2410,8 +2431,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const TargetInstrDesc &TID = get(Opc);
const TargetOperandInfo &TOI = TID.OpInfo[Index];
- const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
- ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ const TargetRegisterClass *RC = TOI.getRegClass(&RI);
SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
SmallVector<MachineOperand,2> BeforeOps;
SmallVector<MachineOperand,2> AfterOps;
@@ -2430,7 +2450,11 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the load instruction.
if (UnfoldLoad) {
- loadRegFromAddr(MF, Reg, AddrOps, RC, NewMIs);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
if (UnfoldStore) {
// Address operands cannot be marked isKill.
for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
@@ -2489,10 +2513,12 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the store instruction.
if (UnfoldStore) {
- const TargetOperandInfo &DstTOI = TID.OpInfo[0];
- const TargetRegisterClass *DstRC = DstTOI.isLookupPtrRegClass()
- ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
- storeRegToAddr(MF, Reg, true, AddrOps, DstRC, NewMIs);
+ const TargetRegisterClass *DstRC = TID.OpInfo[0].getRegClass(&RI);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second,
+                   NewMIs);
}
return true;
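extractLoadMemRefs and extractStoreMemRefs split the folded instruction's memory references back apart so that the unfolded load and store each carry only the aliasing information that belongs to them. A standalone sketch of the partitioning (simplified types):

    #include <utility>
    #include <vector>

    struct MemRef { bool IsLoad; int ID; };

    static std::pair<std::vector<MemRef>, std::vector<MemRef>>
    partitionMemRefs(const std::vector<MemRef> &All) {
      std::vector<MemRef> Loads, Stores;
      for (const MemRef &M : All)
        (M.IsLoad ? Loads : Stores).push_back(M);
      return {Loads, Stores};
    }

    int main() {
      std::vector<MemRef> All = {{true, 0}, {false, 1}};
      auto Split = partitionMemRefs(All);
      return Split.first.size() == 1 && Split.second.size() == 1 ? 0 : 1;
    }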
@@ -2513,9 +2539,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
bool FoldedLoad = I->second.second & (1 << 4);
bool FoldedStore = I->second.second & (1 << 5);
const TargetInstrDesc &TID = get(Opc);
- const TargetOperandInfo &TOI = TID.OpInfo[Index];
- const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
- ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ const TargetRegisterClass *RC = TID.OpInfo[Index].getRegClass(&RI);
unsigned NumDefs = TID.NumDefs;
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
@@ -2536,35 +2560,40 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// Emit the load instruction.
SDNode *Load = 0;
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
if (FoldedLoad) {
- MVT VT = *RC->vt_begin();
+ EVT VT = *RC->vt_begin();
bool isAligned = (RI.getStackAlignment() >= 16) ||
RI.needsStackRealignment(MF);
- Load = DAG.getTargetNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
- VT, MVT::Other, &AddrOps[0], AddrOps.size());
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ VT, MVT::Other, &AddrOps[0], AddrOps.size());
NewNodes.push_back(Load);
+
+ // Preserve memory reference information.
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
}
// Emit the data processing instruction.
- std::vector<MVT> VTs;
+ std::vector<EVT> VTs;
const TargetRegisterClass *DstRC = 0;
if (TID.getNumDefs() > 0) {
- const TargetOperandInfo &DstTOI = TID.OpInfo[0];
- DstRC = DstTOI.isLookupPtrRegClass()
- ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
+ DstRC = TID.OpInfo[0].getRegClass(&RI);
VTs.push_back(*DstRC->vt_begin());
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
- MVT VT = N->getValueType(i);
+ EVT VT = N->getValueType(i);
if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
VTs.push_back(VT);
}
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
- SDNode *NewNode= DAG.getTargetNode(Opc, dl, VTs, &BeforeOps[0],
- BeforeOps.size());
+ SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0],
+ BeforeOps.size());
NewNodes.push_back(NewNode);
// Emit the store instruction.
@@ -2574,11 +2603,18 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
AddrOps.push_back(Chain);
bool isAligned = (RI.getStackAlignment() >= 16) ||
RI.needsStackRealignment(MF);
- SDNode *Store = DAG.getTargetNode(getStoreRegOpcode(0, DstRC,
- isAligned, TM),
- dl, MVT::Other,
- &AddrOps[0], AddrOps.size());
+ SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
+ isAligned, TM),
+ dl, MVT::Other,
+ &AddrOps[0], AddrOps.size());
NewNodes.push_back(Store);
+
+ // Preserve memory reference information.
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
}
return true;
@@ -2644,7 +2680,7 @@ unsigned X86InstrInfo::sizeOfImm(const TargetInstrDesc *Desc) {
case X86II::Imm16: return 2;
case X86II::Imm32: return 4;
case X86II::Imm64: return 8;
- default: assert(0 && "Immediate size not set!");
+ default: llvm_unreachable("Immediate size not set!");
return 0;
}
}
@@ -2829,7 +2865,7 @@ static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) {
} else if (RelocOp->isJTI()) {
FinalSize += sizeJumpTableAddress(false);
} else {
- assert(0 && "Unknown value to relocate!");
+ llvm_unreachable("Unknown value to relocate!");
}
return FinalSize;
}
@@ -2926,7 +2962,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
case X86II::GS:
++FinalSize;
break;
- default: assert(0 && "Invalid segment!");
+ default: llvm_unreachable("Invalid segment!");
case 0: break; // No segment override!
}
@@ -2946,6 +2982,10 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
case X86II::TA: // 0F 3A
Need0FPrefix = true;
break;
+ case X86II::TF: // F2 0F 38
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
case X86II::REP: break; // already handled.
case X86II::XS: // F3 0F
++FinalSize;
@@ -2959,7 +2999,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
++FinalSize;
break; // Two-byte opcode prefix
- default: assert(0 && "Invalid prefix!");
+ default: llvm_unreachable("Invalid prefix!");
case 0: break; // No prefix!
}
@@ -2981,6 +3021,9 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
case X86II::TA: // 0F 3A
++FinalSize;
break;
+ case X86II::TF: // F2 0F 38
+ ++FinalSize;
+ break;
}
// If this is a two-address instruction, skip one of the register operands.
@@ -2993,7 +3036,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
--NumOps;
switch (Desc->TSFlags & X86II::FormMask) {
- default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
case X86II::Pseudo:
// Remember the current PC offset, this is the PIC relocation
// base address.
@@ -3002,16 +3045,16 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
break;
case TargetInstrInfo::INLINEASM: {
const MachineFunction *MF = MI.getParent()->getParent();
- const char *AsmStr = MI.getOperand(0).getSymbolName();
- const TargetAsmInfo* AI = MF->getTarget().getTargetAsmInfo();
- FinalSize += AI->getInlineAsmLength(AsmStr);
+ const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+ FinalSize += TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+ *MF->getTarget().getMCAsmInfo());
break;
}
case TargetInstrInfo::DBG_LABEL:
case TargetInstrInfo::EH_LABEL:
break;
case TargetInstrInfo::IMPLICIT_DEF:
- case TargetInstrInfo::DECLARE:
+ case TargetInstrInfo::KILL:
case X86::DWARF_LOC:
case X86::FP_REG_KILL:
break;
@@ -3038,7 +3081,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
} else if (MO.isImm()) {
FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
} else {
- assert(0 && "Unknown RawFrm operand!");
+ llvm_unreachable("Unknown RawFrm operand!");
}
}
break;
@@ -3196,10 +3239,10 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
}
if (!Desc->isVariadic() && CurOp != NumOps) {
- cerr << "Cannot determine size: ";
- MI.dump();
- cerr << '\n';
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Cannot determine size: " << MI;
+ llvm_report_error(Msg.str());
}
@@ -3209,7 +3252,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
const TargetInstrDesc &Desc = MI->getDesc();
- bool IsPIC = (TM.getRelocationModel() == Reloc::PIC_);
+ bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
if (Desc.getOpcode() == X86::MOVPC32r)
@@ -3245,12 +3288,11 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
// If we're using vanilla 'GOT' PIC style, we should use relative addressing
// not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
- if (TM.getRelocationModel() == Reloc::PIC_ &&
- TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+ if (TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
// Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
- .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", 0,
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
X86II::MO_GOT_ABSOLUTE_ADDRESS);
} else {
GlobalBaseReg = PC;
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 83f0194..2237c8b 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -69,35 +69,36 @@ namespace X86 {
/// instruction info tracks.
///
namespace X86II {
- enum {
+ /// Target Operand Flag enum.
+ enum TOF {
//===------------------------------------------------------------------===//
// X86 Specific MachineOperand flags.
- MO_NO_FLAG = 0,
+ MO_NO_FLAG,
/// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
/// relocation of:
/// SYMBOL_LABEL + [. - PICBASELABEL]
- MO_GOT_ABSOLUTE_ADDRESS = 1,
+ MO_GOT_ABSOLUTE_ADDRESS,
/// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
/// immediate should get the value of the symbol minus the PIC base label:
/// SYMBOL_LABEL - PICBASELABEL
- MO_PIC_BASE_OFFSET = 2,
+ MO_PIC_BASE_OFFSET,
/// MO_GOT - On a symbol operand this indicates that the immediate is the
/// offset to the GOT entry for the symbol name from the base of the GOT.
///
/// See the X86-64 ELF ABI supplement for more details.
/// SYMBOL_LABEL @GOT
- MO_GOT = 3,
+ MO_GOT,
/// MO_GOTOFF - On a symbol operand this indicates that the immediate is
/// the offset to the location of the symbol name from the base of the GOT.
///
/// See the X86-64 ELF ABI supplement for more details.
/// SYMBOL_LABEL @GOTOFF
- MO_GOTOFF = 4,
+ MO_GOTOFF,
/// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
/// offset to the GOT entry for the symbol name from the current code
@@ -105,50 +106,115 @@ namespace X86II {
///
/// See the X86-64 ELF ABI supplement for more details.
/// SYMBOL_LABEL @GOTPCREL
- MO_GOTPCREL = 5,
+ MO_GOTPCREL,
/// MO_PLT - On a symbol operand this indicates that the immediate is
/// offset to the PLT entry of symbol name from the current code location.
///
/// See the X86-64 ELF ABI supplement for more details.
/// SYMBOL_LABEL @PLT
- MO_PLT = 6,
+ MO_PLT,
/// MO_TLSGD - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TLSGD
- MO_TLSGD = 7,
+ MO_TLSGD,
/// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @GOTTPOFF
- MO_GOTTPOFF = 8,
+ MO_GOTTPOFF,
/// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @INDNTPOFF
- MO_INDNTPOFF = 9,
+ MO_INDNTPOFF,
/// MO_TPOFF - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TPOFF
- MO_TPOFF = 10,
+ MO_TPOFF,
/// MO_NTPOFF - On a symbol operand this indicates that the immediate is
/// some TLS offset.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @NTPOFF
- MO_NTPOFF = 11,
+ MO_NTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$stub" symbol. This is used for calls
+ /// and jumps to external functions on Tiger and before.
+ MO_DARWIN_STUB,
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+  /// indicates that the reference is actually to "FOO$non_lazy_ptr - PICBASE",
+ /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+ /// stub.
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE
+ };
+}
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+ return true;
+ default:
+ return false;
+ }
+}
+
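Taken together, the two predicates answer the two independent questions codegen asks about a flagged symbol operand: must the reference go through a stub that is itself loaded, and does the addressing mode fold in the PIC base register. A standalone client sketch (the flag subset is re-declared locally for illustration):

    #include <cstdio>

    enum TOF { MO_NO_FLAG, MO_PIC_BASE_OFFSET, MO_GOT, MO_DARWIN_NONLAZY_PIC_BASE };

    static bool isStubRef(unsigned F) {
      return F == MO_GOT || F == MO_DARWIN_NONLAZY_PIC_BASE;
    }
    static bool isRelToPICBase(unsigned F) {
      return F == MO_PIC_BASE_OFFSET || F == MO_GOT ||
             F == MO_DARWIN_NONLAZY_PIC_BASE;
    }

    int main() {
      unsigned F = MO_DARWIN_NONLAZY_PIC_BASE;
      // true/true: load FOO$non_lazy_ptr-PICBASE(%picreg), then load through it.
      std::printf("stub load: %d, add PIC base: %d\n", isStubRef(F), isRelToPICBase(F));
    }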
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ enum {
//===------------------------------------------------------------------===//
// Instruction encodings. These are the standard/most common forms for X86
// instructions.
@@ -249,6 +315,9 @@ namespace X86II {
// T8, TA - Prefix after the 0x0F prefix.
T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+
+ // TF - Prefix before and after 0x0F
+ TF = 15 << Op0Shift,
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
@@ -355,10 +424,10 @@ class X86InstrInfo : public TargetInstrInfoImpl {
/// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
/// RegOp2MemOpTable2 - Load / store folding opcode maps.
///
- DenseMap<unsigned*, unsigned> RegOp2MemOpTable2Addr;
- DenseMap<unsigned*, unsigned> RegOp2MemOpTable0;
- DenseMap<unsigned*, unsigned> RegOp2MemOpTable1;
- DenseMap<unsigned*, unsigned> RegOp2MemOpTable2;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2Addr;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable0;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable1;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2;
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
@@ -382,11 +451,11 @@ public:
unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
- bool isReallyTriviallyReMaterializable(const MachineInstr *MI) const;
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, const MachineInstr *Orig) const;
-
- bool isInvariantLoad(const MachineInstr *MI) const;
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
@@ -430,6 +499,8 @@ public:
virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -440,6 +511,8 @@ public:
virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
@@ -530,9 +603,10 @@ public:
private:
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- unsigned OpNum,
- const SmallVectorImpl<MachineOperand> &MOs) const;
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Alignment) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 03df10d..30b57d8 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -56,6 +56,10 @@ def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
def SDTX86RdTsc : SDTypeProfile<0, 0, []>;
@@ -114,6 +118,11 @@ def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInFlag]>;
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain]>;
+
def X86callseq_start :
SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
[SDNPHasChain, SDNPOutFlag]>;
@@ -124,9 +133,6 @@ def X86callseq_end :
def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
-def X86tailcall: SDNode<"X86ISD::TAILCALL", SDT_X86Call,
- [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
-
def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
[SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>;
def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
@@ -156,6 +162,9 @@ def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags>;
def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags>;
def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -167,57 +176,80 @@ def i32imm_pcrel : Operand<i32> {
let PrintMethod = "print_pcrel_imm";
}
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
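The restriction ptr_rc_nosp encodes comes straight from the ModR/M+SIB format: index field value 100b means "no index", so (E/R)SP can serve as a base register but never as the scaled index. A sketch of the check in isolation (register names only):

    #include <cstdio>
    #include <string>

    static bool canBeIndexReg(const std::string &Reg) {
      // SIB index encoding 100b is reserved for "no index".
      return Reg != "SP" && Reg != "ESP" && Reg != "RSP";
    }

    int main() {
      std::printf("ESP as index: %d, EBX as index: %d\n",
                  canBeIndexReg("ESP"), canBeIndexReg("EBX"));
    }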
// *mem - Operand definitions for the funky X86 addressing mode operands.
//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+ let SuperClass = ?;
+}
class X86MemOperand<string printMethod> : Operand<iPTR> {
let PrintMethod = printMethod;
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+
def i8mem : X86MemOperand<"printi8mem">;
def i16mem : X86MemOperand<"printi16mem">;
def i32mem : X86MemOperand<"printi32mem">;
def i64mem : X86MemOperand<"printi64mem">;
def i128mem : X86MemOperand<"printi128mem">;
-def i256mem : X86MemOperand<"printi256mem">;
+//def i256mem : X86MemOperand<"printi256mem">;
def f32mem : X86MemOperand<"printf32mem">;
def f64mem : X86MemOperand<"printf64mem">;
def f80mem : X86MemOperand<"printf80mem">;
def f128mem : X86MemOperand<"printf128mem">;
-def f256mem : X86MemOperand<"printf256mem">;
+//def f256mem : X86MemOperand<"printf256mem">;
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
// plain GR64, so that it doesn't potentially require a REX prefix.
def i8mem_NOREX : Operand<i64> {
let PrintMethod = "printi8mem";
- let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX, i32imm, i8imm);
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
def lea32mem : Operand<i32> {
let PrintMethod = "printlea32mem";
- let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+ let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
def SSECC : Operand<i8> {
let PrintMethod = "printSSECC";
}
-def piclabel: Operand<i32> {
- let PrintMethod = "printPICLabel";
+def ImmSExt8AsmOperand : AsmOperandClass {
+ let Name = "ImmSExt8";
+ let SuperClass = ImmAsmOperand;
}
// A couple of more descriptive operand definitions.
// 16-bits but only 8 bits are significant.
-def i16i8imm : Operand<i16>;
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExt8AsmOperand;
+}
// 32-bits but only 8 bits are significant.
-def i32i8imm : Operand<i32>;
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExt8AsmOperand;
+}
// Branch targets have OtherVT type and print as pc-relative values.
def brtarget : Operand<OtherVT> {
let PrintMethod = "print_pcrel_imm";
}
+def brtarget8 : Operand<OtherVT> {
+ let PrintMethod = "print_pcrel_imm";
+}
+
//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
//
@@ -225,7 +257,8 @@ def brtarget : Operand<OtherVT> {
// Define X86 specific addressing mode.
def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
- [add, sub, mul, shl, or, frameindex], []>;
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
def tls32addr : ComplexPattern<i32, 4, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
@@ -246,8 +279,14 @@ def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
-def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
@@ -484,15 +523,35 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
Requires<[In32BitMode]>;
}
+// x86-64 va_start lowering magic.
+let usesCustomDAGSchedInserter = 1 in
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i64imm:$regsavefi, i64imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ imm:$regsavefi,
+ imm:$offset)]>;
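At the source level this pseudo backs the x86-64 ABI rule that va_start must spill the incoming XMM argument registers into the register save area, guarded by AL (which the caller sets to the number of vector registers actually used). A small illustration of the construct it enables (hypothetical example, not generated code):

    #include <cstdarg>
    #include <cstdio>

    static double sum(int N, ...) {
      va_list AP;
      va_start(AP, N); // on x86-64, where the XMM save-area spill happens
      double S = 0;
      for (int I = 0; I < N; ++I)
        S += va_arg(AP, double);
      va_end(AP);
      return S;
    }

    int main() { std::printf("%g\n", sum(3, 1.0, 2.0, 3.0)); }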
+
// Nop
-let neverHasSideEffects = 1 in
+let neverHasSideEffects = 1 in {
def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+ def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+ "nopl\t$zero", []>, TB;
+}
-// PIC base
+// Trap
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", []>;
+def INT : I<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", []>;
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+//      popl %destreg
let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
- def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins piclabel:$label),
- "call\t$label\n\t"
- "pop{l}\t$reg", []>;
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
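The call/pop pair the comment describes is the classic 32-bit PIC-base idiom: CALL pushes the address of the next instruction, which POP then captures in a register. A standalone GNU-inline-asm illustration (meaningful on 32-bit x86 only, guarded accordingly):

    #include <cstdint>
    #include <cstdio>

    int main() {
    #if defined(__i386__)
      uint32_t Pic;
      __asm__("call 1f\n\t" // pushes the address of label 1
              "1: popl %0"  // Pic = that address, i.e. the PIC base
              : "=r"(Pic));
      std::printf("PIC base: %#x\n", Pic);
    #else
      std::puts("(illustration is for 32-bit x86 only)");
    #endif
    }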
//===----------------------------------------------------------------------===//
// Control Flow Instructions...
@@ -506,7 +565,11 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
[(X86retflag 0)]>;
def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret\t$amt",
- [(X86retflag imm:$amt)]>;
+ [(X86retflag timm:$amt)]>;
+ def LRET : I <0xCB, RawFrm, (outs), (ins),
+ "lret", []>;
+ def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "lret\t$amt", []>;
}
// All branches are RawFrm, Void, Branch, and Terminators
@@ -514,8 +577,10 @@ let isBranch = 1, isTerminator = 1 in
class IBr<bits<8> opcode, dag ins, string asm, list<dag> pattern> :
I<opcode, RawFrm, (outs), ins, asm, pattern>;
-let isBranch = 1, isBarrier = 1 in
+let isBranch = 1, isBarrier = 1 in {
def JMP : IBr<0xE9, (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)]>;
+ def JMP8 : IBr<0xEB, (ins brtarget8:$dst), "jmp\t$dst", []>;
+}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
@@ -523,10 +588,42 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
[(brind GR32:$dst)]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
[(brind (loadi32 addr:$dst))]>;
+
+ def FARJMP16i : Iseg16<0xEA, RawFrm, (outs),
+ (ins i16imm:$seg, i16imm:$off),
+ "ljmp{w}\t$seg, $off", []>, OpSize;
+ def FARJMP32i : Iseg32<0xEA, RawFrm, (outs),
+ (ins i16imm:$seg, i32imm:$off),
+ "ljmp{l}\t$seg, $off", []>;
+
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+ "ljmp{l}\t{*}$dst", []>;
}
// Conditional branches
let Uses = [EFLAGS] in {
+// Short conditional jumps
+def JO8 : IBr<0x70, (ins brtarget8:$dst), "jo\t$dst", []>;
+def JNO8 : IBr<0x71, (ins brtarget8:$dst), "jno\t$dst", []>;
+def JB8 : IBr<0x72, (ins brtarget8:$dst), "jb\t$dst", []>;
+def JAE8 : IBr<0x73, (ins brtarget8:$dst), "jae\t$dst", []>;
+def JE8 : IBr<0x74, (ins brtarget8:$dst), "je\t$dst", []>;
+def JNE8 : IBr<0x75, (ins brtarget8:$dst), "jne\t$dst", []>;
+def JBE8 : IBr<0x76, (ins brtarget8:$dst), "jbe\t$dst", []>;
+def JA8 : IBr<0x77, (ins brtarget8:$dst), "ja\t$dst", []>;
+def JS8 : IBr<0x78, (ins brtarget8:$dst), "js\t$dst", []>;
+def JNS8 : IBr<0x79, (ins brtarget8:$dst), "jns\t$dst", []>;
+def JP8 : IBr<0x7A, (ins brtarget8:$dst), "jp\t$dst", []>;
+def JNP8 : IBr<0x7B, (ins brtarget8:$dst), "jnp\t$dst", []>;
+def JL8 : IBr<0x7C, (ins brtarget8:$dst), "jl\t$dst", []>;
+def JGE8 : IBr<0x7D, (ins brtarget8:$dst), "jge\t$dst", []>;
+def JLE8 : IBr<0x7E, (ins brtarget8:$dst), "jle\t$dst", []>;
+def JG8 : IBr<0x7F, (ins brtarget8:$dst), "jg\t$dst", []>;
+
+def JCXZ8 : IBr<0xE3, (ins brtarget8:$dst), "jcxz\t$dst", []>;
+
def JE : IBr<0x84, (ins brtarget:$dst), "je\t$dst",
[(X86brcond bb:$dst, X86_COND_E, EFLAGS)]>, TB;
def JNE : IBr<0x85, (ins brtarget:$dst), "jne\t$dst",
@@ -563,6 +660,12 @@ def JNO : IBr<0x81, (ins brtarget:$dst), "jno\t$dst",
[(X86brcond bb:$dst, X86_COND_NO, EFLAGS)]>, TB;
} // Uses = [EFLAGS]
+// Loop instructions
+
+def LOOP   : I<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE  : I<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : I<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+
//===----------------------------------------------------------------------===//
// Call Instructions...
//
@@ -583,13 +686,26 @@ let isCall = 1 in
"call\t{*}$dst", [(X86call GR32:$dst)]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
"call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
+
+ def FARCALL16i : Iseg16<0x9A, RawFrm, (outs),
+ (ins i16imm:$seg, i16imm:$off),
+ "lcall{w}\t$seg, $off", []>, OpSize;
+ def FARCALL32i : Iseg32<0x9A, RawFrm, (outs),
+ (ins i16imm:$seg, i32imm:$off),
+ "lcall{l}\t$seg, $off", []>;
+
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+ "lcall{l}\t{*}$dst", []>;
}
-// Tail call stuff.
+// Constructing a stack frame.
+
+def ENTER : I<0xC8, RawFrm, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", []>;
-def TAILCALL : I<0, Pseudo, (outs), (ins),
- "#TAILCALL",
- []>;
+// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
def TCRETURNdi : I<0, Pseudo, (outs), (ins i32imm:$dst, i32imm:$offset, variable_ops),
@@ -620,11 +736,29 @@ def LEAVE : I<0xC9, RawFrm,
(outs), (ins), "leave", []>;
let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
-let mayLoad = 1 in
-def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+let mayLoad = 1 in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>,
+ OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>;
+}
-let mayStore = 1 in
+let mayStore = 1 in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize;
def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>,
+ OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>;
+}
}
let Defs = [ESP], Uses = [ESP], neverHasSideEffects = 1, mayStore = 1 in {
@@ -710,6 +844,14 @@ let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI] in
def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
[(X86rep_stos i32)]>, REP;
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scas{b}", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scas{w}", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l}", []>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmps{b}", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmps{w}", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l}", []>;
+
let Defs = [RAX, RDX] in
def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>,
TB;
@@ -718,6 +860,18 @@ let isBarrier = 1, hasCtrlDep = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
}
+def SYSCALL : I<0x05, RawFrm,
+ (outs), (ins), "syscall", []>, TB;
+def SYSRET : I<0x07, RawFrm,
+ (outs), (ins), "sysret", []>, TB;
+def SYSENTER : I<0x34, RawFrm,
+ (outs), (ins), "sysenter", []>, TB;
+def SYSEXIT : I<0x35, RawFrm,
+ (outs), (ins), "sysexit", []>, TB;
+
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+
+
//===----------------------------------------------------------------------===//
// Input/Output Instructions...
//
@@ -793,6 +947,30 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(store (i32 imm:$src), addr:$dst)]>;
+def MOV8o8a : Ii8 <0xA0, RawFrm, (outs), (ins i8imm:$src),
+ "mov{b}\t{$src, %al|%al, $src}", []>;
+def MOV16o16a : Ii16 <0xA1, RawFrm, (outs), (ins i16imm:$src),
+ "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins i32imm:$src),
+ "mov{l}\t{$src, %eax|%eax, $src}", []>;
+
+def MOV8ao8 : Ii8 <0xA2, RawFrm, (outs i8imm:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, %al}", []>;
+def MOV16ao16 : Ii16 <0xA3, RawFrm, (outs i16imm:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs i32imm:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, %eax}", []>;
+
+// Moves to and from segment registers
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>;
+
let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
@@ -950,6 +1128,20 @@ let isTwoAddress = 1 in {
// Conditional moves
let Uses = [EFLAGS] in {
+
+// X86 doesn't have 8-bit conditional moves. Use a customDAGSchedInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote;
+// however, that requires promoting the operands and can induce additional
+// i8 register pressure. Note that CMOV_GR8 is conservatively considered to
+// clobber EFLAGS, because if one of the operands is zero, the expansion
+// could involve an xor.
+let usesCustomDAGSchedInserter = 1, isTwoAddress = 0, Defs = [EFLAGS] in
+def CMOV_GR8 : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+ "#CMOV_GR8 PSEUDO!",
+ [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+ imm:$cond, EFLAGS))]>;
+
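The custom inserter expands each CMOV_GR8 into a small diamond of real control flow. A C-level sketch of the expansion (the branch stands in for the missing 8-bit cmov):

    #include <cstdint>
    #include <cstdio>

    static uint8_t cmov8(bool Cond, uint8_t A, uint8_t B) {
      // Emitted as a conditional branch around a copy, not a real cmov.
      return Cond ? A : B;
    }

    int main() { std::printf("%u\n", (unsigned)cmov8(true, 7, 9)); }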
let isCommutable = 1 in {
def CMOVB16rr : I<0x42, MRMSrcReg, // if <u, GR16 = GR16
(outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
@@ -1549,6 +1741,14 @@ let isTwoAddress = 0 in {
"and{l}\t{$src, $dst|$dst, $src}",
[(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst),
(implicit EFLAGS)]>;
+
+ def AND8i8 : Ii8<0x24, RawFrm, (outs), (ins i8imm:$src),
+ "and{b}\t{$src, %al|%al, $src}", []>;
+ def AND16i16 : Ii16<0x25, RawFrm, (outs), (ins i16imm:$src),
+ "and{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def AND32i32 : Ii32<0x25, RawFrm, (outs), (ins i32imm:$src),
+ "and{l}\t{$src, %eax|%eax, $src}", []>;
+
}
@@ -1635,6 +1835,13 @@ let isTwoAddress = 0 in {
"or{l}\t{$src, $dst|$dst, $src}",
[(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst),
(implicit EFLAGS)]>;
+
+ def OR8i8 : Ii8 <0x0C, RawFrm, (outs), (ins i8imm:$src),
+ "or{b}\t{$src, %al|%al, $src}", []>;
+ def OR16i16 : Ii16 <0x0D, RawFrm, (outs), (ins i16imm:$src),
+ "or{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def OR32i32 : Ii32 <0x0D, RawFrm, (outs), (ins i32imm:$src),
+ "or{l}\t{$src, %eax|%eax, $src}", []>;
} // isTwoAddress = 0
@@ -1744,6 +1951,13 @@ let isTwoAddress = 0 in {
"xor{l}\t{$src, $dst|$dst, $src}",
[(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst),
(implicit EFLAGS)]>;
+
+ def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src),
+ "xor{b}\t{$src, %al|%al, $src}", []>;
+ def XOR16i16 : Ii16 <0x35, RawFrm, (outs), (ins i16imm:$src),
+ "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def XOR32i32 : Ii32 <0x35, RawFrm, (outs), (ins i32imm:$src),
+ "xor{l}\t{$src, %eax|%eax, $src}", []>;
} // isTwoAddress = 0
} // Defs = [EFLAGS]
@@ -1771,8 +1985,17 @@ def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
-// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
-// cheaper.
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", []>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", []>, OpSize;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", []>;
+
} // isConvertibleToThreeAddress = 1
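The note above is a strength-reduction fact worth spelling out: shifting left by one and adding a value to itself are the same operation, and the add form encodes at least as cheaply. A one-line check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X : {0u, 1u, 12345u, 0x80000000u})
        assert((X << 1) == X + X); // why isel prefers 'add reg,reg' over 'shl reg,1'
    }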
let isTwoAddress = 0 in {
@@ -1951,6 +2174,97 @@ let isTwoAddress = 0 in {
}
// Rotate instructions
+
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+ "rcl{b}\t{1, $dst|$dst, 1}", []>;
+def RCL8m1 : I<0xD0, MRM2m, (outs i8mem:$dst), (ins i8mem:$src),
+ "rcl{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCL8mCL : I<0xD2, MRM2m, (outs i8mem:$dst), (ins i8mem:$src),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs i8mem:$dst), (ins i8mem:$src, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+ "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCL16m1 : I<0xD1, MRM2m, (outs i16mem:$dst), (ins i16mem:$src),
+ "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCL16mCL : I<0xD3, MRM2m, (outs i16mem:$dst), (ins i16mem:$src),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs i16mem:$dst), (ins i16mem:$src, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+ "rcl{l}\t{1, $dst|$dst, 1}", []>;
+def RCL32m1 : I<0xD1, MRM2m, (outs i32mem:$dst), (ins i32mem:$src),
+ "rcl{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+def RCL32mCL : I<0xD3, MRM2m, (outs i32mem:$dst), (ins i32mem:$src),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs i32mem:$dst), (ins i32mem:$src, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+ "rcr{b}\t{1, $dst|$dst, 1}", []>;
+def RCR8m1 : I<0xD0, MRM3m, (outs i8mem:$dst), (ins i8mem:$src),
+ "rcr{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCR8mCL : I<0xD2, MRM3m, (outs i8mem:$dst), (ins i8mem:$src),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs i8mem:$dst), (ins i8mem:$src, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+ "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCR16m1 : I<0xD1, MRM3m, (outs i16mem:$dst), (ins i16mem:$src),
+ "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCR16mCL : I<0xD3, MRM3m, (outs i16mem:$dst), (ins i16mem:$src),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs i16mem:$dst), (ins i16mem:$src, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+ "rcr{l}\t{1, $dst|$dst, 1}", []>;
+def RCR32m1 : I<0xD1, MRM3m, (outs i32mem:$dst), (ins i32mem:$src),
+ "rcr{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+def RCR32mCL : I<0xD3, MRM3m, (outs i32mem:$dst), (ins i32mem:$src),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs i32mem:$dst), (ins i32mem:$src, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
// FIXME: provide shorter instructions when imm8 == 1
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
@@ -2228,6 +2542,15 @@ def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst),
"add{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (add GR32:$src1, (load addr:$src2))),
(implicit EFLAGS)]>;
+
+// Register-Register Addition - Equivalent to the normal rr forms (ADD8rr,
+// ADD16rr, and ADD32rr), but differently encoded.
+def ADD8mrmrr: I<0x02, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>;
+def ADD16mrmrr: I<0x03, MRMSrcReg,(outs GR16:$dst),(ins GR16:$src1, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def ADD32mrmrr: I<0x03, MRMSrcReg,(outs GR32:$dst),(ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>;
// Register-Integer Addition
def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
@@ -2295,6 +2618,14 @@ let isTwoAddress = 0 in {
[(store (add (load addr:$dst), i32immSExt8:$src2),
addr:$dst),
(implicit EFLAGS)]>;
+
+ // addition to rAX
+ def ADD8i8 : Ii8<0x04, RawFrm, (outs), (ins i8imm:$src),
+ "add{b}\t{$src, %al|%al, $src}", []>;
+ def ADD16i16 : Ii16<0x05, RawFrm, (outs), (ins i16imm:$src),
+ "add{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def ADD32i32 : Ii32<0x05, RawFrm, (outs), (ins i32imm:$src),
+ "add{l}\t{$src, %eax|%eax, $src}", []>;
}
let Uses = [EFLAGS] in {
@@ -2373,6 +2704,13 @@ let isTwoAddress = 0 in {
def ADC32mi8 : Ii8<0x83, MRM2m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
"adc{l}\t{$src2, $dst|$dst, $src2}",
[(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+
+ def ADC8i8 : Ii8<0x14, RawFrm, (outs), (ins i8imm:$src),
+ "adc{b}\t{$src, %al|%al, $src}", []>;
+ def ADC16i16 : Ii16<0x15, RawFrm, (outs), (ins i16imm:$src),
+ "adc{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def ADC32i32 : Ii32<0x15, RawFrm, (outs), (ins i32imm:$src),
+ "adc{l}\t{$src, %eax|%eax, $src}", []>;
}
} // Uses = [EFLAGS]
@@ -2472,6 +2810,13 @@ let isTwoAddress = 0 in {
[(store (sub (load addr:$dst), i32immSExt8:$src2),
addr:$dst),
(implicit EFLAGS)]>;
+
+ def SUB8i8 : Ii8<0x2C, RawFrm, (outs), (ins i8imm:$src),
+ "sub{b}\t{$src, %al|%al, $src}", []>;
+ def SUB16i16 : Ii16<0x2D, RawFrm, (outs), (ins i16imm:$src),
+ "sub{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def SUB32i32 : Ii32<0x2D, RawFrm, (outs), (ins i32imm:$src),
+ "sub{l}\t{$src, %eax|%eax, $src}", []>;
}
let Uses = [EFLAGS] in {
@@ -2516,6 +2861,13 @@ let isTwoAddress = 0 in {
def SBB32mi8 : Ii8<0x83, MRM3m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
"sbb{l}\t{$src2, $dst|$dst, $src2}",
[(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+
+ def SBB8i8 : Ii8<0x1C, RawFrm, (outs), (ins i8imm:$src),
+ "sbb{b}\t{$src, %al|%al, $src}", []>;
+ def SBB16i16 : Ii16<0x1D, RawFrm, (outs), (ins i16imm:$src),
+ "sbb{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def SBB32i32 : Ii32<0x1D, RawFrm, (outs), (ins i32imm:$src),
+ "sbb{l}\t{$src, %eax|%eax, $src}", []>;
}
def SBB8rm : I<0x1A, MRMSrcMem, (outs GR8:$dst), (ins GR8:$src1, i8mem:$src2),
"sbb{b}\t{$src2, $dst|$dst, $src2}",
@@ -2647,6 +2999,13 @@ def TEST32rr : I<0x85, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
(implicit EFLAGS)]>;
}
+def TEST8i8 : Ii8<0xA8, RawFrm, (outs), (ins i8imm:$src),
+ "test{b}\t{$src, %al|%al, $src}", []>;
+def TEST16i16 : Ii16<0xA9, RawFrm, (outs), (ins i16imm:$src),
+ "test{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def TEST32i32 : Ii32<0xA9, RawFrm, (outs), (ins i32imm:$src),
+ "test{l}\t{$src, %eax|%eax, $src}", []>;
+
def TEST8rm : I<0x84, MRMSrcMem, (outs), (ins GR8 :$src1, i8mem :$src2),
"test{b}\t{$src2, $src1|$src1, $src2}",
[(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0),
@@ -2878,6 +3237,13 @@ def SETNOm : I<0x91, MRM0m,
// Integer comparisons
let Defs = [EFLAGS] in {
+def CMP8i8 : Ii8<0x3C, RawFrm, (outs), (ins i8imm:$src),
+ "cmp{b}\t{$src, %al|%al, $src}", []>;
+def CMP16i16 : Ii16<0x3D, RawFrm, (outs), (ins i16imm:$src),
+ "cmp{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def CMP32i32 : Ii32<0x3D, RawFrm, (outs), (ins i32imm:$src),
+ "cmp{l}\t{$src, %eax|%eax, $src}", []>;
+
def CMP8rr : I<0x38, MRMDestReg,
(outs), (ins GR8 :$src1, GR8 :$src2),
"cmp{b}\t{$src2, $src1|$src1, $src2}",
@@ -2920,6 +3286,12 @@ def CMP32rm : I<0x3B, MRMSrcMem,
"cmp{l}\t{$src2, $src1|$src1, $src2}",
[(X86cmp GR32:$src1, (loadi32 addr:$src2)),
(implicit EFLAGS)]>;
+def CMP8mrmrr : I<0x3A, MRMSrcReg, (outs), (ins GR8:$src1, GR8:$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}", []>;
+def CMP16mrmrr : I<0x3B, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize;
+def CMP32mrmrr : I<0x3B, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}", []>;
def CMP8ri : Ii8<0x80, MRM7r,
(outs), (ins GR8:$src1, i8imm:$src2),
"cmp{b}\t{$src2, $src1|$src1, $src2}",
@@ -3095,7 +3467,8 @@ let neverHasSideEffects = 1 in {
// Alias instructions that map movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
-let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isCodeGenOnly = 1 in {
def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins),
"xor{b}\t$dst, $dst",
[(set GR8:$dst, 0)]>;
@@ -3127,12 +3500,12 @@ def TLS_addr32 : I<0, Pseudo, (outs), (ins lea32mem:$sym),
[(X86tlsaddr tls32addr:$sym)]>,
Requires<[In32BitMode]>;
-let AddedComplexity = 5 in
+let AddedComplexity = 5, isCodeGenOnly = 1 in
def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movl\t%gs:$src, $dst",
[(set GR32:$dst, (gsload addr:$src))]>, SegGS;
-let AddedComplexity = 5 in
+let AddedComplexity = 5, isCodeGenOnly = 1 in
def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movl\t%fs:$src, $dst",
[(set GR32:$dst, (fsload addr:$src))]>, SegFS;
@@ -3143,7 +3516,7 @@ def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
def DWARF_LOC : I<0, Pseudo, (outs),
(ins i32imm:$line, i32imm:$col, i32imm:$file),
- ".loc\t${file:debug} ${line:debug} ${col:debug}",
+ ".loc\t$file $line $col",
[(dwarf_loc (i32 imm:$line), (i32 imm:$col),
(i32 imm:$file))]>;
@@ -3151,7 +3524,7 @@ def DWARF_LOC : I<0, Pseudo, (outs),
// EH Pseudo Instructions
//
let isTerminator = 1, isReturn = 1, isBarrier = 1,
- hasCtrlDep = 1 in {
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
"ret\t#eh_return, addr: $addr",
[(X86ehret GR32:$addr)]>;
@@ -3223,6 +3596,78 @@ def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
TB, LOCK;
}
+// Optimized codegen when the non-memory output is not used.
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+def LOCK_ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "lock\n\t"
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+ "lock\n\t"
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
+ "lock\n\t"
+ "inc{b}\t$dst", []>, LOCK;
+def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
+ "lock\n\t"
+ "inc{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
+ "lock\n\t"
+ "inc{l}\t$dst", []>, LOCK;
+
+def LOCK_SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+ "lock\n\t"
+ "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
+ "lock\n\t"
+ "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
+ "lock\n\t"
+ "dec{b}\t$dst", []>, LOCK;
+def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
+ "lock\n\t"
+ "dec{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
+ "lock\n\t"
+ "dec{l}\t$dst", []>, LOCK;
+
// Atomic exchange, and, or, xor
let Constraints = "$val = $dst", Defs = [EFLAGS],
usesCustomDAGSchedInserter = 1 in {
@@ -3318,6 +3763,25 @@ def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
"#ATOMSWAP6432 PSEUDO!", []>;
}
+// Segmentation support instructions.
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+
+// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are
+// not typos.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
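// lar loads the access-rights byte of a segment descriptor. The source
// is always a 16-bit selector, which is why the 32-bit form still takes
// i16mem, and the register form only reads the low 16 bits of its
// source register.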
+
+// String manipulation instructions
+
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lodsd", []>;
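// lodsb/lodsw/lodsd load from ds:[esi] into al/ax/eax and advance esi.
// They carry no patterns; they appear to exist only so the assembler
// can match the mnemonics.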
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
@@ -3345,14 +3809,6 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
// Calls
// tailcall stuff
-def : Pat<(X86tailcall GR32:$dst),
- (TAILCALL)>;
-
-def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
- (TAILCALL)>;
-def : Pat<(X86tailcall (i32 texternalsym:$dst)),
- (TAILCALL)>;
-
def : Pat<(X86tcret GR32:$dst, imm:$off),
(TCRETURNri GR32:$dst, imm:$off)>;
@@ -3362,6 +3818,7 @@ def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
(TCRETURNdi texternalsym:$dst, imm:$off)>;
+// Normal calls, with various flavors of addresses.
def : Pat<(X86call (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i32 texternalsym:$dst)),
@@ -3472,21 +3929,17 @@ def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
// extload bool -> extload byte
def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
-def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>,
- Requires<[In32BitMode]>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>,
- Requires<[In32BitMode]>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
-// anyext
-def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>,
- Requires<[In32BitMode]>;
-def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>,
- Requires<[In32BitMode]>;
-def : Pat<(i32 (anyext GR16:$src)),
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
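// Writing only the low 8 or 16 bits of a register leaves the upper bits
// live, creating a false dependency (a partial-register stall on many
// x86 cores). movzx writes the full 32-bit register and breaks that
// dependency, so it is the preferred expansion even for anyext.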
// (and (i32 load), 255) -> (zextload i8)
def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))),
@@ -3567,6 +4020,10 @@ def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
(MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
x86_subreg_8bit_hi))>,
Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In32BitMode]>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
(MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
x86_subreg_8bit_hi))>,
@@ -3961,6 +4418,243 @@ def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
(implicit EFLAGS)),
(DEC32m addr:$dst)>, Requires<[In32BitMode]>;
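// The *_flag nodes below arise when a compare against zero has been
// folded into the logical operation itself, so the EFLAGS produced by
// the instruction are consumed directly; one pattern is needed per
// operand combination.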
+// Register-Register Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (OR8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (OR32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (OR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Or with EFLAGS result
+def : Pat<(parallel (store (X86or_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Or with EFLAGS result
+def : Pat<(parallel (store (X86or_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (OR32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (XOR8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (XOR32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register XOr with EFLAGS result
+def : Pat<(parallel (store (X86xor_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer XOr with EFLAGS result
+def : Pat<(parallel (store (X86xor_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (XOR32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (AND8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (AND32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register And with EFLAGS result
+def : Pat<(parallel (store (X86and_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer And with EFLAGS result
+def : Pat<(parallel (store (X86and_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (AND32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// -disable-16bit support: patterns that implement 16-bit loads and
+// stores through 32-bit registers.
+def : Pat<(truncstorei16 (i32 imm:$src), addr:$dst),
+ (MOV16mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei16 GR32:$src, addr:$dst),
+ (MOV16mr addr:$dst, (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+def : Pat<(i32 (sextloadi16 addr:$dst)),
+ (MOVSX32rm16 addr:$dst)>;
+def : Pat<(i32 (zextloadi16 addr:$dst)),
+ (MOVZX32rm16 addr:$dst)>;
+def : Pat<(i32 (extloadi16 addr:$dst)),
+ (MOVZX32rm16 addr:$dst)>;
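// With 16-bit operations disabled, i16 values live in 32-bit registers,
// but 16-bit memory accesses must remain expressible: stores truncate
// from GR32 and loads extend into GR32.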
+
//===----------------------------------------------------------------------===//
// Floating Point Stack Support
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index b79a006..ce76b4e 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -1,10 +1,10 @@
//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 MMX instruction set, defining the instructions,
@@ -67,16 +67,18 @@ def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
// MMX Multiclasses
//===----------------------------------------------------------------------===//
-let isTwoAddress = 1 in {
+let Constraints = "$src1 = $dst" in {
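// isTwoAddress is being retired in these files in favor of an explicit
// operand constraint; the two spellings mean the same thing, roughly
// (illustrative shape only):
//   let Constraints = "$src1 = $dst" in
//   def rr : ...;  // $src1 and $dst must get the same register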
// MMXI_binop_rm - Simple MMX binary operator.
multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, bit Commutable = 0> {
- def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
let isCommutable = Commutable;
}
- def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (OpVT (OpNode VR64:$src1,
(bitconvert
@@ -85,12 +87,14 @@ let isTwoAddress = 1 in {
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
bit Commutable = 0> {
- def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
let isCommutable = Commutable;
}
- def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
(bitconvert (load_mmx addr:$src2))))]>;
@@ -139,8 +143,10 @@ let isTwoAddress = 1 in {
// MMX EMMS & FEMMS Instructions
//===----------------------------------------------------------------------===//
-def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
-def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+ [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
//===----------------------------------------------------------------------===//
// MMX Scalar Instructions
@@ -149,12 +155,14 @@ def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]
// Data Transfer Instructions
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (v2i32 (scalar_to_vector GR32:$src)))]>;
+ [(set VR64:$dst,
+ (v2i32 (scalar_to_vector GR32:$src)))]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
-let mayStore = 1 in
+ [(set VR64:$dst,
+ (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
"movd\t{$src, $dst|$dst, $src}", []>;
@@ -164,9 +172,16 @@ def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
[]>;
let neverHasSideEffects = 1 in
-def MMX_MOVD64from64rr : MMXRI<0x7E, MRMSrcReg,
+// These are 64-bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as movd.
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}", []>;
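// MRMDestReg is the right format for opcode 0x7E here: the MMX register
// goes in the ModRM reg field and the GPR is the r/m destination.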
+def MMX_MOVD64rrv164 : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (scalar_to_vector GR64:$src)))]>;
let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
@@ -179,21 +194,21 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (v1i64 VR64:$src), addr:$dst)]>;
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMDestMem, (outs VR64:$dst), (ins VR128:$src),
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
"movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(v1i64 (bitconvert
(i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))))))]>;
-def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMDestMem, (outs VR128:$dst), (ins VR64:$src),
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
"movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(movl immAllZerosV,
(v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src))))))]>;
let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMDestMem, (outs FR64:$dst), (ins VR64:$src),
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
"movq2dq\t{$src, $dst|$dst, $src}", []>;
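// The D6-based moves (movdq2q, movq2dq) exist only in register-register
// form (F2/F3 0F D6), so the earlier MRMSrcMem/MRMDestMem formats were
// encoding bugs; MRMSrcReg matches the actual instructions.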
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
@@ -207,7 +222,8 @@ def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
[(set VR64:$dst,
(v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
let AddedComplexity = 20 in
-def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(v2i32 (X86vzmovl (v2i32
@@ -265,7 +281,7 @@ defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>;
defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
-let isTwoAddress = 1 in {
+let Constraints = "$src1 = $dst" in {
def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
"pandn\t{$src2, $dst|$dst, $src2}",
@@ -316,33 +332,33 @@ defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
// Conversion Instructions
// -- Unpack Instructions
-let isTwoAddress = 1 in {
+let Constraints = "$src1 = $dst" in {
// Unpack High Packed Data Instructions
- def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+ def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
"punpckhbw\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
(v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
- def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+ def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
(outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
"punpckhbw\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
(v8i8 (mmx_unpckh VR64:$src1,
(bc_v8i8 (load_mmx addr:$src2)))))]>;
- def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+ def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
"punpckhwd\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
(v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
- def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+ def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
(outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
"punpckhwd\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
(v4i16 (mmx_unpckh VR64:$src1,
(bc_v4i16 (load_mmx addr:$src2)))))]>;
- def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+ def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
"punpckhdq\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
@@ -379,12 +395,12 @@ let isTwoAddress = 1 in {
(v4i16 (mmx_unpckl VR64:$src1,
(bc_v4i16 (load_mmx addr:$src2)))))]>;
- def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+ def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
"punpckldq\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
(v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
- def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+ def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
(outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
"punpckldq\t{$src2, $dst|$dst, $src2}",
[(set VR64:$dst,
@@ -415,19 +431,22 @@ let neverHasSideEffects = 1 in {
def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
"cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
-def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst),
+ (ins f128mem:$src),
"cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
"cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
-def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst),
+ (ins i64mem:$src),
"cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
"cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
-def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst),
+ (ins i64mem:$src),
"cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
@@ -439,7 +458,8 @@ def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
"cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
-def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst),
+ (ins f128mem:$src),
"cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
@@ -459,14 +479,16 @@ def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
(iPTR imm:$src2)))]>;
-let isTwoAddress = 1 in {
+let Constraints = "$src1 = $dst" in {
def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
- (outs VR64:$dst), (ins VR64:$src1, GR32:$src2, i16i8imm:$src3),
+ (outs VR64:$dst), (ins VR64:$src1, GR32:$src2,
+ i16i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
- GR32:$src2, (iPTR imm:$src3))))]>;
+                                             GR32:$src2, (iPTR imm:$src3))))]>;
def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
- (outs VR64:$dst), (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+ (outs VR64:$dst), (ins VR64:$src1, i16mem:$src2,
+ i16i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst,
(v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
@@ -494,7 +516,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
//===----------------------------------------------------------------------===//
// Alias instructions that map zero vector to pxor.
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
"pxor\t$dst, $dst",
[(set VR64:$dst, (v2i32 immAllZerosV))]>;
@@ -579,7 +601,7 @@ def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
let AddedComplexity = 20 in {
def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
- (MMX_MOVZDI2PDIrm addr:$src)>;
+ (MMX_MOVZDI2PDIrm addr:$src)>;
}
// Clear top half.
@@ -657,6 +679,33 @@ def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))))),
(v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+// Patterns for vector comparisons
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQDrm VR64:$src1, addr:$src2)>;
+
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTDrm VR64:$src1, addr:$src2)>;
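// These mirror the SSE2 compare handling: the target compare nodes map
// directly onto the MMX compare instructions, with the *rm forms
// folding a load of the right-hand operand.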
+
// CMOV* - Used to implement the SELECT DAG operation. Expanded by the
// scheduler into a branch sequence.
// These are expanded by the scheduler.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 5d6ef36..96fc932 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1,10 +1,10 @@
//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
-//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
@@ -36,22 +36,22 @@ def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86pshufb : SDNode<"X86ISD::PSHUFB",
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
-def X86pinsrb : SDNode<"X86ISD::PINSRB",
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86pinsrw : SDNode<"X86ISD::PINSRW",
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86insrtps : SDNode<"X86ISD::INSERTPS",
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
- SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+ SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
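// Operand 2 of insertps is a whole XMM register (the immediate selects
// the source lane), so its type is v4f32 rather than scalar f32.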
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
@@ -69,6 +69,10 @@ def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+def SDTX86CmpPTest : SDTypeProfile<0, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>]>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
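// ptest sets ZF when the AND of its operands is all zeros and CF when
// the AND-NOT is all zeros; it writes no register, hence the profile
// with zero results and two vector operands.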
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -83,11 +87,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
def sdmem : Operand<v2f64> {
let PrintMethod = "printf64mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
}
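// The index register of an x86 memory operand uses ptr_rc_nosp because
// the SIB byte cannot encode the stack pointer as an index; the
// ParserMatchClass hooks these operands up to the new asm parser.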
//===----------------------------------------------------------------------===//
@@ -179,13 +185,13 @@ def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
return getI8Imm(X86::getShuffleSHUFImmediate(N));
}]>;
-// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
// PSHUFHW imm.
def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
}]>;
-// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
// PSHUFLW imm.
def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
@@ -360,25 +366,25 @@ def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
[(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
"cvtps2pi\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (int_x86_sse_cvtps2pi
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi
(load addr:$src)))]>;
def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
"cvttps2pi\t{$src, $dst|$dst, $src}",
[(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
"cvttps2pi\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (int_x86_sse_cvttps2pi
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi
(load addr:$src)))]>;
let Constraints = "$src1 = $dst" in {
- def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+ def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
"cvtpi2ps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
VR64:$src2))]>;
- def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+ def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
"cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
(load addr:$src2)))]>;
}
@@ -407,11 +413,11 @@ let Constraints = "$src1 = $dst" in {
// Comparison instructions
let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
- def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
- def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
}
@@ -428,13 +434,15 @@ def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
// Aliases to match intrinsics which expect XMM operand(s).
let Constraints = "$src1 = $dst" in {
- def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src,
+ SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
- VR128:$src, imm:$cc))]>;
- def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f32mem:$src, SSECC:$cc),
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src,
+ SSECC:$cc),
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
(load addr:$src), imm:$cc))]>;
@@ -460,18 +468,19 @@ def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
(implicit EFLAGS)]>;
} // Defs = [EFLAGS]
-// Aliases of packed SSE1 instructions for scalar use. These all have names that
-// start with 'Fs'.
+// Aliases of packed SSE1 instructions for scalar use. These all have names
+// that start with 'Fs'.
// Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+ canFoldAsLoad = 1 in
def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins),
"pxor\t$dst, $dst", [(set FR32:$dst, fp32imm0)]>,
Requires<[HasSSE1]>, TB, OpSize;
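// canFoldAsLoad lets this pxor-based +0.0 be rematerialized as a
// constant-pool load of zero when that is cheaper, matching the
// treatment of the zero-vector aliases elsewhere in this file.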
// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
// disregarded.
-let neverHasSideEffects = 1 in
+let neverHasSideEffects = 1 in
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movaps\t{$src, $dst|$dst, $src}", []>;
@@ -552,7 +561,7 @@ multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
(ins FR32:$src1, f32mem:$src2),
!strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
[(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
-
+
// Vector operation, reg+reg.
def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -616,7 +625,7 @@ multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
(ins FR32:$src1, f32mem:$src2),
!strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
[(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
-
+
// Vector operation, reg+reg.
def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -671,7 +680,7 @@ defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
// SSE packed FP Instructions
// Move Instructions
-let neverHasSideEffects = 1 in
+let neverHasSideEffects = 1 in
def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", []>;
let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
@@ -708,13 +717,13 @@ let Constraints = "$src1 = $dst" in {
def MOVLPSrm : PSI<0x12, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
"movlps\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(movlp VR128:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
def MOVHPSrm : PSI<0x16, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
"movhps\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(movhp VR128:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
} // AddedComplexity
@@ -789,7 +798,7 @@ multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode (load addr:$src)))]>;
-
+
// Vector operation, reg.
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
@@ -890,12 +899,12 @@ let Constraints = "$src1 = $dst" in {
}
let Constraints = "$src1 = $dst" in {
- def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
"cmp${cc}ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
VR128:$src, imm:$cc))]>;
- def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
"cmp${cc}ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
@@ -909,13 +918,13 @@ def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
// Shuffle and unpack instructions
let Constraints = "$src1 = $dst" in {
let isConvertibleToThreeAddress = 1 in // Convert to pshufd
- def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1,
VR128:$src2, i8imm:$src3),
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
- def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
f128mem:$src2, i8imm:$src3),
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
@@ -924,24 +933,24 @@ let Constraints = "$src1 = $dst" in {
VR128:$src1, (memopv4f32 addr:$src2))))]>;
let AddedComplexity = 10 in {
- def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"unpckhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
- def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
"unpckhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (unpckh VR128:$src1,
(memopv4f32 addr:$src2))))]>;
- def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"unpcklps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
- def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
"unpcklps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
@@ -984,7 +993,8 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
// Alias instructions that map zero vector to pxor / xorp* for sse.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1 in
def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
"xorps\t$dst, $dst",
[(set VR128:$dst, (v4i32 immAllZerosV))]>;
@@ -1046,14 +1056,14 @@ let AddedComplexity = 20 in
def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
- (loadf32 addr:$src))))))]>;
+ (loadf32 addr:$src))))))]>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(MOVZSS2PSrm addr:$src)>;
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// SSE2 Instructions
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// Move Instructions
let neverHasSideEffects = 1 in
@@ -1077,7 +1087,7 @@ def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround FR64:$src))]>;
-def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
@@ -1087,6 +1097,27 @@ def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
"cvtsi2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrr : PSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrm : PSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+def COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}", []>;
+def COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}", []>;
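// These conversion and comisd defs carry no patterns; presumably they
// exist so the asm parser can round-trip the mnemonics, with selection
// still handled through the intrinsic-based forms.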
+
// SSE2 instructions with XS prefix
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -1112,21 +1143,21 @@ def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
[(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
"cvtpd2pi\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi
(memop addr:$src)))]>;
def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
"cvttpd2pi\t{$src, $dst|$dst, $src}",
[(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
"cvttpd2pi\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi
(memop addr:$src)))]>;
def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
"cvtpi2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtpi2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd
(load addr:$src)))]>;
// Aliases for intrinsics
@@ -1141,11 +1172,11 @@ def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
// Comparison instructions
let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
- def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
let mayLoad = 1 in
- def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
}
@@ -1162,13 +1193,15 @@ def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
// Aliases to match intrinsics which expect XMM operand(s).
let Constraints = "$src1 = $dst" in {
- def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src,
+ SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
VR128:$src, imm:$cc))]>;
- def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f64mem:$src, SSECC:$cc),
+ def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src,
+ SSECC:$cc),
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
(load addr:$src), imm:$cc))]>;
@@ -1194,11 +1227,12 @@ def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
(implicit EFLAGS)]>;
} // Defs = [EFLAGS]
-// Aliases of packed SSE2 instructions for scalar use. These all have names that
-// start with 'Fs'.
+// Aliases of packed SSE2 instructions for scalar use. These all have names
+// that start with 'Fs'.
// Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+ canFoldAsLoad = 1 in
def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins),
"pxor\t$dst, $dst", [(set FR64:$dst, fpimm0)]>,
Requires<[HasSSE2]>, TB, OpSize;
@@ -1286,7 +1320,7 @@ multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
(ins FR64:$src1, f64mem:$src2),
!strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
[(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
-
+
// Vector operation, reg+reg.
def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -1350,7 +1384,7 @@ multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
(ins FR64:$src1, f64mem:$src2),
!strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
[(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
-
+
// Vector operation, reg+reg.
def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -1402,7 +1436,7 @@ defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// SSE packed FP Instructions
// Move Instructions
@@ -1442,13 +1476,13 @@ let Constraints = "$src1 = $dst" in {
def MOVLPDrm : PDI<0x12, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
"movlpd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(v2f64 (movlp VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))]>;
def MOVHPDrm : PDI<0x16, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
"movhpd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(v2f64 (movhp VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))]>;
} // AddedComplexity
@@ -1564,7 +1598,7 @@ def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
VR128:$src2))]>;
def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
(load addr:$src2)))]>;
@@ -1612,7 +1646,7 @@ multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set FR64:$dst, (OpNode (load addr:$src)))]>;
-
+
// Vector operation, reg.
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
@@ -1712,12 +1746,12 @@ let Constraints = "$src1 = $dst" in {
}
let Constraints = "$src1 = $dst" in {
- def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+ def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
"cmp${cc}pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
VR128:$src, imm:$cc))]>;
- def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
"cmp${cc}pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
@@ -1730,12 +1764,12 @@ def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
// Shuffle and unpack instructions
let Constraints = "$src1 = $dst" in {
- def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+ def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
- def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+ def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
f128mem:$src2, i8imm:$src3),
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
@@ -1744,24 +1778,24 @@ let Constraints = "$src1 = $dst" in {
VR128:$src1, (memopv2f64 addr:$src2))))]>;
let AddedComplexity = 10 in {
- def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+ def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"unpckhpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
- def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+ def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
"unpckhpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2f64 (unpckh VR128:$src1,
(memopv2f64 addr:$src2))))]>;
- def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+ def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"unpcklpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
- def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+ def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
"unpcklpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
@@ -1770,7 +1804,7 @@ let Constraints = "$src1 = $dst" in {
} // Constraints = "$src1 = $dst"
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// SSE integer instructions
// Move Instructions
@@ -1825,14 +1859,17 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr,
Intrinsic IntId, Intrinsic IntId2> {
- def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
- def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1,
+ i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (IntId VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))]>;
- def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+ def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1,
+ i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
}
@@ -1840,15 +1877,17 @@ multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, bit Commutable = 0> {
- def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
let isCommutable = Commutable;
}
- def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1,
+ i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (OpVT (OpNode VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2)))))]>;
+ (bitconvert (memopv2i64 addr:$src2)))))]>;
}
/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
@@ -1858,14 +1897,17 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
///
multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit Commutable = 0> {
- def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
let isCommutable = Commutable;
}
- def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (OpNode VR128:$src1,(memopv2i64 addr:$src2)))]>;
+ [(set VR128:$dst, (OpNode VR128:$src1,
+ (memopv2i64 addr:$src2)))]>;
}
} // Constraints = "$src1 = $dst"
@@ -2029,8 +2071,8 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
"pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (v4i32 (pshufd:$src2
- (bc_v4i32(memopv2i64 addr:$src1)),
- (undef))))]>;
+ (bc_v4i32(memopv2i64 addr:$src1)),
+ (undef))))]>;
// SSE2 with ImmT == Imm8 and XS prefix.
def PSHUFHWri : Ii8<0x70, MRMSrcReg,
@@ -2043,8 +2085,8 @@ def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
"pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (v8i16 (pshufhw:$src2
- (bc_v8i16 (memopv2i64 addr:$src1)),
- (undef))))]>,
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
XS, Requires<[HasSSE2]>;
// SSE2 with ImmT == Imm8 and XD prefix.
@@ -2064,90 +2106,90 @@ def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
let Constraints = "$src1 = $dst" in {
- def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+ def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpcklbw\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v16i8 (unpckl VR128:$src1, VR128:$src2)))]>;
- def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+ def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpcklbw\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(unpckl VR128:$src1,
(bc_v16i8 (memopv2i64 addr:$src2))))]>;
- def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+ def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpcklwd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v8i16 (unpckl VR128:$src1, VR128:$src2)))]>;
- def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+ def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpcklwd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(unpckl VR128:$src1,
(bc_v8i16 (memopv2i64 addr:$src2))))]>;
- def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+ def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4i32 (unpckl VR128:$src1, VR128:$src2)))]>;
- def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+ def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(unpckl VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2))))]>;
- def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+ def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpcklqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
- def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+ def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpcklqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2i64 (unpckl VR128:$src1,
(memopv2i64 addr:$src2))))]>;
-
- def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+
+ def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckhbw\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v16i8 (unpckh VR128:$src1, VR128:$src2)))]>;
- def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+ def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckhbw\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (unpckh VR128:$src1,
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
(bc_v16i8 (memopv2i64 addr:$src2))))]>;
- def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+ def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckhwd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v8i16 (unpckh VR128:$src1, VR128:$src2)))]>;
- def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+ def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckhwd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(unpckh VR128:$src1,
(bc_v8i16 (memopv2i64 addr:$src2))))]>;
- def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+ def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckhdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4i32 (unpckh VR128:$src1, VR128:$src2)))]>;
- def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+ def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckhdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(unpckh VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2))))]>;
- def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+ def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckhqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
- def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+ def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckhqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
@@ -2172,7 +2214,7 @@ let Constraints = "$src1 = $dst" in {
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, i32i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
imm:$src3))]>;
}
@@ -2202,7 +2244,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
[(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+ [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
TB, Requires<[HasSSE2]>;
// Flush cache
@@ -2217,17 +2259,18 @@ def MFENCE : I<0xAE, MRM6r, (outs), (ins),
"mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
//TODO: custom lower this so as to never even generate the noop
-def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
(i8 0)), (NOOP)>;
def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
-def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
(i8 1)), (MFENCE)>;
// Alias instructions that map zero vector to pxor / xorp* for sse.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1 in
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
"pcmpeqd\t$dst, $dst",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
@@ -2240,7 +2283,7 @@ def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
(v2f64 (scalar_to_vector FR64:$src)))]>;
def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
+ [(set VR128:$dst,
(v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
@@ -2399,9 +2442,9 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
(MOVZPQILo2PQIrm addr:$src)>;
}
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// SSE3 Instructions
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// Move Instructions
def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2525,9 +2568,9 @@ let AddedComplexity = 20 in
def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
(MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// SSSE3 Instructions
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
@@ -2801,12 +2844,13 @@ def : Pat<(X86pshufb VR128:$src, VR128:$mask),
def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
(PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
-// extload f32 -> f64. This matches load+fextend because we have a hack in
-// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
let Predicates = [HasSSE2] in
@@ -2884,12 +2928,12 @@ def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
Requires<[HasSSE2]>;
// Special unary SHUFPDrri case.
def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFPDrri VR128:$src1, VR128:$src1,
(SHUFFLE_get_shuf_imm VR128:$src3))>,
Requires<[HasSSE2]>;
// Special unary SHUFPDrri case.
def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFPDrri VR128:$src1, VR128:$src1,
(SHUFFLE_get_shuf_imm VR128:$src3))>,
Requires<[HasSSE2]>;
// Unary v4f32 shuffle with PSHUF* in order to fold a load.
@@ -2899,16 +2943,16 @@ def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
// Special binary v4i32 shuffle cases with SHUFPS.
def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
- (SHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFPSrri VR128:$src1, VR128:$src2,
(SHUFFLE_get_shuf_imm VR128:$src3))>,
Requires<[HasSSE2]>;
def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
- (SHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFPSrmi VR128:$src1, addr:$src2,
(SHUFFLE_get_shuf_imm VR128:$src3))>,
Requires<[HasSSE2]>;
// Special binary v2i64 shuffle cases using SHUFPDrri.
def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
- (SHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFPDrri VR128:$src1, VR128:$src2,
(SHUFFLE_get_shuf_imm VR128:$src3))>,
Requires<[HasSSE2]>;
@@ -3030,7 +3074,7 @@ def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
// fall back to this for SSE1)
def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
- (SHUFPSrri VR128:$src2, VR128:$src1,
+ (SHUFPSrri VR128:$src2, VR128:$src1,
(SHUFFLE_get_shuf_imm VR128:$src3))>, Requires<[HasSSE1]>;
// Set lowest element and zero upper elements.
@@ -3097,7 +3141,7 @@ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
-
+
//===----------------------------------------------------------------------===//
// SSE4.1 Instructions
//===----------------------------------------------------------------------===//
@@ -3108,7 +3152,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
Intrinsic V2F64Int> {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
- def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
+ def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3149,41 +3193,41 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
Intrinsic F64Int> {
// Intrinsic operation, reg.
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
- (outs VR128:$dst),
+ (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
OpSize;
// Intrinsic operation, mem.
- def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
- (outs VR128:$dst),
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst),
(ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
OpSize;
// Intrinsic operation, reg.
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
- (outs VR128:$dst),
+ (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
OpSize;
// Intrinsic operation, mem.
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
- (outs VR128:$dst),
+ (outs VR128:$dst),
(ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
OpSize;
}
@@ -3302,9 +3346,9 @@ let Constraints = "$src1 = $dst" in {
Intrinsic IntId128, bit Commutable = 0> {
def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
OpSize {
let isCommutable = Commutable;
@@ -3339,7 +3383,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
OpSize;
@@ -3471,13 +3515,13 @@ def : Pat<(int_x86_sse41_pmovzxbq
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
(ins VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
OpSize;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, OpSize;
// FIXME:
@@ -3492,7 +3536,7 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, OpSize;
// FIXME:
@@ -3507,13 +3551,13 @@ defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
(ins VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst,
(extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v4i32 VR128:$src1), imm:$src2),
addr:$dst)]>, OpSize;
@@ -3527,14 +3571,14 @@ defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
(ins VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst,
(extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
OpSize;
- def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
addr:$dst)]>, OpSize;
@@ -3553,15 +3597,15 @@ let Constraints = "$src1 = $dst" in {
multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
imm:$src3))]>, OpSize;
}
@@ -3573,16 +3617,16 @@ let Constraints = "$src1 = $dst" in {
multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
+ [(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
imm:$src3)))]>, OpSize;
}
@@ -3590,37 +3634,57 @@ let Constraints = "$src1 = $dst" in {
defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+// insertps has a few different modes. The first two below are optimized
+// inserts that won't zero arbitrary elements in the destination vector. The
+// next one matches the intrinsic and could zero arbitrary elements in the
+// target vector.
let Constraints = "$src1 = $dst" in {
multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, FR32:$src2, i32i8imm:$src3),
- !strconcat(OpcodeStr,
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize;
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (X86insrtps VR128:$src1, (loadf32 addr:$src2),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))]>, OpSize;
}
}
defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
+ (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>;
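+
+// A usage sketch (an assumption for illustration, not part of this patch):
+// via the SSE4.1 header, the insertps immediate selects the source element
+// (bits 7:6), the destination element (bits 5:4), and a zero mask (bits 3:0):
+//
+//   #include <smmintrin.h>
+//   __m128 r = _mm_insert_ps(a, b, 0x10);  // r[1] = b[0]; nothing zeroed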
+
+// ptest instruction. We lower to this in X86ISelLowering, primarily from the
+// Intel intrinsic that corresponds to it.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
- "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+ "ptest \t{$src2, $src1|$src1, $src2}",
+ [(X86ptest VR128:$src1, VR128:$src2),
+ (implicit EFLAGS)]>, OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
- "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+ "ptest \t{$src2, $src1|$src1, $src2}",
+ [(X86ptest VR128:$src1, (load addr:$src2)),
+ (implicit EFLAGS)]>, OpSize;
}
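+
+// A usage sketch (assumption, not part of this patch): ptest is normally
+// reached through the SSE4.1 intrinsics, which read the flags it defines:
+//
+//   #include <smmintrin.h>
+//   int z = _mm_testz_si128(a, b);  // ZF: (a & b) == 0
+//   int c = _mm_testc_si128(a, b);  // CF: (~a & b) == 0
+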
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 Instructions
+//===----------------------------------------------------------------------===//
+
/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
let Constraints = "$src1 = $dst" in {
multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
@@ -3647,3 +3711,171 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
(PCMPGTQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
(PCMPGTQrm VR128:$src1, addr:$src2)>;
+
+// crc intrinsic instructions.
+// This set of instructions is rm only; the only difference is the size of
+// r and m.
+let Constraints = "$src1 = $dst" in {
+ def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i8mem:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_8 GR32:$src1,
+ (load addr:$src2)))]>, OpSize;
+ def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR8:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>,
+ OpSize;
+ def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i16mem:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_16 GR32:$src1,
+ (load addr:$src2)))]>,
+ OpSize;
+ def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR16:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
+ OpSize;
+ def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32 GR32:$src1,
+ (load addr:$src2)))]>, OpSize;
+ def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>,
+ OpSize;
+ def CRC64m64 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64 GR64:$src1,
+ (load addr:$src2)))]>,
+ OpSize, REX_W;
+ def CRC64r64 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "crc32 \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc32_64 GR64:$src1, GR64:$src2))]>,
+ OpSize, REX_W;
+}
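+
+// A usage sketch (assumption, not part of this patch): these map to the
+// SSE4.2 CRC32 intrinsics, which chain the running checksum through $src1:
+//
+//   #include <nmmintrin.h>
+//   unsigned crc = 0;
+//   crc = _mm_crc32_u8(crc, byte);    // CRC32r8
+//   crc = _mm_crc32_u32(crc, word);   // CRC32r32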
+
+// String/text processing instructions.
+let Defs = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
+def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "#PCMPISTRM128rr PSEUDO!",
+ [(set VR128:$dst,
+ (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+ imm:$src3))]>, OpSize;
+def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "#PCMPISTRM128rm PSEUDO!",
+ [(set VR128:$dst,
+ (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (load addr:$src2),
+ imm:$src3))]>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS] in {
+def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ []>, OpSize;
+def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ []>, OpSize;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX],
+ usesCustomDAGSchedInserter = 1 in {
+def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "#PCMPESTRM128rr PSEUDO!",
+ [(set VR128:$dst,
+ (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ VR128:$src3,
+ EDX, imm:$src5))]>, OpSize;
+def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "#PCMPESTRM128rm PSEUDO!",
+ [(set VR128:$dst,
+ (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (load addr:$src3),
+ EDX, imm:$src5))]>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
+def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ []>, OpSize;
+def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS] in {
+ multiclass SS42AI_pcmpistri<Intrinsic IntId128> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ [(set ECX,
+ (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ [(set ECX,
+ (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
+ (implicit EFLAGS)]>,
+ OpSize;
+ }
+}
+
+defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
+defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
+defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
+defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
+defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
+defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
+
+let Defs = [ECX, EFLAGS] in {
+let Uses = [EAX, EDX] in {
+ multiclass SS42AI_pcmpestri<Intrinsic IntId128> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ [(set ECX,
+ (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ [(set ECX,
+ (IntId128 VR128:$src1, EAX, (load addr:$src3),
+ EDX, imm:$src5)),
+ (implicit EFLAGS)]>,
+ OpSize;
+ }
+}
+}
+
+defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
+defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
+defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
+defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
+defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
+defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
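+
+// A usage sketch (assumption, not part of this patch): the index forms return
+// their result in ECX through the string-compare intrinsics, while the
+// A/C/O/S/Z variants expose the individual EFLAGS bits:
+//
+//   #include <nmmintrin.h>
+//   const int M = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH;
+//   int idx = _mm_cmpistri(a, b, M);  // PCMPISTRI: index in ECX
+//   int any = _mm_cmpistrc(a, b, M);  // PCMPISTRIC: CF from EFLAGS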
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index f923106..62ca47f 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -15,15 +15,16 @@
#include "X86JITInfo.h"
#include "X86Relocations.h"
#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
#include "llvm/Function.h"
-#include "llvm/Config/alloca.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cstdlib>
#include <cstring>
using namespace llvm;
// Determine the platform we're running on
-#if defined (__x86_64__) || defined (_M_AMD64)
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
# define X86_64_JIT
#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
# define X86_32_JIT
@@ -51,13 +52,6 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
#define GETASMPREFIX(X) GETASMPREFIX2(X)
#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
-// Check if building with -fPIC
-#if defined(__PIC__) && __PIC__ && defined(__linux__)
-#define ASMCALLSUFFIX "@PLT"
-#else
-#define ASMCALLSUFFIX
-#endif
-
// For ELF targets, use a .size and .type directive, to let tools
// know the extent of functions defined in assembler.
#if defined(__ELF__)
@@ -130,7 +124,7 @@ extern "C" {
// JIT callee
"movq %rbp, %rdi\n" // Pass prev frame and return address
"movq 8(%rbp), %rsi\n"
- "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
// Restore all XMM arg registers
"movaps 112(%rsp), %xmm7\n"
"movaps 96(%rsp), %xmm6\n"
@@ -206,7 +200,7 @@ extern "C" {
"movl 4(%ebp), %eax\n" // Pass prev frame and return address
"movl %eax, 4(%esp)\n"
"movl %ebp, (%esp)\n"
- "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
"movl %ebp, %esp\n" // Restore ESP
CFI(".cfi_def_cfa_register %esp\n")
"subl $12, %esp\n"
@@ -262,7 +256,7 @@ extern "C" {
"movl 4(%ebp), %eax\n" // Pass prev frame and return address
"movl %eax, 4(%esp)\n"
"movl %ebp, (%esp)\n"
- "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
"addl $16, %esp\n"
"movaps 48(%esp), %xmm3\n"
CFI(".cfi_restore %xmm3\n")
@@ -321,8 +315,7 @@ extern "C" {
#else // Not an i386 host
void X86CompilationCallback() {
- assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
- abort();
+ llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
}
#endif
}
@@ -331,14 +324,21 @@ extern "C" {
/// function stub when we did not know the real target of a call. This function
/// must locate the start of the stub or call site and pass it into the JIT
/// compiler function.
-extern "C" void ATTRIBUTE_USED
+extern "C" {
+#if !(defined (X86_64_JIT) && defined(_MSC_VER))
+  // The following function is called only from this translation unit,
+  // unless we are under 64-bit Windows with MSC, where there is
+  // no support for inline assembly.
+static
+#endif
+void ATTRIBUTE_USED
X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
intptr_t *RetAddrLoc = &StackPtr[1];
assert(*RetAddrLoc == RetAddr &&
"Could not find return address on the stack!");
// It's a stub if there is an interrupt marker after the call.
- bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
// The call instruction should have pushed the return value onto the stack...
#if defined (X86_64_JIT)
@@ -348,10 +348,10 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
#endif
#if 0
- DOUT << "In callback! Addr=" << (void*)RetAddr
- << " ESP=" << (void*)StackPtr
- << ": Resolving call to function: "
- << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+ DEBUG(errs() << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
#endif
// Sanity check to make sure this really is a call instruction.
@@ -377,7 +377,7 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
// If this is a stub, rewrite the call into an unconditional branch
// instruction so that two return addresses are not pushed onto the stack
// when the requested function finally gets called. This also makes the
- // 0xCD byte (interrupt) dead, so the marker doesn't effect anything.
+    // 0xCE byte (interrupt) dead, so the marker doesn't affect anything.
#if defined (X86_64_JIT)
// If the target address is within 32-bit range of the stub, use a
// PC-relative branch instead of loading the actual address. (This is
@@ -403,31 +403,26 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
*RetAddrLoc -= 5;
#endif
}
+}
TargetJITInfo::LazyResolverFn
X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
JITCompilerFunction = F;
#if defined (X86_32_JIT) && !defined (_MSC_VER)
- unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
- union {
- unsigned u[3];
- char c[12];
- } text;
-
- if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
- // FIXME: support for AMD family of processors.
- if (memcmp(text.c, "GenuineIntel", 12) == 0) {
- X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
- if ((EDX >> 25) & 0x1)
- return X86CompilationCallback_SSE;
- }
- }
+ if (Subtarget->hasSSE1())
+ return X86CompilationCallback_SSE;
#endif
return X86CompilationCallback;
}
+X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ useGOT = 0;
+ TLSOffset = 0;
+}
+
void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
JITCodeEmitter &JCE) {
#if defined (X86_64_JIT)
@@ -485,7 +480,10 @@ void *X86JITInfo::emitFunctionStub(const Function* F, void *Fn,
JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
#endif
- JCE.emitByte(0xCD); // Interrupt - Just a marker identifying the stub!
+ // This used to use 0xCD, but that value is used by JITMemoryManager to
+ // initialize the buffer with garbage, which means it may follow a
+ // noreturn function call, confusing X86CompilationCallback2. PR 4929.
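+  //
+  // Sketch of the resulting 32-bit stub, for illustration (an assumption
+  // about the common case, not emitted verbatim by this code):
+  //
+  //   0xE8 <rel32>   call X86CompilationCallback
+  //   0xCE           marker byte found at the return address by
+  //                  X86CompilationCallback2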
+ JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub!
return JCE.finishGVStub(F);
}
@@ -495,9 +493,11 @@ void X86JITInfo::emitFunctionStubAtAddr(const Function* F, void *Fn, void *Stub,
// complains about casting a function pointer to a normal pointer.
JCE.startGVStub(F, Stub, 5);
JCE.emitByte(0xE9);
-#if defined (X86_64_JIT)
- assert(((((intptr_t)Fn-JCE.getCurrentPCValue()-5) << 32) >> 32) ==
- ((intptr_t)Fn-JCE.getCurrentPCValue()-5)
+#if defined (X86_64_JIT) && !defined (NDEBUG)
+ // Yes, we need both of these casts, or some broken versions of GCC (4.2.4)
+ // get the signed-ness of the expression wrong. Go figure.
+ intptr_t Displacement = (intptr_t)Fn - (intptr_t)JCE.getCurrentPCValue() - 5;
+ assert(((Displacement << 32) >> 32) == Displacement
&& "PIC displacement does not fit in displacement field!");
#endif
JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
@@ -538,6 +538,7 @@ void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
break;
}
case X86::reloc_absolute_word:
+ case X86::reloc_absolute_word_sext:
// Absolute relocation, just add the relocated value to the value already
// in memory.
*((unsigned*)RelocPos) += (unsigned)ResultPtr;
@@ -554,7 +555,7 @@ char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
TLSOffset -= size;
return TLSOffset;
#else
- assert(0 && "Cannot allocate thread local storage on this arch!\n");
+ llvm_unreachable("Cannot allocate thread local storage on this arch!");
return 0;
#endif
}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index 6a4e214..c381433 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -20,16 +20,15 @@
namespace llvm {
class X86TargetMachine;
+ class X86Subtarget;
class X86JITInfo : public TargetJITInfo {
X86TargetMachine &TM;
+ const X86Subtarget *Subtarget;
uintptr_t PICBase;
char* TLSOffset;
public:
- explicit X86JITInfo(X86TargetMachine &tm) : TM(tm) {
- useGOT = 0;
- TLSOffset = 0;
- }
+ explicit X86JITInfo(X86TargetMachine &tm);
/// replaceMachineCodeForFunction - Make it so that calling the function
/// whose machine code is at OLD turns into a call to NEW, perhaps by
diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..9d7e66d
--- /dev/null
+++ b/lib/Target/X86/X86MCAsmInfo.cpp
@@ -0,0 +1,123 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
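+
+// Usage sketch (the flag name comes from the option above; the invocation is
+// an assumption for illustration): any tool linking the X86 backend accepts
+// it, e.g.
+//   llc -x86-asm-syntax=intel foo.ll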
+
+
+static const char *const x86_asm_table[] = {
+ "{si}", "S",
+ "{di}", "D",
+ "{ax}", "a",
+ "{cx}", "c",
+ "{memory}", "memory",
+ "{flags}", "",
+ "{dirflag}", "",
+ "{fpsr}", "",
+ "{cc}", "cc",
+ 0,0};
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ bool is64Bit = Triple.getArch() == Triple::x86_64;
+
+ TextAlignFillValue = 0x90;
+
+ if (!is64Bit)
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+
+ // Leopard and above support aligned common symbols.
+ COMMDirectiveTakesAlignment = Triple.getDarwinMajorNumber() >= 9;
+
+ CommentString = "##";
+ PCSymbol = ".";
+
+ SupportsDebugInformation = true;
+ DwarfUsesInlineInfoSection = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::Dwarf;
+ AbsoluteEHSectionOffsets = false;
+}
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Debug Information
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::Dwarf;
+ AbsoluteEHSectionOffsets = false;
+
+ // On Linux we must declare when we can use a non-executable stack.
+ if (Triple.getOS() == Triple::Linux)
+ NonexecutableStackDirective = "\t.section\t.note.GNU-stack,\"\",@progbits";
+}
+
+X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+}
+
+
+X86WinMCAsmInfo::X86WinMCAsmInfo(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ GlobalPrefix = "_";
+ CommentString = ";";
+
+ PrivateGlobalPrefix = "$";
+ AlignDirective = "\tALIGN\t";
+ ZeroDirective = "\tdb\t";
+ ZeroDirectiveSuffix = " dup(0)";
+ AsciiDirective = "\tdb\t";
+ AscizDirective = 0;
+ Data8bitsDirective = "\tdb\t";
+ Data16bitsDirective = "\tdw\t";
+ Data32bitsDirective = "\tdd\t";
+ Data64bitsDirective = "\tdq\t";
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+
+ AlignmentIsInBytes = true;
+}
diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h
new file mode 100644
index 0000000..18e2bdb
--- /dev/null
+++ b/lib/Target/X86/X86MCAsmInfo.h
@@ -0,0 +1,42 @@
+//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+ class Triple;
+
+ struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+ };
+
+ struct X86ELFMCAsmInfo : public MCAsmInfo {
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+ };
+
+ struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF {
+ explicit X86MCAsmInfoCOFF(const Triple &Triple);
+ };
+
+ struct X86WinMCAsmInfo : public MCAsmInfo {
+ explicit X86WinMCAsmInfo(const Triple &Triple);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index a2f319f..f03723a 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -30,14 +30,16 @@
#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
@@ -54,6 +56,7 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
Is64Bit = Subtarget->is64Bit();
IsWin64 = Subtarget->isTargetWin64();
StackAlign = TM.getFrameInfo()->getStackAlignment();
+
if (Is64Bit) {
SlotSize = 8;
StackPtr = X86::RSP;
@@ -65,12 +68,12 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
}
}
-// getDwarfRegNum - This function maps LLVM register identifiers to the
-// Dwarf specific numbering, used in debug info and exception tables.
-
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
unsigned Flavour = DWARFFlavour::X86_64;
+
if (!Subtarget->is64Bit()) {
if (Subtarget->isTargetDarwin()) {
if (isEH)
@@ -88,9 +91,8 @@ int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
}
-// getX86RegNum - This function maps LLVM register identifiers to their X86
-// specific numbering, which is used in various places encoding instructions.
-//
+/// getX86RegNum - This function maps LLVM register identifiers to their X86
+/// specific numbering, which is used in various places encoding instructions.
unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
switch(RegNo) {
case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
@@ -146,17 +148,131 @@ unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
default:
assert(isVirtualRegister(RegNo) && "Unknown physical register!");
- assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+ llvm_unreachable("Register allocator hasn't allocated reg correctly yet!");
return 0;
}
}
-const TargetRegisterClass *X86RegisterInfo::getPointerRegClass() const {
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- if (Subtarget->is64Bit())
- return &X86::GR64RegClass;
- else
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned SubIdx) const {
+ switch (SubIdx) {
+ default: return 0;
+ case 1:
+ // 8-bit
+ if (B == &X86::GR8RegClass) {
+ if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+ A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_ABCDRegClass;
+ } else if (B == &X86::GR8_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_NOREXRegClass;
+ else if (A == &X86::GR32_ABCDRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_NOREXRegClass;
+ else if (A == &X86::GR16_ABCDRegClass)
+ return &X86::GR16_ABCDRegClass;
+ }
+ break;
+ case 2:
+ // 8-bit hi
+ if (B == &X86::GR8_ABCD_HRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+ A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_ABCDRegClass;
+ }
+ break;
+ case 3:
+ // 16-bit
+ if (B == &X86::GR16RegClass) {
+ if (A->getSize() == 4 || A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR16_ABCDRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ } else if (B == &X86::GR16_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_NOREXRegClass;
+ else if (A == &X86::GR32_ABCDRegClass)
+        return &X86::GR32_ABCDRegClass;
+ }
+ break;
+ case 4:
+ // 32-bit
+ if (B == &X86::GR32RegClass || B == &X86::GR32_NOSPRegClass) {
+ if (A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR32_ABCDRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ } else if (B == &X86::GR32_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ }
+ break;
+ }
+ return 0;
+}
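+
+// Example of the mapping above (an illustration, not part of this patch): for
+// SubIdx 1 with B == &X86::GR8_NOREXRegClass and A == &X86::GR32RegClass, the
+// result is GR32_NOREX -- only the non-REX 32-bit registers keep a low byte
+// that needs no REX prefix.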
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+ case 0: // Normal GPRs.
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return &X86::GR64RegClass;
return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return &X86::GR64_NOSPRegClass;
+ return &X86::GR32_NOSPRegClass;
+ }
}
const TargetRegisterClass *
@@ -276,6 +392,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::ESP);
Reserved.set(X86::SP);
Reserved.set(X86::SPL);
+
// Set the frame-pointer register and its aliases as reserved if needed.
if (hasFP(MF)) {
Reserved.set(X86::RBP);
@@ -283,10 +400,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::BP);
Reserved.set(X86::BPL);
}
- // Mark the x87 stack registers as reserved, since they don't
- // behave normally with respect to liveness. We don't fully
- // model the effects of x87 stack pushes and pops after
- // stackification.
+
+ // Mark the x87 stack registers as reserved, since they don't behave normally
+ // with respect to liveness. We don't fully model the effects of x87 stack
+ // pushes and pops after stackification.
Reserved.set(X86::ST0);
Reserved.set(X86::ST1);
Reserved.set(X86::ST2);
@@ -304,10 +421,12 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
unsigned MaxAlign = 0;
+
for (int i = FFI->getObjectIndexBegin(),
e = FFI->getObjectIndexEnd(); i != e; ++i) {
if (FFI->isDeadObjectIndex(i))
continue;
+
unsigned Align = FFI->getObjectAlignment(i);
MaxAlign = std::max(MaxAlign, Align);
}
@@ -315,10 +434,9 @@ static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
return MaxAlign;
}
-// hasFP - Return true if the specified function should have a dedicated frame
-// pointer register. This is true if the function has variable sized allocas or
-// if frame pointer elimination is disabled.
-//
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
@@ -335,7 +453,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
// FIXME: Currently we don't support stack realignment for functions with
- // variable-sized allocas
+ // variable-sized allocas
return (RealignStack &&
(MFI->getMaxAlignment() > StackAlign &&
!MFI->hasVarSizedObjects()));
@@ -345,34 +463,45 @@ bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
return !MF.getFrameInfo()->hasVarSizedObjects();
}
+bool X86RegisterInfo::hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const {
+ if (Reg == FramePtr && hasFP(MF)) {
+ FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+ return true;
+ }
+ return false;
+}
+
int
X86RegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
- int Offset = MF.getFrameInfo()->getObjectOffset(FI) + SlotSize;
- uint64_t StackSize = MF.getFrameInfo()->getStackSize();
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = MFI->getObjectOffset(FI) - TFI.getOffsetOfLocalArea();
+ uint64_t StackSize = MFI->getStackSize();
if (needsStackRealignment(MF)) {
- if (FI < 0)
- // Skip the saved EBP
+ if (FI < 0) {
+ // Skip the saved EBP.
Offset += SlotSize;
- else {
- unsigned Align = MF.getFrameInfo()->getObjectAlignment(FI);
+ } else {
+ unsigned Align = MFI->getObjectAlignment(FI);
assert( (-(Offset + StackSize)) % Align == 0);
Align = 0;
return Offset + StackSize;
}
-
// FIXME: Support tail calls
} else {
if (!hasFP(MF))
return Offset + StackSize;
- // Skip the saved EBP
+ // Skip the saved EBP.
Offset += SlotSize;
// Skip the RETADDR move area
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
- if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
}
return Offset;
@@ -392,24 +521,29 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+ Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
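+    // For illustration: with StackAlign = 16 and Amount = 20, this computes
+    // (20 + 15) / 16 * 16 = 32, the next 16-byte boundary.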
MachineInstr *New = 0;
if (Old->getOpcode() == getCallFrameSetupOpcode()) {
New = BuildMI(MF, Old->getDebugLoc(),
TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
- StackPtr).addReg(StackPtr).addImm(Amount);
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
} else {
assert(Old->getOpcode() == getCallFrameDestroyOpcode());
- // factor out the amount the callee already popped.
+
+ // Factor out the amount the callee already popped.
uint64_t CalleeAmt = Old->getOperand(1).getImm();
Amount -= CalleeAmt;
- if (Amount) {
+
+ if (Amount) {
unsigned Opc = (Amount < 128) ?
(Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
(Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(Amount);
+ .addReg(StackPtr)
+ .addImm(Amount);
}
}
@@ -417,7 +551,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// The EFLAGS implicit def is dead.
New->getOperand(3).setIsDead();
- // Replace the pseudo instruction with a new instruction...
+ // Replace the pseudo instruction with a new instruction.
MBB.insert(I, New);
}
}
@@ -432,10 +566,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineInstr *Old = I;
MachineInstr *New =
BuildMI(MF, Old->getDebugLoc(), TII.get(Opc),
- StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(CalleeAmt);
+
// The EFLAGS implicit def is dead.
New->getOperand(3).setIsDead();
-
MBB.insert(I, New);
}
}
@@ -443,21 +579,24 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
+unsigned
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const{
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
+
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
int FrameIndex = MI.getOperand(i).getIndex();
-
unsigned BasePtr;
+
if (needsStackRealignment(MF))
BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
else
@@ -471,34 +610,33 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.getOperand(i+3).isImm()) {
// Offset is a 32-bit integer.
int Offset = getFrameIndexOffset(MF, FrameIndex) +
- (int)(MI.getOperand(i+3).getImm());
+ (int)(MI.getOperand(i + 3).getImm());
- MI.getOperand(i+3).ChangeToImmediate(Offset);
+ MI.getOperand(i + 3).ChangeToImmediate(Offset);
} else {
// Offset is symbolic. This is extremely rare.
uint64_t Offset = getFrameIndexOffset(MF, FrameIndex) +
(uint64_t)MI.getOperand(i+3).getOffset();
MI.getOperand(i+3).setOffset(Offset);
}
+ return 0;
}
void
X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const {
- MachineFrameInfo *FFI = MF.getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
// Calculate and set max stack object alignment early, so we can decide
// whether we will need stack realignment (and thus FP).
- unsigned MaxAlign = std::max(FFI->getMaxAlignment(),
- calculateMaxStackAlignment(FFI));
+ unsigned MaxAlign = std::max(MFI->getMaxAlignment(),
+ calculateMaxStackAlignment(MFI));
- FFI->setMaxAlignment(MaxAlign);
-}
+ MFI->setMaxAlignment(MaxAlign);
-void
-X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
if (TailCallReturnAddrDelta < 0) {
// create RETURNADDR area
// arg
@@ -509,18 +647,21 @@ X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
// ...
// }
// [EBP]
- MF.getFrameInfo()->
- CreateFixedObject(-TailCallReturnAddrDelta,
- (-1*SlotSize)+TailCallReturnAddrDelta);
+ MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1U*SlotSize)+TailCallReturnAddrDelta);
}
+
if (hasFP(MF)) {
assert((TailCallReturnAddrDelta <= 0) &&
"The Delta should always be zero or negative");
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+
// Create a frame entry for the EBP register that must be saved.
- int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
- (int)SlotSize * -2+
- TailCallReturnAddrDelta);
- assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ int FrameIdx = MFI->CreateFixedObject(SlotSize,
+ -(int)SlotSize +
+ TFI.getOffsetOfLocalArea() +
+ TailCallReturnAddrDelta);
+ assert(FrameIdx == MFI->getObjectIndexBegin() &&
"Slot for EBP register must be last in order to be found!");
FrameIdx = 0;
}
@@ -549,14 +690,14 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
MachineInstr *MI =
BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(ThisVal);
- // The EFLAGS implicit def is dead.
- MI->getOperand(3).setIsDead();
+ .addReg(StackPtr)
+ .addImm(ThisVal);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
Offset -= ThisVal;
}
}
-// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
+/// mergeSPUpdatesUp - Merge two stack-manipulating instructions above the iterator.
static
void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
unsigned StackPtr, uint64_t *NumBytes = NULL) {
@@ -579,11 +720,12 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
}
}
-// mergeSPUpdatesUp - Merge two stack-manipulating instructions lower iterator.
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions below the iterator.
static
void mergeSPUpdatesDown(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ // FIXME: THIS ISN'T RUN!!!
return;
if (MBBI == MBB.end()) return;
@@ -610,23 +752,22 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
}
/// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB instruction it is deleted
-/// argument and the stack adjustment is returned as a positive value for ADD
-/// and a negative for SUB.
+/// instruction. If it is an ADD/SUB instruction, it is deleted and the stack
+/// adjustment is returned as a positive value for ADD and a negative one for
+/// SUB.
static int mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
unsigned StackPtr,
bool doMergeWithPrevious) {
-
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
return 0;
- int Offset = 0;
-
MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr){
@@ -644,122 +785,116 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
return Offset;
}
-void X86RegisterInfo::emitFrameMoves(MachineFunction &MF,
- unsigned FrameLabelId,
- unsigned ReadyLabelId) const {
+void X86RegisterInfo::emitCalleeSavedFrameMoves(MachineFunction &MF,
+ unsigned LabelId,
+ unsigned FramePtr) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
- if (!MMI)
- return;
+ if (!MMI) return;
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty()) return;
- uint64_t StackSize = MFI->getStackSize();
std::vector<MachineMove> &Moves = MMI->getFrameMoves();
const TargetData *TD = MF.getTarget().getTargetData();
+ bool HasFP = hasFP(MF);
- // Calculate amount of bytes used for return address storing
+ // Calculate the number of bytes used to store the return address.
int stackGrowth =
(MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
TargetFrameInfo::StackGrowsUp ?
TD->getPointerSize() : -TD->getPointerSize());
- MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
- MachineLocation FPSrc(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
-
- if (StackSize) {
- // Show update of SP.
- if (hasFP(MF)) {
- // Adjust SP
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
- Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
- } else {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP,
- -StackSize+stackGrowth);
- Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
- }
- } else {
- // FIXME: Verify & implement for FP
- MachineLocation SPDst(StackPtr);
- MachineLocation SPSrc(StackPtr, stackGrowth);
- Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
- }
-
- // Add callee saved registers to move list.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
// FIXME: This is a dirty hack. The code itself is a pretty big mess right now.
// It should be rewritten from scratch and generalized sometime.
- // Determine maximum offset (minumum due to stack growth)
+ // Determine maximum offset (minimum due to stack growth).
int64_t MaxOffset = 0;
- for (unsigned I = 0, E = CSI.size(); I!=E; ++I)
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I)
MaxOffset = std::min(MaxOffset,
- MFI->getObjectOffset(CSI[I].getFrameIdx()));
-
- // Calculate offsets
- int64_t saveAreaOffset = (hasFP(MF) ? 3 : 2)*stackGrowth;
- for (unsigned I = 0, E = CSI.size(); I!=E; ++I) {
- int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
- unsigned Reg = CSI[I].getReg();
- Offset = (MaxOffset-Offset+saveAreaOffset);
+ MFI->getObjectOffset(I->getFrameIdx()));
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ Offset = MaxOffset - Offset + saveAreaOffset;
+
+ // Don't output a new machine move if we're re-saving the frame
+ // pointer. This happens when the PrologEpilogInserter has inserted an extra
+ // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+ // generates one when frame pointers are used. If we generate a "machine
+ // move" for this extra "PUSH", the linker will lose track of the fact that
+ // the frame pointer should have the value of the first "PUSH" when it's
+ // trying to unwind.
+ //
+ // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+ // another bug. I.e., one where we generate a prolog like this:
+ //
+ // pushl %ebp
+ // movl %esp, %ebp
+ // pushl %ebp
+ // pushl %esi
+ // ...
+ //
+ // The immediate re-push of EBP is unnecessary. At the least, it's an
+ // optimization bug. EBP can be used as a scratch register in certain
+ // cases, but probably not when we have a frame pointer.
+ if (HasFP && FramePtr == Reg)
+ continue;
+
MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
MachineLocation CSSrc(Reg);
- Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
- }
-
- if (hasFP(MF)) {
- // Save FP
- MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
- MachineLocation FPSrc(FramePtr);
- Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ Moves.push_back(MachineMove(LabelId, CSDst, CSSrc));
}
}
-
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const Function* Fn = MF.getFunction();
- const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+ const Function *Fn = MF.getFunction();
+ const X86Subtarget *Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- MachineBasicBlock::iterator MBBI = MBB.begin();
bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
- !Fn->doesNotThrow() ||
- UnwindTablesMandatory;
+ !Fn->doesNotThrow() || UnwindTablesMandatory;
+ uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ bool HasFP = hasFP(MF);
DebugLoc DL;
- // Prepare for frame info.
- unsigned FrameLabelId = 0;
-
- // Get the number of bytes to allocate from the FrameInfo.
- uint64_t StackSize = MFI->getStackSize();
-
- // Get desired stack alignment
- uint64_t MaxAlign = MFI->getMaxAlignment();
-
// Add RETADDR move area to callee saved frame size.
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0)
X86FI->setCalleeSavedFrameSize(
- X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
// If this is x86-64, the Red Zone is not disabled, we are a leaf function
// that uses at most 128 bytes of stack space, and we don't have a frame
// pointer, calls, or dynamic allocas, then we do not need to adjust the
// stack pointer (we fit in the Red Zone).
- bool DisableRedZone = Fn->hasFnAttr(Attribute::NoRedZone);
- if (Is64Bit && !DisableRedZone &&
+ if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) &&
!needsStackRealignment(MF) &&
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->hasCalls() && // No calls.
!Subtarget->isTargetWin64()) { // Win64 has no Red Zone
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
- if (hasFP(MF)) MinSize += SlotSize;
- StackSize = std::max(MinSize,
- StackSize > 128 ? StackSize - 128 : 0);
+ if (HasFP) MinSize += SlotSize;
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ } else if (Subtarget->isTargetWin64()) {
+ // We always need to allocate 32 bytes as a register spill area.
+ // FIXME: We might reuse these 32 bytes for leaf functions.
+ StackSize += 32;
MFI->setStackSize(StackSize);
}
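
The sizing rule reads more directly as a pure function. A sketch assuming SlotSize is 8 on x86-64 (the names are illustrative):

    #include <algorithm>
    #include <cstdint>

    static uint64_t redZoneStackSize(uint64_t StackSize, uint64_t CalleeSaved,
                                     bool HasFP, uint64_t SlotSize = 8) {
      uint64_t MinSize = CalleeSaved + (HasFP ? SlotSize : 0);
      // Up to 128 bytes below RSP are free; only the excess is allocated.
      return std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
    }
    // e.g. redZoneStackSize(96, 0, false) == 0: the frame fits in the zone.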
@@ -769,33 +904,73 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
if (TailCallReturnAddrDelta < 0) {
MachineInstr *MI =
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
- StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
- // The EFLAGS implicit def is dead.
- MI->getOperand(3).setIsDead();
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(-TailCallReturnAddrDelta);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
}
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
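
The table transcribes into a small classifier. The enum and Loc struct below are invented for illustration; the real mapping is applied later when the DWARF writer consumes these MachineMoves, and the table leaves implicit which operand carries OFFSET and REG in the final case (assumed here to be the destination):

    enum CFAKind {
      DefCFAOffset, DefCFA, DefCFARegister,
      OffsetExtendedSF, OffsetPlusReg, OffsetExtended
    };

    struct Loc { bool IsVirtualFP; unsigned Reg; int Offset; }; // hypothetical

    static CFAKind classifyMove(const Loc &Dst, const Loc &Src) {
      if (Dst.IsVirtualFP)
        return Src.IsVirtualFP ? DefCFAOffset : DefCFA;
      if (Src.IsVirtualFP) // DST names a plain register.
        return DefCFARegister;
      if (Dst.Offset < 0)
        return OffsetExtendedSF;
      return Dst.Reg < 64 ? OffsetPlusReg : OffsetExtended;
    }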
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+ const TargetData *TD = MF.getTarget().getTargetData();
uint64_t NumBytes = 0;
- if (hasFP(MF)) {
- // Calculate required stack adjustment
+ int stackGrowth =
+ (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize());
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
if (needsStackRealignment(MF))
- FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+ FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
- // Get the offset of the stack slot for the EBP register... which is
+ // Get the offset of the stack slot for the EBP register, which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
// Update the frame offset adjustment.
MFI->setOffsetAdjustment(-NumBytes);
- // Save EBP into the appropriate stack slot...
+ // Save EBP/RBP into the appropriate stack slot.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(FramePtr, RegState::Kill);
if (needsFrameMoves) {
- // Mark effective beginning of when frame pointer becomes valid.
- FrameLabelId = MMI->NextLabelID();
+ // Mark the place where EBP/RBP was saved.
+ unsigned FrameLabelId = MMI->NextLabelID();
BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
+
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ } else {
+ // FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ MachineLocation FPDst(MachineLocation::VirtualFP,
+ 2 * stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
}
// Update EBP with the new base value...
@@ -803,6 +978,17 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
.addReg(StackPtr);
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ unsigned FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
+
+ // Define the current CFA to use the EBP/RBP register.
+ MachineLocation FPDst(FramePtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
+ }
+
// Mark the FramePtr as live-in in every block except the entry.
for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
I != E; ++I)
@@ -814,6 +1000,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+
// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
}
@@ -822,11 +1009,30 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
}
// Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
while (MBBI != MBB.end() &&
(MBBI->getOpcode() == X86::PUSH32r ||
- MBBI->getOpcode() == X86::PUSH64r))
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
++MBBI;
+ if (!HasFP && needsFrameMoves) {
+ // Mark callee-saved push instruction.
+ unsigned LabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(LabelId);
+
+ // Define the current CFA rule to use the provided offset.
+ unsigned Ptr = StackSize ?
+ MachineLocation::VirtualFP : StackPtr;
+ MachineLocation SPDst(Ptr);
+ MachineLocation SPSrc(Ptr, StackOffset);
+ Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
+ StackOffset += stackGrowth;
+ }
+ }
+
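
Each skipped PUSH moves the CFA one slot further from the stack pointer, so the recorded offsets march by stackGrowth. A sketch of just the offset arithmetic, assuming x86-64 where stackGrowth is -8:

    #include <vector>

    static std::vector<int> pushCFAOffsets(unsigned NumPushes,
                                           int stackGrowth = -8) {
      std::vector<int> Offsets;
      int StackOffset = 2 * stackGrowth; // return address + first push
      for (unsigned i = 0; i != NumPushes; ++i) {
        Offsets.push_back(StackOffset); // rule emitted at this push's label
        StackOffset += stackGrowth;     // next push: -24, -32, ...
      }
      return Offsets;
    }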
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
@@ -883,12 +1089,29 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
}
- if (needsFrameMoves) {
- unsigned ReadyLabelId = 0;
- // Mark effective beginning of when frame pointer is ready.
- ReadyLabelId = MMI->NextLabelID();
- BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
- emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
+ if ((NumBytes || PushedRegs) && needsFrameMoves) {
+ // Mark end of stack pointer adjustment.
+ unsigned LabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(LabelId);
+
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP,
+ -StackSize + stackGrowth);
+ Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
+ } else {
+ // FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
+ }
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitCalleeSavedFrameMoves(MF, LabelId, HasFP ? FramePtr : StackPtr);
}
}
@@ -901,6 +1124,8 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
DebugLoc DL = MBBI->getDebugLoc();
switch (RetOpcode) {
+ default:
+ llvm_unreachable("Can only insert epilog into returning blocks");
case X86::RET:
case X86::RETI:
case X86::TCRETURNdi:
@@ -911,26 +1136,25 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
case X86::EH_RETURN64:
case X86::TAILJMPd:
case X86::TAILJMPr:
- case X86::TAILJMPm: break; // These are ok
- default:
- assert(0 && "Can only insert epilog into returning blocks");
+ case X86::TAILJMPm:
+ break; // These are ok
}
- // Get the number of bytes to allocate from the FrameInfo
+ // Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI->getStackSize();
uint64_t MaxAlign = MFI->getMaxAlignment();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t NumBytes = 0;
if (hasFP(MF)) {
- // Calculate required stack adjustment
+ // Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
if (needsStackRealignment(MF))
FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
NumBytes = FrameSize - CSSize;
- // pop EBP.
+ // Pop EBP.
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
} else {
@@ -942,9 +1166,11 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = prior(MBBI);
unsigned Opc = PI->getOpcode();
+
if (Opc != X86::POP32r && Opc != X86::POP64r &&
!PI->getDesc().isTerminator())
break;
+
--MBBI;
}
@@ -957,10 +1183,10 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! The same applies when the stack was
- // realigned
+ // realigned.
if (needsStackRealignment(MF)) {
// We cannot use LEA here, because the stack pointer was realigned. We need to
- // deallocate local frame back
+ // deallocate the local frame first.
if (CSSize) {
emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
MBBI = prior(LastCSPop);
@@ -972,17 +1198,18 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
} else if (MFI->hasVarSizedObjects()) {
if (CSSize) {
unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
- MachineInstr *MI = addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
- FramePtr, false, -CSSize);
+ MachineInstr *MI =
+ addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
MBB.insert(MBBI, MI);
- } else
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
- StackPtr).addReg(FramePtr);
-
- } else {
- // adjust stack pointer back: ESP += numbytes
- if (NumBytes)
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ } else {
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ .addReg(FramePtr);
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
}
// We're returning from function via eh_return.
@@ -993,9 +1220,9 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
StackPtr).addReg(DestAddr.getReg());
- // Tail call return: adjust the stack pointer and jump to callee
} else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
+ // Tail call return: adjust the stack pointer and jump to callee.
MBBI = prior(MBB.end());
MachineOperand &JumpTarget = MBBI->getOperand(0);
MachineOperand &StackAdjust = MBBI->getOperand(1);
@@ -1006,6 +1233,7 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
int Offset = 0;
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
// Incorporate the retaddr area.
Offset = StackAdj-MaxTCDelta;
assert(Offset >= 0 && "Offset should never be negative");
@@ -1032,6 +1260,7 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
// Add the return addr area delta back since we are not tail calling.
int delta = -1*X86FI->getTCReturnAddrDelta();
MBBI = prior(MBB.end());
+
// Check for a possible merge with the preceding ADD instruction.
delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
@@ -1039,18 +1268,16 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
}
unsigned X86RegisterInfo::getRARegister() const {
- if (Is64Bit)
- return X86::RIP; // Should have dwarf #16
- else
- return X86::EIP; // Should have dwarf #8
+ return Is64Bit ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
}
unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
return hasFP(MF) ? FramePtr : StackPtr;
}
-void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
- const {
+void
+X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const {
// Calculate the number of bytes used to store the return address.
int stackGrowth = (Is64Bit ? -8 : -4);
@@ -1066,18 +1293,18 @@ void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
}
unsigned X86RegisterInfo::getEHExceptionRegister() const {
- assert(0 && "What is the exception register");
+ llvm_unreachable("What is the exception register");
return 0;
}
unsigned X86RegisterInfo::getEHHandlerRegister() const {
- assert(0 && "What is the exception handler register");
+ llvm_unreachable("What is the exception handler register");
return 0;
}
namespace llvm {
-unsigned getX86SubSuperRegister(unsigned Reg, MVT VT, bool High) {
- switch (VT.getSimpleVT()) {
+unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: return Reg;
case MVT::i8:
if (High) {
@@ -1264,14 +1491,21 @@ namespace {
RegNum < RI.getLastVirtReg(); ++RegNum)
MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
- FFI->setMaxAlignment(MaxAlign);
+ if (FFI->getMaxAlignment() == MaxAlign)
+ return false;
- return false;
+ FFI->setMaxAlignment(MaxAlign);
+ return true;
}
virtual const char *getPassName() const {
return "X86 Maximal Stack Alignment Calculator";
}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
char MSAC::ID = 0;
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 33b9f5e..f635707 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -93,9 +93,16 @@ public:
/// Code Generation virtual methods...
///
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ virtual const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B, unsigned Idx) const;
+
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
- const TargetRegisterClass *getPointerRegClass() const;
+ const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
/// getCrossCopyRegClass - Returns a legal register class to copy a register
/// in the specified class to or from. Returns NULL if it is possible to copy
@@ -125,23 +132,25 @@ public:
bool hasReservedCallFrame(MachineFunction &MF) const;
+ bool hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const;
+
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
- void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
+ void emitCalleeSavedFrameMoves(MachineFunction &MF, unsigned LabelId,
+ unsigned FramePtr) const;
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
- void emitFrameMoves(MachineFunction &MF,
- unsigned FrameLabelId, unsigned ReadyLabelId) const;
-
// Debug information queries.
unsigned getRARegister() const;
unsigned getFrameRegister(MachineFunction &MF) const;
@@ -155,8 +164,8 @@ public:
// getX86SubSuperRegister - X86 utility function. It returns the sub or super
// register of a specific X86 register.
-// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
-unsigned getX86SubSuperRegister(unsigned, MVT, bool High=false);
+// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) returns X86::AX
+unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false);
} // End llvm namespace
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 2e6f017..7bf074d 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -270,42 +270,27 @@ def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
// cannot be encoded.
def GR8 : RegisterClass<"X86", [i8], 8,
- [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+ [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate SPL or BPL.
- static const unsigned X86_GR8_AO_64_fp[] = {
- X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
- X86::R8B, X86::R9B, X86::R10B, X86::R11B,
- X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B
- };
- // If not, just don't allocate SPL.
static const unsigned X86_GR8_AO_64[] = {
X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
X86::R8B, X86::R9B, X86::R10B, X86::R11B,
X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL
};
- // In 32-mode, none of the 8-bit registers aliases EBP or ESP.
- static const unsigned X86_GR8_AO_32[] = {
- X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
- };
GR8Class::iterator
GR8Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
- if (!Subtarget.is64Bit())
- return X86_GR8_AO_32;
- else if (RI->hasFP(MF))
- return X86_GR8_AO_64_fp;
- else
+ if (Subtarget.is64Bit())
return X86_GR8_AO_64;
+ else
+ return begin();
}
GR8Class::iterator
@@ -313,17 +298,20 @@ def GR8 : RegisterClass<"X86", [i8], 8,
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
if (!Subtarget.is64Bit())
- return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+ // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ return begin() + 8;
else if (RI->hasFP(MF))
- return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+ // If so, don't allocate SPL or BPL.
+ return array_endof(X86_GR8_AO_64) - 1;
else
- return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+ // If not, just don't allocate SPL.
+ return array_endof(X86_GR8_AO_64);
}
}];
}
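
These allocation-order bodies lean on array_endof, LLVM's typed one-past-the-end helper (it lived in llvm/ADT/STLExtras.h at the time); in essence:

    #include <cstddef>

    template <typename T, std::size_t N>
    inline T *array_endof(T (&Array)[N]) {
      return Array + N; // one past the last element, like std::end
    }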
-
def GR16 : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
@@ -333,42 +321,20 @@ def GR16 : RegisterClass<"X86", [i16], 16,
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate SP or BP.
- static const unsigned X86_GR16_AO_64_fp[] = {
- X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
- X86::R8W, X86::R9W, X86::R10W, X86::R11W,
- X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W
- };
- static const unsigned X86_GR16_AO_32_fp[] = {
- X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
- };
- // If not, just don't allocate SP.
static const unsigned X86_GR16_AO_64[] = {
X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
X86::R8W, X86::R9W, X86::R10W, X86::R11W,
X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP
};
- static const unsigned X86_GR16_AO_32[] = {
- X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
- };
GR16Class::iterator
GR16Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
- if (Subtarget.is64Bit()) {
- if (RI->hasFP(MF))
- return X86_GR16_AO_64_fp;
- else
- return X86_GR16_AO_64;
- } else {
- if (RI->hasFP(MF))
- return X86_GR16_AO_32_fp;
- else
- return X86_GR16_AO_32;
- }
+ if (Subtarget.is64Bit())
+ return X86_GR16_AO_64;
+ else
+ return begin();
}
GR16Class::iterator
@@ -377,21 +343,26 @@ def GR16 : RegisterClass<"X86", [i16], 16,
const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+ // If so, don't allocate SP or BP.
+ return array_endof(X86_GR16_AO_64) - 1;
else
- return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+ // If not, just don't allocate SP.
+ return array_endof(X86_GR16_AO_64);
} else {
+ // Does the function dedicate EBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+ // If so, don't allocate SP or BP.
+ return begin() + 6;
else
- return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+ // If not, just don't allocate SP.
+ return begin() + 7;
}
}
}];
}
-
def GR32 : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
@@ -401,42 +372,20 @@ def GR32 : RegisterClass<"X86", [i32], 32,
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate ESP or EBP.
- static const unsigned X86_GR32_AO_64_fp[] = {
- X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
- X86::R8D, X86::R9D, X86::R10D, X86::R11D,
- X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D
- };
- static const unsigned X86_GR32_AO_32_fp[] = {
- X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
- };
- // If not, just don't allocate ESP.
static const unsigned X86_GR32_AO_64[] = {
X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
X86::R8D, X86::R9D, X86::R10D, X86::R11D,
X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
};
- static const unsigned X86_GR32_AO_32[] = {
- X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
- };
GR32Class::iterator
GR32Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
- if (Subtarget.is64Bit()) {
- if (RI->hasFP(MF))
- return X86_GR32_AO_64_fp;
- else
- return X86_GR32_AO_64;
- } else {
- if (RI->hasFP(MF))
- return X86_GR32_AO_32_fp;
- else
- return X86_GR32_AO_32;
- }
+ if (Subtarget.is64Bit())
+ return X86_GR32_AO_64;
+ else
+ return begin();
}
GR32Class::iterator
@@ -445,21 +394,29 @@ def GR32 : RegisterClass<"X86", [i32], 32,
const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+ // If so, don't allocate ESP or EBP.
+ return array_endof(X86_GR32_AO_64) - 1;
else
- return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+ // If not, just don't allocate ESP.
+ return array_endof(X86_GR32_AO_64);
} else {
+ // Does the function dedicate EBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+ // If so, don't allocate ESP or EBP.
+ return begin() + 6;
else
- return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+ // If not, just don't allocate ESP.
+ return begin() + 7;
}
}
}];
}
-
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
def GR64 : RegisterClass<"X86", [i64], 64,
[RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP, RIP]> {
@@ -483,6 +440,11 @@ def GR64 : RegisterClass<"X86", [i64], 64,
}];
}
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]> {
+}
// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
@@ -509,38 +471,25 @@ def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
// On x86-64, GR64_NOREX, GR32_NOREX and GR16_NOREX are the classes
// of registers which do not by themselves require a REX prefix.
def GR8_NOREX : RegisterClass<"X86", [i8], 8,
- [AL, CL, DL, BL, AH, CH, DH, BH,
+ [AL, CL, DL, AH, CH, DH, BL, BH,
SIL, DIL, BPL, SPL]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate SPL or BPL.
- static const unsigned X86_GR8_NOREX_AO_64_fp[] = {
- X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL
- };
- // If not, just don't allocate SPL.
static const unsigned X86_GR8_NOREX_AO_64[] = {
X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL, X86::BPL
};
- // In 32-mode, none of the 8-bit registers aliases EBP or ESP.
- static const unsigned X86_GR8_NOREX_AO_32[] = {
- X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
- };
GR8_NOREXClass::iterator
GR8_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
- if (!Subtarget.is64Bit())
- return X86_GR8_NOREX_AO_32;
- else if (RI->hasFP(MF))
- return X86_GR8_NOREX_AO_64_fp;
- else
+ if (Subtarget.is64Bit())
return X86_GR8_NOREX_AO_64;
+ else
+ return begin();
}
GR8_NOREXClass::iterator
@@ -548,15 +497,16 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
if (!Subtarget.is64Bit())
- return X86_GR8_NOREX_AO_32 +
- (sizeof(X86_GR8_NOREX_AO_32) / sizeof(unsigned));
+ // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ return begin() + 8;
else if (RI->hasFP(MF))
- return X86_GR8_NOREX_AO_64_fp +
- (sizeof(X86_GR8_NOREX_AO_64_fp) / sizeof(unsigned));
+ // If so, don't allocate SPL or BPL.
+ return array_endof(X86_GR8_NOREX_AO_64) - 1;
else
- return X86_GR8_NOREX_AO_64 +
- (sizeof(X86_GR8_NOREX_AO_64) / sizeof(unsigned));
+ // If not, just don't allocate SPL.
+ return array_endof(X86_GR8_NOREX_AO_64);
}
}];
}
@@ -564,38 +514,20 @@ def GR16_NOREX : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP]> {
let SubRegClassList = [GR8_NOREX, GR8_NOREX];
let MethodProtos = [{
- iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate SP or BP.
- static const unsigned X86_GR16_AO_fp[] = {
- X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
- };
- // If not, just don't allocate SP.
- static const unsigned X86_GR16_AO[] = {
- X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
- };
-
- GR16_NOREXClass::iterator
- GR16_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
- const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
- if (RI->hasFP(MF))
- return X86_GR16_AO_fp;
- else
- return X86_GR16_AO;
- }
-
GR16_NOREXClass::iterator
GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR16_AO_fp+(sizeof(X86_GR16_AO_fp)/sizeof(unsigned));
+ // If so, don't allocate SP or BP.
+ return end() - 2;
else
- return X86_GR16_AO + (sizeof(X86_GR16_AO) / sizeof(unsigned));
+ // If not, just don't allocate SP.
+ return end() - 1;
}
}];
}
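
The recurring trick in these allocation_order_end overrides: the reserved registers (SP, plus BP when it is dedicated as the frame pointer) are deliberately listed last in the class, so trimming the tail of the range hides them from the allocator. In miniature:

    #include <iterator>

    static const unsigned Regs[] = {1, 2, 3, /*BP*/ 6, /*SP*/ 7}; // toy ids

    static const unsigned *orderEnd(bool HasFP) {
      // Never allocate SP; also skip BP when it serves as the frame pointer.
      return std::end(Regs) - (HasFP ? 2 : 1);
    }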
@@ -604,89 +536,149 @@ def GR32_NOREX : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
let MethodProtos = [{
- iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate ESP or EBP.
- static const unsigned X86_GR32_NOREX_AO_fp[] = {
- X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
- };
- // If not, just don't allocate ESP.
- static const unsigned X86_GR32_NOREX_AO[] = {
- X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
- };
-
GR32_NOREXClass::iterator
- GR32_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR32_NOREX_AO_fp;
+ // If so, don't allocate ESP or EBP.
+ return end() - 2;
else
- return X86_GR32_NOREX_AO;
+ // If not, just don't allocate ESP.
+ return end() - 1;
}
-
- GR32_NOREXClass::iterator
- GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ }];
+}
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ // Does the function dedicate RBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR32_NOREX_AO_fp +
- (sizeof(X86_GR32_NOREX_AO_fp) / sizeof(unsigned));
+ // If so, don't allocate RIP, RSP or RBP.
+ return end() - 3;
else
- return X86_GR32_NOREX_AO +
- (sizeof(X86_GR32_NOREX_AO) / sizeof(unsigned));
+ // If not, just don't allocate RIP or RSP.
+ return end() - 2;
}
}];
}
-// GR64_NOREX - GR64 registers which do not require a REX prefix.
-def GR64_NOREX : RegisterClass<"X86", [i64], 64,
- [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP]> {
- let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+ let SubRegClassList = [GR8, GR8, GR16];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
- // Does the function dedicate RBP / EBP to being a frame ptr?
- // If so, don't allocate RSP or RBP.
- static const unsigned X86_GR64_NOREX_AO_fp[] = {
- X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX
- };
- // If not, just don't allocate RSP.
- static const unsigned X86_GR64_NOREX_AO[] = {
- X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX, X86::RBP
+ static const unsigned X86_GR32_NOSP_AO_64[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
};
- GR64_NOREXClass::iterator
- GR64_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ GR32_NOSPClass::iterator
+ GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR32_NOSP_AO_64;
+ else
+ return begin();
+ }
+
+ GR32_NOSPClass::iterator
+ GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
- if (RI->hasFP(MF))
- return X86_GR64_NOREX_AO_fp;
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF))
+ // If so, don't allocate EBP.
+ return array_endof(X86_GR32_NOSP_AO_64) - 1;
+ else
+ // If not, any reg in this class is ok.
+ return array_endof(X86_GR32_NOSP_AO_64);
+ } else {
+ // Does the function dedicate EBP to being a frame ptr?
+ if (RI->hasFP(MF))
+ // If so, don't allocate EBP.
+ return begin() + 6;
+ else
+ // If not, any reg in this class is ok.
+ return begin() + 7;
+ }
+ }
+ }];
+}
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP]> {
+ let SubRegClassList = [GR8, GR8, GR16, GR32_NOSP];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOSPClass::iterator
+ GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return begin(); // None of these are allocatable in 32-bit.
+ if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+ return end()-1; // If so, don't allocate RBP
else
- return X86_GR64_NOREX_AO;
+ return end(); // If not, any reg in this class is ok.
}
+ }];
+}
- GR64_NOREXClass::iterator
- GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOREX_NOSPClass::iterator
+ GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ // Does the function dedicate RBP to being a frame ptr?
if (RI->hasFP(MF))
- return X86_GR64_NOREX_AO_fp +
- (sizeof(X86_GR64_NOREX_AO_fp) / sizeof(unsigned));
+ // If so, don't allocate RBP.
+ return end() - 1;
else
- return X86_GR64_NOREX_AO +
- (sizeof(X86_GR64_NOREX_AO) / sizeof(unsigned));
+ // If not, any reg in this class is ok.
+ return end();
}
}];
}
// A class to support the 'A' assembler constraint: EAX then EDX.
-def GRAD : RegisterClass<"X86", [i32], 32, [EAX, EDX]>;
+def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD];
+}
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32,
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
index b225f48..990962d 100644
--- a/lib/Target/X86/X86Relocations.h
+++ b/lib/Target/X86/X86Relocations.h
@@ -20,21 +20,31 @@ namespace llvm {
namespace X86 {
/// RelocationType - An enum for the x86 relocation codes. Note that
/// the terminology here doesn't follow x86 convention - word means
- /// 32-bit and dword means 64-bit.
+ /// 32-bit and dword means 64-bit. The relocations are resolved by the
+ /// JIT or object-code emitters; this is transparent to the x86 code
+ /// emitter, but the JIT and object-code emitters treat them differently.
enum RelocationType {
- // reloc_pcrel_word - PC relative relocation, add the relocated value to
- // the value already in memory, after we adjust it for where the PC is.
+ /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+ /// the value already in memory, after we adjust it for where the PC is.
reloc_pcrel_word = 0,
- // reloc_picrel_word - PIC base relative relocation, add the relocated
- // value to the value already in memory, after we adjust it for where the
- // PIC base is.
+ /// reloc_picrel_word - PIC base relative relocation, add the relocated
+ /// value to the value already in memory, after we adjust it for where the
+ /// PIC base is.
reloc_picrel_word = 1,
-
- // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
- // add the relocated value to the value already in memory.
+
+ /// reloc_absolute_word - Absolute relocation, just add the relocated
+ /// value to the value already in memory.
reloc_absolute_word = 2,
- reloc_absolute_dword = 3
+
+ /// reloc_absolute_word_sext - Absolute relocation, just add the relocated
+ /// value to the value already in memory. In object files, it represents a
+ /// value which must be sign-extended when resolving the relocation.
+ reloc_absolute_word_sext = 3,
+
+ /// reloc_absolute_dword - Absolute relocation, just add the relocated
+ /// value to the value already in memory.
+ reloc_absolute_dword = 4
};
}
}
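
Per the comments, every kind is additive: the resolver adds the computed value to whatever already sits at the fixup. A sketch of that contract with illustrative names (not the real JIT interface); the kind numbering follows the patched enum:

    #include <cstdint>
    #include <cstring>

    static void applyReloc(unsigned Kind, uint8_t *Fixup, uint64_t Value,
                           uint64_t PC, uint64_t PICBase) {
      if (Kind == 0) Value -= PC;      // reloc_pcrel_word
      if (Kind == 1) Value -= PICBase; // reloc_picrel_word
      if (Kind <= 3) {                 // all 32-bit ("word") kinds
        uint32_t Old;
        std::memcpy(&Old, Fixup, 4);
        Old += static_cast<uint32_t>(Value);
        std::memcpy(Fixup, &Old, 4);
      } else {                         // reloc_absolute_dword
        uint64_t Old;
        std::memcpy(&Old, Fixup, 8);
        Old += Value;
        std::memcpy(Fixup, &Old, 8);
      }
    }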
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 8506fa6..fb76aeb 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -13,80 +13,111 @@
#define DEBUG_TYPE "subtarget"
#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
#include "X86GenSubtarget.inc"
-#include "llvm/Module.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/GlobalValue.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#if defined(_MSC_VER)
- #include <intrin.h>
+#include <intrin.h>
#endif
-static cl::opt<X86Subtarget::AsmWriterFlavorTy>
-AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
- cl::desc("Choose style of code to emit from X86 backend:"),
- cl::values(
- clEnumValN(X86Subtarget::ATT, "att", "Emit AT&T-style assembly"),
- clEnumValN(X86Subtarget::Intel, "intel", "Emit Intel-style assembly"),
- clEnumValEnd));
-
-
-/// True if accessing the GV requires an extra load. For Windows, dllimported
-/// symbols are indirect, loading the value at address GV rather then the
-/// value of GV itself. This means that the GlobalAddress must be in the base
-/// or index register of the address, not the GV offset field.
-bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
- const TargetMachine& TM,
- bool isDirectCall) const
-{
- // FIXME: PIC
- if (TM.getRelocationModel() != Reloc::Static &&
- TM.getCodeModel() != CodeModel::Large) {
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+ // DLLImport only exists on Windows; it is implemented as a load from a
+ // DLLIMPORT stub.
+ if (GV->hasDLLImportLinkage())
+ return X86II::MO_DLLIMPORT;
+
+ // GVs with ghost linkage (in JIT lazy compilation mode) do not require an
+ // extra load from a stub.
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+
+ // X86-64 in PIC mode.
+ if (isPICStyleRIPRel()) {
+ // Large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
+
if (isTargetDarwin()) {
- if (isDirectCall)
- return false;
- bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
- if (GV->hasHiddenVisibility() &&
- (Is64Bit || (!isDecl && !GV->hasCommonLinkage())))
- // If symbol visibility is hidden, the extra load is not needed if
- // target is x86-64 or the symbol is definitely defined in the current
- // translation unit.
- return false;
- return !isDirectCall && (isDecl || GV->isWeakForLinker());
- } else if (isTargetELF()) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ if (GV->hasDefaultVisibility() &&
+ (isDecl || GV->isWeakForLinker()))
+ return X86II::MO_GOTPCREL;
+ } else {
+ assert(isTargetELF() && "Unknown rip-relative target");
+
// Extra load is needed for all externally visible.
- if (isDirectCall)
- return false;
- if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
- return false;
- return true;
- } else if (isTargetCygMing() || isTargetWindows()) {
- return (GV->hasDLLImportLinkage());
+ if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+ return X86II::MO_GOTPCREL;
}
+
+ return X86II::MO_NO_FLAG;
}
- return false;
-}
+
+ if (isPICStyleGOT()) { // 32-bit ELF targets.
+ // An extra load is needed for all externally visible symbols.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return X86II::MO_GOTOFF;
+ return X86II::MO_GOT;
+ }
+
+ if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
+ // Determine whether we have a stub reference and/or whether the reference
+ // is relative to the PIC base or not.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_PIC_BASE_OFFSET;
-/// True if accessing the GV requires a register. This is a superset of the
-/// cases where GVRequiresExtraLoad is true. Some variations of PIC require
-/// a register, but not an extra load.
-bool X86Subtarget::GVRequiresRegister(const GlobalValue *GV,
- const TargetMachine& TM,
- bool isDirectCall) const
-{
- if (GVRequiresExtraLoad(GV, TM, isDirectCall))
- return true;
- // Code below here need only consider cases where GVRequiresExtraLoad
- // returns false.
- if (TM.getRelocationModel() == Reloc::PIC_)
- return !isDirectCall &&
- (GV->hasLocalLinkage() || GV->hasExternalLinkage());
- return false;
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage()) {
+ // Hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+ }
+
+ // Otherwise, no stub.
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
+ // Determine whether we have a stub reference.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_NO_FLAG;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY;
+
+ // Otherwise, no stub.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // Direct static reference to global.
+ return X86II::MO_NO_FLAG;
}
+
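
A small derived predicate, collecting which returned flags imply a load through a stub or GOT slot rather than a direct reference; the set below is read off the comments above, so treat it as an assumption:

    static bool flagRequiresExtraLoad(unsigned char F) {
      switch (F) {
      case X86II::MO_DLLIMPORT:
      case X86II::MO_GOTPCREL:
      case X86II::MO_GOT:
      case X86II::MO_DARWIN_NONLAZY:
      case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
      case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
        return true;  // address is loaded from a stub/GOT entry
      default:
        return false; // direct or base-relative reference, no load
      }
    }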
/// getBZeroEntry - This function returns the name of a function which has an
/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
@@ -120,9 +151,9 @@ unsigned X86Subtarget::getSpecialAddressLatency() const {
/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
/// specified arguments. If we can't run cpuid on the host, return true.
-bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
- unsigned *rECX, unsigned *rEDX) {
-#if defined(__x86_64__) || defined(_M_AMD64)
+static bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
#if defined(__GNUC__)
// gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
asm ("movq\t%%rbx, %%rsi\n\t"
@@ -192,18 +223,19 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
char c[12];
} text;
- if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+ if (GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
return;
- X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+ GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
- if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
- if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
- if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
- if (ECX & 0x1) X86SSELevel = SSE3;
- if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
- if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
- if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
+ if ((EDX >> 15) & 1) HasCMov = true;
+ if ((EDX >> 23) & 1) X86SSELevel = MMX;
+ if ((EDX >> 25) & 1) X86SSELevel = SSE1;
+ if ((EDX >> 26) & 1) X86SSELevel = SSE2;
+ if (ECX & 0x1) X86SSELevel = SSE3;
+ if ((ECX >> 9) & 1) X86SSELevel = SSSE3;
+ if ((ECX >> 19) & 1) X86SSELevel = SSE41;
+ if ((ECX >> 20) & 1) X86SSELevel = SSE42;
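
The detection is plain bit-testing of the CPUID.1 registers; a tiny helper (hypothetical, not LLVM's) makes the positions explicit:

    #include <cstdint>

    static bool cpuidBit(uint32_t Reg, unsigned Bit) {
      return (Reg >> Bit) & 1;
    }

    // With EDX/ECX from CPUID leaf 1, per the code above:
    //   cpuidBit(EDX, 15) -> CMOV     cpuidBit(EDX, 26) -> SSE2
    //   cpuidBit(ECX, 0)  -> SSE3     cpuidBit(ECX, 20) -> SSE4.2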
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
@@ -218,7 +250,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
DetectFamilyModel(EAX, Family, Model);
IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
- X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
HasX86_64 = (EDX >> 29) & 0x1;
HasSSE4A = IsAMD && ((ECX >> 6) & 0x1);
HasFMA4 = IsAMD && ((ECX >> 16) & 0x1);
@@ -227,13 +259,13 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
static const char *GetCurrentX86CPU() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
- if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ if (GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
return "generic";
unsigned Family = 0;
unsigned Model = 0;
DetectFamilyModel(EAX, Family, Model);
- X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
bool Em64T = (EDX >> 29) & 0x1;
bool HasSSE3 = (ECX & 0x1);
@@ -242,7 +274,7 @@ static const char *GetCurrentX86CPU() {
char c[12];
} text;
- X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+ GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
if (memcmp(text.c, "GenuineIntel", 12) == 0) {
switch (Family) {
case 3:
@@ -319,9 +351,7 @@ static const char *GetCurrentX86CPU() {
}
case 15:
if (HasSSE3) {
- switch (Model) {
- default: return "k8-sse3";
- }
+ return "k8-sse3";
} else {
switch (Model) {
case 1: return "opteron";
@@ -330,9 +360,7 @@ static const char *GetCurrentX86CPU() {
}
}
case 16:
- switch (Model) {
- default: return "amdfam10";
- }
+ return "amdfam10";
default:
return "generic";
}
@@ -341,11 +369,12 @@ static const char *GetCurrentX86CPU() {
}
}
-X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
- : AsmFlavor(AsmWriterFlavor)
- , PICStyle(PICStyles::None)
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
+ bool is64Bit)
+ : PICStyle(PICStyles::None)
, X86SSELevel(NoMMXSSE)
, X863DNowLevel(NoThreeDNow)
+ , HasCMov(false)
, HasX86_64(false)
, HasSSE4A(false)
, HasAVX(false)
@@ -384,15 +413,14 @@ X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
if (Is64Bit)
HasX86_64 = true;
- DOUT << "Subtarget features: SSELevel " << X86SSELevel
- << ", 3DNowLevel " << X863DNowLevel
- << ", 64bit " << HasX86_64 << "\n";
+ DEBUG(errs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n");
assert((!Is64Bit || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
- const std::string& TT = M.getTargetTriple();
if (TT.length() > 5) {
size_t Pos;
if ((Pos = TT.find("-darwin")) != std::string::npos) {
@@ -415,38 +443,10 @@ X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
TargetType = isWindows;
} else if (TT.find("windows") != std::string::npos) {
TargetType = isWindows;
- }
- else if (TT.find("-cl") != std::string::npos) {
+ } else if (TT.find("-cl") != std::string::npos) {
TargetType = isDarwin;
DarwinVers = 9;
}
- } else if (TT.empty()) {
-#if defined(__CYGWIN__)
- TargetType = isCygwin;
-#elif defined(__MINGW32__) || defined(__MINGW64__)
- TargetType = isMingw;
-#elif defined(__APPLE__)
- TargetType = isDarwin;
-#if __APPLE_CC__ > 5400
- DarwinVers = 9; // GCC 5400+ is Leopard.
-#else
- DarwinVers = 8; // Minimum supported darwin is Tiger.
-#endif
-
-#elif defined(_WIN32) || defined(_WIN64)
- TargetType = isWindows;
-#elif defined(__linux__)
- // Linux doesn't imply ELF, but we don't currently support anything else.
- TargetType = isELF;
- IsLinux = true;
-#endif
- }
-
- // If the asm syntax hasn't been overridden on the command line, use whatever
- // the target wants.
- if (AsmFlavor == X86Subtarget::Unset) {
- AsmFlavor = (TargetType == isWindows)
- ? X86Subtarget::Intel : X86Subtarget::ATT;
}
// Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 0d1434f..a2e368d 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -18,23 +18,22 @@
#include <string>
namespace llvm {
-class Module;
class GlobalValue;
class TargetMachine;
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+///
namespace PICStyles {
enum Style {
- Stub, GOT, RIPRel, WinPIC, None
+ StubPIC, // Used on i386-darwin in -fPIC mode.
+ StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+ GOT, // Used on many 32-bit unices in -fPIC mode.
+ RIPRel, // Used on X86-64 when not in -static mode.
+ None // Set when in -static mode (not PIC or DynamicNoPIC mode).
};
}
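
The style itself is chosen elsewhere (X86TargetMachine); a rough decision sketch inferred from the enum comments, with booleans standing in for the real triple and relocation-model queries:

    static PICStyles::Style pickStyle(bool Is64Bit, bool IsDarwin, bool IsELF,
                                      bool IsPIC, bool IsDynamicNoPIC) {
      if (Is64Bit && (IsPIC || IsDynamicNoPIC))
        return PICStyles::RIPRel;       // X86-64 when not -static
      if (IsDarwin && IsPIC)
        return PICStyles::StubPIC;      // i386-darwin -fPIC
      if (IsDarwin && IsDynamicNoPIC)
        return PICStyles::StubDynamicNoPIC;
      if (IsELF && IsPIC)
        return PICStyles::GOT;          // 32-bit ELF -fPIC
      return PICStyles::None;           // -static
    }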
class X86Subtarget : public TargetSubtarget {
-public:
- enum AsmWriterFlavorTy {
- // Note: This numbering has to match the GCC assembler dialects for inline
- // asm alternatives to work right.
- ATT = 0, Intel = 1, Unset
- };
protected:
enum X86SSEEnum {
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
@@ -44,10 +43,6 @@ protected:
NoThreeDNow, ThreeDNow, ThreeDNowA
};
- /// AsmFlavor - Which x86 asm dialect to use.
- ///
- AsmWriterFlavorTy AsmFlavor;
-
/// PICStyle - Which PIC style to use
///
PICStyles::Style PICStyle;
@@ -60,6 +55,10 @@ protected:
///
X863DNowEnum X863DNowLevel;
+ /// HasCMov - True if this processor has conditional move instructions
+ /// (generally pentium pro+).
+ bool HasCMov;
+
/// HasX86_64 - True if the processor supports X86-64 instructions.
///
bool HasX86_64;
@@ -95,7 +94,7 @@ protected:
unsigned MaxInlineSizeThreshold;
private:
- /// Is64Bit - True if the processor supports 64-bit instructions and module
+ /// Is64Bit - True if the processor supports 64-bit instructions and
/// pointer size is 64 bit.
bool Is64Bit;
@@ -105,9 +104,9 @@ public:
} TargetType;
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+ X86Subtarget(const std::string &TT, const std::string &FS, bool is64Bit);
/// getStackAlignment - Returns the minimum alignment known to hold for the
/// stack frame on entry to the function and which must be maintained by every
@@ -145,66 +144,67 @@ public:
bool hasAVX() const { return HasAVX; }
bool hasFMA3() const { return HasFMA3; }
bool hasFMA4() const { return HasFMA4; }
-
bool isBTMemSlow() const { return IsBTMemSlow; }
- unsigned getAsmFlavor() const {
- return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
- }
-
- bool isFlavorAtt() const { return AsmFlavor == ATT; }
- bool isFlavorIntel() const { return AsmFlavor == Intel; }
-
bool isTargetDarwin() const { return TargetType == isDarwin; }
- bool isTargetELF() const {
- return TargetType == isELF;
- }
+ bool isTargetELF() const { return TargetType == isELF; }
+
bool isTargetWindows() const { return TargetType == isWindows; }
bool isTargetMingw() const { return TargetType == isMingw; }
- bool isTargetCygMing() const { return (TargetType == isMingw ||
- TargetType == isCygwin); }
bool isTargetCygwin() const { return TargetType == isCygwin; }
+ bool isTargetCygMing() const {
+ return TargetType == isMingw || TargetType == isCygwin;
+ }
+
+ /// isTargetCOFF - Return true if this is any COFF/Windows target variant.
+ bool isTargetCOFF() const {
+ return TargetType == isMingw || TargetType == isCygwin ||
+ TargetType == isWindows;
+ }
+
bool isTargetWin64() const {
- return (Is64Bit && (TargetType == isMingw || TargetType == isWindows));
+ return Is64Bit && (TargetType == isMingw || TargetType == isWindows);
}
std::string getDataLayout() const {
const char *p;
if (is64Bit())
p = "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128";
- else {
- if (isTargetDarwin())
- p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128";
- else
- p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32";
- }
+ else if (isTargetDarwin())
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128";
+ else
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32";
return std::string(p);
}
bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
- bool isPICStyleStub() const { return PICStyle == PICStyles::Stub; }
bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
- bool isPICStyleWinPIC() const { return PICStyle == PICStyles::WinPIC; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isPICStyleStubNoDynamic() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC;
+ }
+  bool isPICStyleStubAny() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC ||
+           PICStyle == PICStyles::StubPIC;
+  }
- /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
+ /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
+ /// 10 = Snow Leopard, etc.
unsigned getDarwinVers() const { return DarwinVers; }
/// isLinux - Return true if the target is "Linux".
bool isLinux() const { return IsLinux; }
- /// True if accessing the GV requires an extra load. For Windows, dllimported
-  /// symbols are indirect, loading the value at address GV rather than the
- /// value of GV itself. This means that the GlobalAddress must be in the base
- /// or index register of the address, not the GV offset field.
- bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
- bool isDirectCall) const;
-
- /// True if accessing the GV requires a register. This is a superset of the
- /// cases where GVRequiresExtraLoad is true. Some variations of PIC require
- /// a register, but not an extra load.
- bool GVRequiresRegister(const GlobalValue* GV, const TargetMachine& TM,
- bool isDirectCall) const;
+
+ /// ClassifyGlobalReference - Classify a global variable reference for the
+ /// current subtarget according to how we should reference it in a non-pcrel
+ /// context.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM)const;
/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
/// to immediate address.
@@ -224,13 +224,6 @@ public:
unsigned getSpecialAddressLatency() const;
};
-namespace X86 {
- /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
- /// the specified arguments. If we can't run cpuid on the host, return true.
- bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
- unsigned *rECX, unsigned *rEDX);
-}
-
} // End llvm namespace
#endif
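
With the X86::GetCpuIDAndInfo helper removed from this header, host CPU introspection no longer lives here. On GCC or Clang x86 hosts the same information is available through the <cpuid.h> builtin wrapper; a small standalone sketch (illustrative only, x86 hosts only):

    #include <cpuid.h>
    #include <cstdio>

    int main() {
      unsigned eax, ebx, ecx, edx;
      // Leaf 1 returns the basic feature flags; bit 26 of EDX is SSE2.
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        std::printf("SSE2: %s\n", (edx & (1u << 26)) ? "yes" : "no");
      return 0;
    }
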
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index b000914..a61de1c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -11,172 +11,134 @@
//
//===----------------------------------------------------------------------===//
-#include "X86TargetAsmInfo.h"
+#include "X86MCAsmInfo.h"
#include "X86TargetMachine.h"
#include "X86.h"
-#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-/// X86TargetMachineModule - Note that this is used on hosts that cannot link
-/// in a library unless there are references into the library. In particular,
-/// it seems that it is not possible to get things to work on Win32 without
-/// this. Though it is unused, do not remove it.
-extern "C" int X86TargetMachineModule;
-int X86TargetMachineModule = 0;
-
-// Register the target.
-static RegisterTarget<X86_32TargetMachine>
-X("x86", "32-bit X86: Pentium-Pro and above");
-static RegisterTarget<X86_64TargetMachine>
-Y("x86-64", "64-bit X86: EM64T and AMD64");
-
-// Force static initialization.
-extern "C" void LLVMInitializeX86Target() { }
-
-// No assembler printer by default
-X86TargetMachine::AsmPrinterCtorFn X86TargetMachine::AsmPrinterCtor = 0;
-
-const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
- if (Subtarget.isFlavorIntel())
- return new X86WinTargetAsmInfo(*this);
- else
- switch (Subtarget.TargetType) {
- case X86Subtarget::isDarwin:
- return new X86DarwinTargetAsmInfo(*this);
- case X86Subtarget::isELF:
- return new X86ELFTargetAsmInfo(*this);
- case X86Subtarget::isMingw:
- case X86Subtarget::isCygwin:
- return new X86COFFTargetAsmInfo(*this);
- case X86Subtarget::isWindows:
- return new X86WinTargetAsmInfo(*this);
- default:
- return new X86GenericTargetAsmInfo(*this);
- }
-}
-
-unsigned X86_32TargetMachine::getJITMatchQuality() {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
- return 10;
-#endif
- return 0;
-}
-
-unsigned X86_64TargetMachine::getJITMatchQuality() {
-#if defined(__x86_64__) || defined(_M_AMD64)
- return 10;
-#endif
- return 0;
+static const MCAsmInfo *createMCAsmInfo(const Target &T,
+ const StringRef &TT) {
+ Triple TheTriple(TT);
+ switch (TheTriple.getOS()) {
+ case Triple::Darwin:
+ return new X86MCAsmInfoDarwin(TheTriple);
+ case Triple::MinGW32:
+ case Triple::MinGW64:
+ case Triple::Cygwin:
+ return new X86MCAsmInfoCOFF(TheTriple);
+ case Triple::Win32:
+ return new X86WinMCAsmInfo(TheTriple);
+ default:
+ return new X86ELFMCAsmInfo(TheTriple);
+ }
}
-unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
- // We strongly match "i[3-9]86-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
- TT[4] == '-' && TT[1] - '3' < 6)
- return 20;
- // If the target triple is something non-X86, we don't match.
- if (!TT.empty()) return 0;
+extern "C" void LLVMInitializeX86Target() {
+ // Register the target.
+ RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+ RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer32)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
+ // Register the target asm info.
+ RegisterAsmInfoFn A(TheX86_32Target, createMCAsmInfo);
+ RegisterAsmInfoFn B(TheX86_64Target, createMCAsmInfo);
- return getJITMatchQuality()/2;
+ // Register the code emitter.
+ TargetRegistry::RegisterCodeEmitter(TheX86_32Target, createX86MCCodeEmitter);
+ TargetRegistry::RegisterCodeEmitter(TheX86_64Target, createX86MCCodeEmitter);
}
-unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
- // We strongly match "x86_64-*".
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
- TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
- return 20;
-
- // We strongly match "amd64-*".
- if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
- TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
- return 20;
-
- // If the target triple is something non-X86-64, we don't match.
- if (!TT.empty()) return 0;
-
- if (M.getEndianness() == Module::LittleEndian &&
- M.getPointerSize() == Module::Pointer64)
- return 10; // Weak match
- else if (M.getEndianness() != Module::AnyEndianness ||
- M.getPointerSize() != Module::AnyPointerSize)
- return 0; // Match for some other target
- return getJITMatchQuality()/2;
-}
-
-X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS)
- : X86TargetMachine(M, FS, false) {
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : X86TargetMachine(T, TT, FS, false) {
}
-X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
- : X86TargetMachine(M, FS, true) {
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : X86TargetMachine(T, TT, FS, true) {
}
-/// X86TargetMachine ctor - Create an ILP32 architecture model
+/// X86TargetMachine ctor - Create an X86 target.
///
-X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
- bool is64Bit)
- : Subtarget(M, FS, is64Bit),
+X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, is64Bit),
DataLayout(Subtarget.getDataLayout()),
FrameInfo(TargetFrameInfo::StackGrowsDown,
- Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+ Subtarget.getStackAlignment(),
+ (Subtarget.isTargetWin64() ? -40 :
+ (Subtarget.is64Bit() ? -8 : -4))),
InstrInfo(*this), JITInfo(*this), TLInfo(*this), ELFWriterInfo(*this) {
DefRelocModel = getRelocationModel();
- // FIXME: Correctly select PIC model for Win64 stuff
+
+ // If no relocation model was picked, default as appropriate for the target.
if (getRelocationModel() == Reloc::Default) {
- if (Subtarget.isTargetDarwin() ||
- (Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64()))
- setRelocationModel(Reloc::DynamicNoPIC);
- else
+ if (!Subtarget.isTargetDarwin())
setRelocationModel(Reloc::Static);
+ else if (Subtarget.is64Bit())
+ setRelocationModel(Reloc::PIC_);
+ else
+ setRelocationModel(Reloc::DynamicNoPIC);
}
- // ELF doesn't have a distinct dynamic-no-PIC model. Dynamic-no-PIC
- // is defined as a model for code which may be used in static or
- // dynamic executables but not necessarily a shared library. On ELF
- // implement this by using the Static model.
- if (Subtarget.isTargetELF() &&
- getRelocationModel() == Reloc::DynamicNoPIC)
- setRelocationModel(Reloc::Static);
-
- if (Subtarget.is64Bit()) {
- // No DynamicNoPIC support under X86-64.
- if (getRelocationModel() == Reloc::DynamicNoPIC)
+ assert(getRelocationModel() != Reloc::Default &&
+ "Relocation mode not picked");
+
+ // If no code model is picked, default to small.
+ if (getCodeModel() == CodeModel::Default)
+ setCodeModel(CodeModel::Small);
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+ // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode; on x86-64 we use PIC.
+ if (getRelocationModel() == Reloc::DynamicNoPIC) {
+ if (is64Bit)
setRelocationModel(Reloc::PIC_);
- // Default X86-64 code model is small.
- if (getCodeModel() == CodeModel::Default)
- setCodeModel(CodeModel::Small);
+ else if (!Subtarget.isTargetDarwin())
+ setRelocationModel(Reloc::Static);
}
- if (Subtarget.isTargetCygMing())
- Subtarget.setPICStyle(PICStyles::WinPIC);
- else if (Subtarget.isTargetDarwin()) {
+ // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+ // the Mach-O file format doesn't support it.
+ if (getRelocationModel() == Reloc::Static &&
+ Subtarget.isTargetDarwin() &&
+ is64Bit)
+ setRelocationModel(Reloc::PIC_);
+
+ // Determine the PICStyle based on the target selected.
+ if (getRelocationModel() == Reloc::Static) {
+ // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.isTargetCygMing()) {
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.isTargetDarwin()) {
if (Subtarget.is64Bit())
Subtarget.setPICStyle(PICStyles::RIPRel);
- else
- Subtarget.setPICStyle(PICStyles::Stub);
+ else if (getRelocationModel() == Reloc::PIC_)
+ Subtarget.setPICStyle(PICStyles::StubPIC);
+ else {
+ assert(getRelocationModel() == Reloc::DynamicNoPIC);
+ Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+ }
} else if (Subtarget.isTargetELF()) {
if (Subtarget.is64Bit())
Subtarget.setPICStyle(PICStyles::RIPRel);
else
Subtarget.setPICStyle(PICStyles::GOT);
}
+
+ // Finally, if we have "none" as our PIC style, force to static mode.
+ if (Subtarget.getPICStyle() == PICStyles::None)
+ setRelocationModel(Reloc::Static);
}
//===----------------------------------------------------------------------===//
@@ -212,33 +174,16 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
return true; // -print-machineinstr should print after this.
}
-bool X86TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // FIXME: Move this somewhere else!
- // On Darwin, override 64-bit static relocation to pic_ since the
- // assembler doesn't support it.
- if (DefRelocModel == Reloc::Static &&
- Subtarget.isTargetDarwin() && Subtarget.is64Bit() &&
- getCodeModel() == CodeModel::Small)
- setRelocationModel(Reloc::PIC_);
-
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(Out, *this, Verbose));
- return false;
-}
-
bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
// FIXME: Move this to TargetJITInfo!
// On Darwin, do not override 64-bit setting made in X86TargetMachine().
if (DefRelocModel == Reloc::Default &&
- (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
setRelocationModel(Reloc::Static);
+ Subtarget.setPICStyle(PICStyles::None);
+ }
// 64-bit JIT places everything in the same buffer except external functions.
// On Darwin, use small code model but hack the call instruction for
@@ -251,24 +196,20 @@ bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
}
PM.add(createX86CodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
return false;
}
bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
// FIXME: Move this to TargetJITInfo!
// On Darwin, do not override 64-bit setting made in X86TargetMachine().
if (DefRelocModel == Reloc::Default &&
- (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
setRelocationModel(Reloc::Static);
+ Subtarget.setPICStyle(PICStyles::None);
+ }
// 64-bit JIT places everything in the same buffer except external functions.
// On Darwin, use small code model but hack the call instruction for
@@ -281,40 +222,34 @@ bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
}
PM.add(createX86JITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
return false;
}
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ PM.add(createX86ObjectCodeEmitterPass(*this, OCE));
+ return false;
+}
+
bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
MachineCodeEmitter &MCE) {
PM.add(createX86CodeEmitterPass(*this, MCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm,
JITCodeEmitter &JCE) {
PM.add(createX86JITCodeEmitterPass(*this, JCE));
- if (DumpAsm) {
- assert(AsmPrinterCtor && "AsmPrinter was not linked in");
- if (AsmPrinterCtor)
- PM.add(AsmPrinterCtor(errs(), *this, true));
- }
-
return false;
}
+bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE) {
+ PM.add(createX86ObjectCodeEmitterPass(*this, OCE));
+ return false;
+}
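
The rewritten constructor applies its defaulting rules in a fixed order: pick a relocation model if none was requested, rewrite DynamicNoPIC where the format lacks it, then derive the PIC style. A condensed standalone restatement of the relocation-model part (hypothetical enum names; behavior read off the code above):

    enum RelocModel { Default, Static, PIC, DynamicNoPIC };

    static RelocModel pickRelocModel(RelocModel RM, bool IsDarwin, bool Is64Bit) {
      if (RM == Default)                        // Nothing requested explicitly.
        RM = !IsDarwin ? Static : (Is64Bit ? PIC : DynamicNoPIC);
      if (RM == DynamicNoPIC) {                 // ELF and x86-64 lack this model.
        if (Is64Bit)        RM = PIC;
        else if (!IsDarwin) RM = Static;        // Darwin/x86-32 keeps it.
      }
      if (RM == Static && IsDarwin && Is64Bit)  // Mach-O x86-64 can't be static.
        RM = PIC;
      return RM;
    }

The new frame offset of -40 for Win64 presumably accounts for the 8-byte return address plus the 32-byte register-parameter home area the Win64 ABI reserves above it.
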
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 90a5cc2..b538408 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -26,7 +26,7 @@
namespace llvm {
-class raw_ostream;
+class formatted_raw_ostream;
class X86TargetMachine : public LLVMTargetMachine {
X86Subtarget Subtarget;
@@ -38,18 +38,9 @@ class X86TargetMachine : public LLVMTargetMachine {
X86ELFWriterInfo ELFWriterInfo;
Reloc::Model DefRelocModel; // Reloc model before it's overridden.
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
- // To avoid having target depend on the asmprinter stuff libraries, asmprinter
- // set this functions to ctor pointer at startup time if they are linked in.
- typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
- X86TargetMachine &tm,
- bool verbose);
- static AsmPrinterCtorFn AsmPrinterCtor;
-
public:
- X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+ X86TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit);
virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
@@ -66,50 +57,41 @@ public:
return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
}
- static unsigned getModuleMatchQuality(const Module &M);
- static unsigned getJITMatchQuality();
-
- static void registerAsmPrinter(AsmPrinterCtorFn F) {
- AsmPrinterCtor = F;
- }
-
// Set up the pass pipeline.
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ MachineCodeEmitter &MCE);
virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE);
+ JITCodeEmitter &JCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ ObjectCodeEmitter &OCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ MachineCodeEmitter &MCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, MachineCodeEmitter &MCE);
+ JITCodeEmitter &JCE);
virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
- bool DumpAsm, JITCodeEmitter &JCE);
+ ObjectCodeEmitter &OCE);
};
/// X86_32TargetMachine - X86 32-bit target machine.
///
class X86_32TargetMachine : public X86TargetMachine {
public:
- X86_32TargetMachine(const Module &M, const std::string &FS);
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
+  X86_32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
};
/// X86_64TargetMachine - X86 64-bit target machine.
///
class X86_64TargetMachine : public X86TargetMachine {
public:
- X86_64TargetMachine(const Module &M, const std::string &FS);
-
- static unsigned getJITMatchQuality();
- static unsigned getModuleMatchQuality(const Module &M);
+ X86_64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
};
} // End llvm namespace
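
With getModuleMatchQuality and getJITMatchQuality gone, target selection is driven by the triple through the central TargetRegistry, and each backend self-registers from its LLVMInitialize* hook. A toy standalone model of that registration pattern (all names hypothetical):

    #include <functional>
    #include <map>
    #include <string>

    struct ToyTargetMachine { virtual ~ToyTargetMachine() {} };

    typedef std::function<ToyTargetMachine *()> ToyCtor;

    static std::map<std::string, ToyCtor> &toyRegistry() {
      static std::map<std::string, ToyCtor> R;  // Keyed by target name.
      return R;
    }

    struct RegisterToyTarget {
      // Mirrors what RegisterTargetMachine<...> does at initialization time.
      RegisterToyTarget(const std::string &Name, ToyCtor Ctor) {
        toyRegistry()[Name] = Ctor;
      }
    };
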
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..d39b3c4
--- /dev/null
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,65 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+using namespace llvm;
+
+const MCExpr *X8632_MachoTargetObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+ IsIndirect = true;
+ IsPCRel = false;
+
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ SmallString<128> Name;
+ Mang->getNameWithPrefix(Name, GV, true);
+ Name += "$non_lazy_ptr";
+
+ // Add information about the stub reference to MachOMMI so that the stub gets
+ // emitted by the asmprinter.
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name.str());
+ const MCSymbol *&StubSym = MachOMMI.getGVStubEntry(Sym);
+ if (StubSym == 0) {
+ Name.clear();
+ Mang->getNameWithPrefix(Name, GV, false);
+ StubSym = getContext().GetOrCreateSymbol(Name.str());
+ }
+
+ return MCSymbolRefExpr::Create(Sym, getContext());
+}
+
+const MCExpr *X8664_MachoTargetObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const {
+
+ // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+ // is an indirect pc-relative reference.
+ IsIndirect = true;
+ IsPCRel = true;
+
+ SmallString<128> Name;
+ Mang->getNameWithPrefix(Name, GV, false);
+ Name += "@GOTPCREL";
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Name.str(), getContext());
+ const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+ return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+}
+
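Both methods answer the same DWARF question: how to refer to a global without a direct pointer. On x86-32 the reference is indirected through a "$non_lazy_ptr" stub that the asm printer later materializes; on x86-64 it becomes an indirect pc-relative "@GOTPCREL" reference instead. A sketch of the 32-bit naming step alone (the MachineModuleInfo bookkeeping above is the real mechanism):

    #include <string>

    static std::string nonLazyPtrName(const std::string &Mangled) {
      return Mangled + "$non_lazy_ptr";  // e.g. "_foo" -> "_foo$non_lazy_ptr"
    }
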
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..377a93b
--- /dev/null
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+ /// X8632_MachoTargetObjectFile - This TLOF implementation is used for
+ /// Darwin/x86-32.
+ class X8632_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+
+ virtual const MCExpr *
+ getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const;
+ };
+
+ /// X8664_MachoTargetObjectFile - This TLOF implementation is used for
+ /// Darwin/x86-64.
+ class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+
+ virtual const MCExpr *
+ getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI,
+ bool &IsIndirect, bool &IsPCRel) const;
+ };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/AsmPrinter/CMakeLists.txt b/lib/Target/XCore/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..7c7c2f4
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMXCoreAsmPrinter
+ XCoreAsmPrinter.cpp
+ )
+add_dependencies(LLVMXCoreAsmPrinter XCoreCodeGenTable_gen)
diff --git a/lib/Target/XCore/AsmPrinter/Makefile b/lib/Target/XCore/AsmPrinter/Makefile
new file mode 100644
index 0000000..82dc1df
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/XCore/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMXCoreAsmPrinter
+
+# Hack: we need to include 'main' XCore target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp
new file mode 100644
index 0000000..e58edda
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp
@@ -0,0 +1,374 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulated thread-local storage)"),
+ cl::Hidden,
+ cl::value_desc("number"),
+ cl::init(8));
+
+namespace {
+ class VISIBILITY_HIDDEN XCoreAsmPrinter : public AsmPrinter {
+ const XCoreSubtarget &Subtarget;
+ public:
+ explicit XCoreAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+ const MCAsmInfo *T, bool V)
+ : AsmPrinter(O, TM, T, V),
+ Subtarget(TM.getSubtarget<XCoreSubtarget>()) {}
+
+ virtual const char *getPassName() const {
+ return "XCore Assembly Printer";
+ }
+
+ void printMemOperand(const MachineInstr *MI, int opNum);
+ void printOperand(const MachineInstr *MI, int opNum);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void emitGlobalDirective(const std::string &name);
+ void emitExternDirective(const std::string &name);
+
+ void emitArrayBound(const std::string &name, const GlobalVariable *GV);
+ virtual void PrintGlobalVariable(const GlobalVariable *GV);
+
+ void emitFunctionStart(MachineFunction &MF);
+ void emitFunctionEnd(MachineFunction &MF);
+
+ void printInstruction(const MachineInstr *MI); // autogenerated.
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<DwarfWriter>();
+ }
+ };
+} // end of anonymous namespace
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreAsmPrinter::
+emitGlobalDirective(const std::string &name)
+{
+ O << MAI->getGlobalDirective() << name;
+ O << "\n";
+}
+
+void XCoreAsmPrinter::
+emitExternDirective(const std::string &name)
+{
+ O << "\t.extern\t" << name;
+ O << '\n';
+}
+
+void XCoreAsmPrinter::
+emitArrayBound(const std::string &name, const GlobalVariable *GV)
+{
+ assert(((GV->hasExternalLinkage() ||
+ GV->hasWeakLinkage()) ||
+ GV->hasLinkOnceLinkage()) && "Unexpected linkage");
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(
+ cast<PointerType>(GV->getType())->getElementType()))
+ {
+ O << MAI->getGlobalDirective() << name << ".globound" << "\n";
+ O << MAI->getSetDirective() << name << ".globound" << ","
+ << ATy->getNumElements() << "\n";
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ O << MAI->getWeakDefDirective() << name << ".globound" << "\n";
+ }
+ }
+}
+
+void XCoreAsmPrinter::PrintGlobalVariable(const GlobalVariable *GV) {
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (!GV->hasInitializer() ||
+ EmitSpecialLLVMGlobal(GV))
+ return;
+
+ const TargetData *TD = TM.getTargetData();
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
+
+ std::string name = Mang->getMangledName(GV);
+ Constant *C = GV->getInitializer();
+ unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
+
+ // Mark the start of the global
+ O << "\t.cc_top " << name << ".data," << name << "\n";
+
+ switch (GV->getLinkage()) {
+ case GlobalValue::AppendingLinkage:
+ llvm_report_error("AppendingLinkage is not supported by this target!");
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::ExternalLinkage:
+ emitArrayBound(name, GV);
+ emitGlobalDirective(name);
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+ O << MAI->getWeakDefDirective() << name << "\n";
+ }
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::LinkerPrivateLinkage:
+ break;
+ case GlobalValue::GhostLinkage:
+ llvm_unreachable("Should not have any unmaterialized functions!");
+ case GlobalValue::DLLImportLinkage:
+ llvm_unreachable("DLLImport linkage is not supported by this target!");
+ case GlobalValue::DLLExportLinkage:
+ llvm_unreachable("DLLExport linkage is not supported by this target!");
+ default:
+ llvm_unreachable("Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GV, 2);
+
+ unsigned Size = TD->getTypeAllocSize(C->getType());
+ if (GV->isThreadLocal()) {
+ Size *= MaxThreads;
+ }
+ if (MAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << name << ",@object\n";
+ O << "\t.size " << name << "," << Size << "\n";
+ }
+ O << name << ":\n";
+
+ EmitGlobalConstant(C);
+ if (GV->isThreadLocal()) {
+ for (unsigned i = 1; i < MaxThreads; ++i) {
+ EmitGlobalConstant(C);
+ }
+ }
+ if (Size < 4) {
+ // The ABI requires that unsigned scalar types smaller than 32 bits
+    // are padded to 32 bits.
+ EmitZeros(4 - Size);
+ }
+
+ // Mark the end of the global
+ O << "\t.cc_bottom " << name << ".data\n";
+}
+
+/// Emit the directives at the start of functions
+void XCoreAsmPrinter::emitFunctionStart(MachineFunction &MF) {
+ // Print out the label for the function.
+ const Function *F = MF.getFunction();
+
+ OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+
+ // Mark the start of the function
+ O << "\t.cc_top " << CurrentFnName << ".function," << CurrentFnName << "\n";
+
+ switch (F->getLinkage()) {
+ default: llvm_unreachable("Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ case Function::LinkerPrivateLinkage:
+ break;
+ case Function::ExternalLinkage:
+ emitGlobalDirective(CurrentFnName);
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ // TODO Use COMDAT groups for LinkOnceLinkage
+ O << MAI->getGlobalDirective() << CurrentFnName << "\n";
+ O << MAI->getWeakDefDirective() << CurrentFnName << "\n";
+ break;
+ }
+  // Align to (1 << 1) = 2 bytes.
+ EmitAlignment(MF.getAlignment(), F, 1);
+ if (MAI->hasDotTypeDotSizeDirective()) {
+ O << "\t.type " << CurrentFnName << ",@function\n";
+ }
+ O << CurrentFnName << ":\n";
+}
+
+/// Emit the directives at the end of functions
+void XCoreAsmPrinter::
+emitFunctionEnd(MachineFunction &MF)
+{
+ // Mark the end of the function
+ O << "\t.cc_bottom " << CurrentFnName << ".function\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool XCoreAsmPrinter::runOnMachineFunction(MachineFunction &MF)
+{
+ this->MF = &MF;
+
+ SetupMachineFunction(MF);
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out jump tables referenced by the function
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // Emit the function start directives
+ emitFunctionStart(MF);
+
+ // Emit pre-function debug information.
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+
+ // Print a label for the basic block.
+ if (I != MF.begin()) {
+ EmitBasicBlockStart(I);
+ }
+
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+
+ // Each Basic Block is separated by a newline
+ O << '\n';
+ }
+
+ // Emit function end directives
+ emitFunctionEnd(MF);
+
+ // Emit post-function debug information.
+ DW->EndFunction(&MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+void XCoreAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum)
+{
+ printOperand(MI, opNum);
+
+ if (MI->getOperand(opNum+1).isImm()
+ && MI->getOperand(opNum+1).getImm() == 0)
+ return;
+
+ O << "+";
+ printOperand(MI, opNum+1);
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << getRegisterName(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ GetMBBSymbol(MO.getMBB()->getNumber())->print(O, MAI);
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ O << Mang->getMangledName(MO.getGlobal());
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ O << MO.getSymbolName();
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << MO.getIndex();
+ break;
+ default:
+ llvm_unreachable("not implemented");
+ }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ printOperand(MI, OpNo);
+ return false;
+}
+
+void XCoreAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ processDebugLoc(MI, true);
+
+ // Check for mov mnemonic
+ unsigned src, dst, srcSR, dstSR;
+ if (TM.getInstrInfo()->isMoveInstr(*MI, src, dst, srcSR, dstSR)) {
+ O << "\tmov " << getRegisterName(dst) << ", ";
+ O << getRegisterName(src) << '\n';
+ return;
+ }
+ printInstruction(MI);
+ if (VerboseAsm && !MI->getDebugLoc().isUnknown())
+ EmitComments(*MI);
+ O << '\n';
+
+ processDebugLoc(MI, false);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreAsmPrinter() {
+ RegisterAsmPrinter<XCoreAsmPrinter> X(TheXCoreTarget);
+}
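
One notable trick in this printer: PrintGlobalVariable emulates thread-local storage by emitting xcore-max-threads consecutive copies of the initializer, one per hardware thread. A standalone sketch of the addressing that layout implies (assumes each thread can recover its own id):

    #include <cstddef>

    // Copy i of an emulated-TLS object lives at base + i * objectSize.
    static void *tlsSlot(void *base, std::size_t objectSize, unsigned threadId) {
      return static_cast<char *>(base) + threadId * objectSize;
    }
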
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index a7aba14..0965323 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -11,13 +11,14 @@ tablegen(XCoreGenCallingConv.inc -gen-callingconv)
tablegen(XCoreGenSubtarget.inc -gen-subtarget)
add_llvm_target(XCore
- XCoreAsmPrinter.cpp
+ MCSectionXCore.cpp
XCoreFrameInfo.cpp
XCoreInstrInfo.cpp
XCoreISelDAGToDAG.cpp
XCoreISelLowering.cpp
+ XCoreMCAsmInfo.cpp
XCoreRegisterInfo.cpp
XCoreSubtarget.cpp
- XCoreTargetAsmInfo.cpp
XCoreTargetMachine.cpp
+ XCoreTargetObjectFile.cpp
)
diff --git a/lib/Target/XCore/MCSectionXCore.cpp b/lib/Target/XCore/MCSectionXCore.cpp
new file mode 100644
index 0000000..5acceaf
--- /dev/null
+++ b/lib/Target/XCore/MCSectionXCore.cpp
@@ -0,0 +1,35 @@
+//===- MCSectionXCore.cpp - XCore-specific section representation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MCSectionXCore class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCSectionXCore.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSectionXCore *
+MCSectionXCore::Create(const StringRef &Section, unsigned Type,
+ unsigned Flags, SectionKind K,
+ bool isExplicit, MCContext &Ctx) {
+ return new (Ctx) MCSectionXCore(Section, Type, Flags, K, isExplicit);
+}
+
+
+/// PrintTargetSpecificSectionFlags - This handles the XCore-specific cp/dp
+/// section flags.
+void MCSectionXCore::PrintTargetSpecificSectionFlags(const MCAsmInfo &MAI,
+ raw_ostream &OS) const {
+ if (getFlags() & MCSectionXCore::SHF_CP_SECTION)
+ OS << 'c';
+ if (getFlags() & MCSectionXCore::SHF_DP_SECTION)
+ OS << 'd';
+}
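
PrintTargetSpecificSectionFlags appends the XCore-specific single-letter flags to the usual ELF flag string. A standalone illustration of the same bit-test-and-print shape (flag values hypothetical; the real constants are declared in MCSectionXCore.h below):

    #include <iostream>

    enum { CPSection = 1u << 0, DPSection = 1u << 1 };

    static void printXCoreFlags(unsigned Flags, std::ostream &OS) {
      if (Flags & CPSection) OS << 'c';  // Linker groups these into the constant pool.
      if (Flags & DPSection) OS << 'd';  // Linker groups these into the data region.
    }
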
diff --git a/lib/Target/XCore/MCSectionXCore.h b/lib/Target/XCore/MCSectionXCore.h
new file mode 100644
index 0000000..02f8f95
--- /dev/null
+++ b/lib/Target/XCore/MCSectionXCore.h
@@ -0,0 +1,54 @@
+//===- MCSectionXCore.h - XCore-specific section representation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MCSectionXCore class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCSECTION_XCORE_H
+#define LLVM_MCSECTION_XCORE_H
+
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+class MCSectionXCore : public MCSectionELF {
+ MCSectionXCore(const StringRef &Section, unsigned Type, unsigned Flags,
+ SectionKind K, bool isExplicit)
+ : MCSectionELF(Section, Type, Flags, K, isExplicit) {}
+
+public:
+
+ enum {
+ /// SHF_CP_SECTION - All sections with the "c" flag are grouped together
+ /// by the linker to form the constant pool and the cp register is set to
+ /// the start of the constant pool by the boot code.
+ SHF_CP_SECTION = FIRST_TARGET_DEP_FLAG,
+
+ /// SHF_DP_SECTION - All sections with the "d" flag are grouped together
+ /// by the linker to form the data section and the dp register is set to
+ /// the start of the section by the boot code.
+ SHF_DP_SECTION = FIRST_TARGET_DEP_FLAG << 1
+ };
+
+ static MCSectionXCore *Create(const StringRef &Section, unsigned Type,
+ unsigned Flags, SectionKind K,
+ bool isExplicit, MCContext &Ctx);
+
+
+ /// PrintTargetSpecificSectionFlags - This handles the XCore-specific cp/dp
+ /// section flags.
+ virtual void PrintTargetSpecificSectionFlags(const MCAsmInfo &MAI,
+ raw_ostream &OS) const;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/Makefile b/lib/Target/XCore/Makefile
index 568df70..bd3b52a 100644
--- a/lib/Target/XCore/Makefile
+++ b/lib/Target/XCore/Makefile
@@ -7,7 +7,7 @@
#
##===----------------------------------------------------------------------===##
LEVEL = ../../..
-LIBRARYNAME = LLVMXCore
+LIBRARYNAME = LLVMXCoreCodeGen
TARGET = XCore
# Make sure that tblgen is run, first thing.
@@ -17,5 +17,7 @@ BUILT_SOURCES = XCoreGenRegisterInfo.h.inc XCoreGenRegisterNames.inc \
XCoreGenDAGISel.inc XCoreGenCallingConv.inc \
XCoreGenSubtarget.inc
+DIRS = AsmPrinter TargetInfo
+
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..0a568de
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMXCoreInfo
+ XCoreTargetInfo.cpp
+ )
+
+add_dependencies(LLVMXCoreInfo XCoreTable_gen)
diff --git a/lib/Target/XCore/TargetInfo/Makefile b/lib/Target/XCore/TargetInfo/Makefile
new file mode 100644
index 0000000..07473d2
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/XCore/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMXCoreInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 0000000..7aa8965
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheXCoreTarget;
+
+extern "C" void LLVMInitializeXCoreTargetInfo() {
+ RegisterTarget<Triple::xcore> X(TheXCoreTarget, "xcore", "XCore");
+}
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index d95aab3..8937fbe 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -21,12 +21,12 @@ namespace llvm {
class FunctionPass;
class TargetMachine;
class XCoreTargetMachine;
- class raw_ostream;
+ class formatted_raw_ostream;
FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM);
- FunctionPass *createXCoreCodePrinterPass(raw_ostream &OS,
- XCoreTargetMachine &TM,
- bool Verbose);
+
+ extern Target TheXCoreTarget;
+
} // end namespace llvm;
// Defines symbolic names for XCore registers. This defines a mapping from
diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td
index 7a2dcdb..b07445d 100644
--- a/lib/Target/XCore/XCore.td
+++ b/lib/Target/XCore/XCore.td
@@ -30,27 +30,14 @@ def XCoreInstrInfo : InstrInfo {
}
//===----------------------------------------------------------------------===//
-// XCore Subtarget features.
-//===----------------------------------------------------------------------===//
-
-def FeatureXS1A
- : SubtargetFeature<"xs1a", "IsXS1A", "true",
- "Enable XS1A instructions">;
-
-def FeatureXS1B
- : SubtargetFeature<"xs1b", "IsXS1B", "true",
- "Enable XS1B instructions">;
-
-//===----------------------------------------------------------------------===//
// XCore processors supported.
//===----------------------------------------------------------------------===//
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
-def : Proc<"generic", [FeatureXS1A]>;
-def : Proc<"xs1a-generic", [FeatureXS1A]>;
-def : Proc<"xs1b-generic", [FeatureXS1B]>;
+def : Proc<"generic", []>;
+def : Proc<"xs1b-generic", []>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index eed34a4..860b72f 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
#include "llvm/Intrinsics.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,6 +29,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include <queue>
#include <set>
using namespace llvm;
@@ -159,69 +162,62 @@ InstructionSelect() {
SDNode *XCoreDAGToDAGISel::Select(SDValue Op) {
SDNode *N = Op.getNode();
DebugLoc dl = N->getDebugLoc();
- MVT NVT = N->getValueType(0);
+ EVT NVT = N->getValueType(0);
if (NVT == MVT::i32) {
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
if (Predicate_immMskBitp(N)) {
SDValue MskSize = Transform_msksize_xform(N);
- return CurDAG->getTargetNode(XCore::MKMSK_rus, dl, MVT::i32, MskSize);
+ return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+ MVT::i32, MskSize);
}
else if (! Predicate_immU16(N)) {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
SDValue CPIdx =
- CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+ CurDAG->getTargetConstantPool(ConstantInt::get(
+ Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI.getPointerTy());
- return CurDAG->getTargetNode(XCore::LDWCP_lru6, dl, MVT::i32,
- MVT::Other, CPIdx,
- CurDAG->getEntryNode());
+ return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+ MVT::Other, CPIdx,
+ CurDAG->getEntryNode());
}
break;
}
case ISD::SMUL_LOHI: {
// FIXME fold addition into the macc instruction
- if (!Subtarget.isXS1A()) {
- SDValue Zero(CurDAG->getTargetNode(XCore::LDC_ru6, dl, MVT::i32,
- CurDAG->getTargetConstant(0, MVT::i32)), 0);
- SDValue Ops[] = { Zero, Zero, Op.getOperand(0), Op.getOperand(1) };
- SDNode *ResNode = CurDAG->getTargetNode(XCore::MACCS_l4r, dl,
- MVT::i32, MVT::i32, Ops, 4);
- ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
- ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
- return NULL;
- }
- break;
+ SDValue Zero(CurDAG->getMachineNode(XCore::LDC_ru6, dl, MVT::i32,
+ CurDAG->getTargetConstant(0, MVT::i32)), 0);
+ SDValue Ops[] = { Zero, Zero, Op.getOperand(0), Op.getOperand(1) };
+ SDNode *ResNode = CurDAG->getMachineNode(XCore::MACCS_l4r, dl,
+ MVT::i32, MVT::i32, Ops, 4);
+ ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
+ return NULL;
}
case ISD::UMUL_LOHI: {
// FIXME fold addition into the macc / lmul instruction
- SDValue Zero(CurDAG->getTargetNode(XCore::LDC_ru6, dl, MVT::i32,
+ SDValue Zero(CurDAG->getMachineNode(XCore::LDC_ru6, dl, MVT::i32,
CurDAG->getTargetConstant(0, MVT::i32)), 0);
SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
Zero, Zero };
- SDNode *ResNode = CurDAG->getTargetNode(XCore::LMUL_l6r, dl, MVT::i32,
- MVT::i32, Ops, 4);
+ SDNode *ResNode = CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
+ MVT::i32, Ops, 4);
ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
return NULL;
}
case XCoreISD::LADD: {
- if (!Subtarget.isXS1A()) {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(2) };
- return CurDAG->getTargetNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
- Ops, 3);
- }
- break;
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2) };
+ return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
+ Ops, 3);
}
case XCoreISD::LSUB: {
- if (!Subtarget.isXS1A()) {
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(2) };
- return CurDAG->getTargetNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
- Ops, 3);
- }
- break;
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2) };
+ return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
+ Ops, 3);
}
// Other cases are autogenerated.
}
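
A note on the LOHI selections above: ISD::SMUL_LOHI and ISD::UMUL_LOHI define result 0 as the low half and result 1 as the high half, while the MACCS_l4r/LMUL_l6r machine nodes produce them in the opposite order, which is why the ReplaceUses calls swap result numbers. Reference semantics in plain C++ (sketch):

    #include <cstdint>

    static void smulLoHi(int32_t a, int32_t b, uint32_t &lo, int32_t &hi) {
      int64_t p = static_cast<int64_t>(a) * b;  // Full 64-bit signed product.
      lo = static_cast<uint32_t>(p);            // Result 0: low 32 bits.
      hi = static_cast<int32_t>(p >> 32);       // Result 1: high 32 bits.
    }
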
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index cc11d32..5ef56c9 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -16,6 +16,7 @@
#include "XCoreISelLowering.h"
#include "XCoreMachineFunctionInfo.h"
#include "XCore.h"
+#include "XCoreTargetObjectFile.h"
#include "XCoreTargetMachine.h"
#include "XCoreSubtarget.h"
#include "llvm/DerivedTypes.h"
@@ -32,6 +33,8 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/VectorExtras.h"
#include <queue>
#include <set>
@@ -48,12 +51,14 @@ getTargetNodeName(unsigned Opcode) const
case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
case XCoreISD::STWSP : return "XCoreISD::STWSP";
case XCoreISD::RETSP : return "XCoreISD::RETSP";
+ case XCoreISD::LADD : return "XCoreISD::LADD";
+ case XCoreISD::LSUB : return "XCoreISD::LSUB";
default : return NULL;
}
}
XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
- : TargetLowering(XTM),
+ : TargetLowering(XTM, new XCoreTargetObjectFile()),
TM(XTM),
Subtarget(*XTM.getSubtargetImpl()) {
@@ -67,8 +72,6 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setIntDivIsCheap(false);
setShiftAmountType(MVT::i32);
- // shl X, 32 == 0
- setShiftAmountFlavor(Extend);
setStackPointerRegisterToSaveRestore(XCore::SP);
setSchedulingPreference(SchedulingForRegPressure);
@@ -88,13 +91,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
// 64bit
- if (!Subtarget.isXS1A()) {
- setOperationAction(ISD::ADD, MVT::i64, Custom);
- setOperationAction(ISD::SUB, MVT::i64, Custom);
- }
- if (Subtarget.isXS1A()) {
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
- }
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+ setOperationAction(ISD::SUB, MVT::i64, Custom);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
@@ -112,9 +110,6 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
- // RET must be custom lowered, to meet ABI requirements
- setOperationAction(ISD::RET, MVT::Other, Custom);
-
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
// Thread Local Storage
@@ -130,7 +125,11 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
-
+
+ // Custom expand misaligned loads / stores.
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+
// Varargs
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
@@ -145,19 +144,24 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
// Debug
setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+
+ maxStoresPerMemset = 4;
+ maxStoresPerMemmove = maxStoresPerMemcpy = 2;
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::STORE);
}
SDValue XCoreTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode())
{
- case ISD::CALL: return LowerCALL(Op, DAG);
- case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
- case ISD::RET: return LowerRET(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
@@ -166,7 +170,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
default:
- assert(0 && "unimplemented operand");
+ llvm_unreachable("unimplemented operand");
return SDValue();
}
}
@@ -178,7 +182,7 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
SelectionDAG &DAG) {
switch (N->getOpcode()) {
default:
- assert(0 && "Don't know how to custom expand this!");
+ llvm_unreachable("Don't know how to custom expand this!");
return;
case ISD::ADD:
case ISD::SUB:
@@ -214,17 +218,16 @@ getGlobalAddressWrapper(SDValue GA, GlobalValue *GV, SelectionDAG &DAG)
DebugLoc dl = GA.getDebugLoc();
if (isa<Function>(GV)) {
return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
- } else if (!Subtarget.isXS1A()) {
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (!GVar) {
- // If GV is an alias then use the aliasee to determine constness
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
- }
- bool isConst = GVar && GVar->isConstant();
- if (isConst) {
- return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
- }
+ }
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee to determine constness
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+ }
+ bool isConst = GVar && GVar->isConstant();
+ if (isConst) {
+ return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
}
return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
}
@@ -265,14 +268,16 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
}
if (! GVar) {
- assert(0 && "Thread local object not a GlobalVariable?");
+ llvm_unreachable("Thread local object not a GlobalVariable?");
return SDValue();
}
const Type *Ty = cast<PointerType>(GV->getType())->getElementType();
if (!Ty->isSized() || isZeroLengthArray(Ty)) {
- cerr << "Size of thread local object " << GVar->getName()
- << " is unknown\n";
- abort();
+#ifndef NDEBUG
+ errs() << "Size of thread local object " << GVar->getName()
+ << " is unknown\n";
+#endif
+ llvm_unreachable(0);
}
SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
const TargetData *TD = TM.getTargetData();
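
The rest of this function computes the per-thread address: XCore expands each thread-local global into an array with one element per hardware thread and indexes it by the current thread id. A minimal sketch of the resulting address arithmetic (get_thread_id is a hypothetical stand-in for the thread-id read):

    #include <cstddef>
    unsigned get_thread_id();  // hypothetical: reads the hardware thread id
    char *tls_address(char *base, std::size_t objsize) {
      // base is the wrapped global address, objsize the object's size
      return base + get_thread_id() * objsize;
    }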
@@ -288,21 +293,16 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG)
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// FIXME there isn't really debug info here
DebugLoc dl = CP->getDebugLoc();
- if (Subtarget.isXS1A()) {
- assert(0 && "Lowering of constant pool unimplemented");
- return SDValue();
+ EVT PtrVT = Op.getValueType();
+ SDValue Res;
+ if (CP->isMachineConstantPoolEntry()) {
+ Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+ CP->getAlignment());
} else {
- MVT PtrVT = Op.getValueType();
- SDValue Res;
- if (CP->isMachineConstantPoolEntry()) {
- Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
- } else {
- Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment());
- }
- return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment());
}
+ return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
}
SDValue XCoreTargetLowering::
@@ -310,19 +310,211 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG)
{
// FIXME there isn't really debug info here
DebugLoc dl = Op.getDebugLoc();
- MVT PtrVT = Op.getValueType();
+ EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, JTI);
}
+static bool
+IsWordAlignedBasePlusConstantOffset(SDValue Addr, SDValue &AlignedBase,
+ int64_t &Offset)
+{
+ if (Addr.getOpcode() != ISD::ADD) {
+ return false;
+ }
+ ConstantSDNode *CN = 0;
+ if (!(CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ return false;
+ }
+ int64_t off = CN->getSExtValue();
+ const SDValue &Base = Addr.getOperand(0);
+ const SDValue *Root = &Base;
+ if (Base.getOpcode() == ISD::ADD &&
+ Base.getOperand(1).getOpcode() == ISD::SHL) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Base.getOperand(1)
+ .getOperand(1));
+ if (CN && (CN->getSExtValue() >= 2)) {
+ Root = &Base.getOperand(0);
+ }
+ }
+ if (isa<FrameIndexSDNode>(*Root)) {
+    // All frame indices are word aligned
+ AlignedBase = Base;
+ Offset = off;
+ return true;
+ }
+ if (Root->getOpcode() == XCoreISD::DPRelativeWrapper ||
+ Root->getOpcode() == XCoreISD::CPRelativeWrapper) {
+ // All dp / cp relative addresses are word aligned
+ AlignedBase = Base;
+ Offset = off;
+ return true;
+ }
+ return false;
+}
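
The helper accepts two address shapes, both word aligned at the base: a frame slot plus a constant, or a dp/cp-relative symbol, optionally plus a register shifted left by at least 2 (which preserves word alignment), plus a constant. Illustrative matches:

    // fp + 6                 -> AlignedBase = fp,               Offset = 6
    // dp[g] + (i << 2) + 6   -> AlignedBase = dp[g] + (i << 2), Offset = 6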
+
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG)
+{
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Unexpected extension type");
+ assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+ if (allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+ return SDValue();
+ }
+ unsigned ABIAlignment = getTargetData()->
+ getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ // Leave aligned load alone.
+ if (LD->getAlignment() >= ABIAlignment) {
+ return SDValue();
+ }
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Base;
+ int64_t Offset;
+ if (!LD->isVolatile() &&
+ IsWordAlignedBasePlusConstantOffset(BasePtr, Base, Offset)) {
+ if (Offset % 4 == 0) {
+ // We've managed to infer better alignment information than the load
+ // already has. Use an aligned load.
+ return DAG.getLoad(getPointerTy(), dl, Chain, BasePtr, NULL, 4);
+ }
+ // Lower to
+ // ldw low, base[offset >> 2]
+ // ldw high, base[(offset >> 2) + 1]
+ // shr low_shifted, low, (offset & 0x3) * 8
+ // shl high_shifted, high, 32 - (offset & 0x3) * 8
+ // or result, low_shifted, high_shifted
+ SDValue LowOffset = DAG.getConstant(Offset & ~0x3, MVT::i32);
+ SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32);
+ SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32);
+ SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32);
+
+ SDValue LowAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, LowOffset);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, HighOffset);
+
+ SDValue Low = DAG.getLoad(getPointerTy(), dl, Chain,
+ LowAddr, NULL, 4);
+ SDValue High = DAG.getLoad(getPointerTy(), dl, Chain,
+ HighAddr, NULL, 4);
+ SDValue LowShifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Low, LowShift);
+ SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High, HighShift);
+ SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, LowShifted, HighShifted);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+ High.getValue(1));
+ SDValue Ops[] = { Result, Chain };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+
+ if (LD->getAlignment() == 2) {
+ int SVOffset = LD->getSrcValueOffset();
+ SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+ BasePtr, LD->getSrcValue(), SVOffset, MVT::i16,
+ LD->isVolatile(), 2);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue High = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::i32, Chain,
+ HighAddr, LD->getSrcValue(), SVOffset + 2,
+ MVT::i16, LD->isVolatile(), 2);
+ SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High,
+ DAG.getConstant(16, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, Low, HighShifted);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+ High.getValue(1));
+ SDValue Ops[] = { Result, Chain };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+
+ // Lower to a call to __misaligned_load(BasePtr).
+ const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Ty = IntPtrTy;
+ Entry.Node = BasePtr;
+ Args.push_back(Entry);
+
+ std::pair<SDValue, SDValue> CallResult =
+ LowerCallTo(Chain, IntPtrTy, false, false,
+ false, false, 0, CallingConv::C, false,
+ /*isReturnValueUsed=*/true,
+ DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
+ Args, DAG, dl);
+
+ SDValue Ops[] =
+ { CallResult.first, CallResult.second };
+
+ return DAG.getMergeValues(Ops, 2, dl);
+}
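
For a word-aligned base with a non-multiple-of-4 offset, the two-ldw sequence above reassembles the value with shifts; with only 2-byte alignment it pairs two halfword loads instead, and otherwise it falls back to the __misaligned_load libcall. The shift path is equivalent to this little-endian C++ sketch (offset % 4 != 0 here, since the aligned case returned earlier; names are illustrative):

    #include <cstdint>
    uint32_t load_misaligned(const uint32_t *base, unsigned offset) {
      uint32_t low   = base[offset >> 2];        // ldw low, base[offset >> 2]
      uint32_t high  = base[(offset >> 2) + 1];  // ldw high, base[(offset >> 2) + 1]
      unsigned shift = (offset & 0x3) * 8;       // 8, 16 or 24
      return (low >> shift) | (high << (32 - shift));
    }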
+
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG)
+{
+ StoreSDNode *ST = cast<StoreSDNode>(Op);
+ assert(!ST->isTruncatingStore() && "Unexpected store type");
+ assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+ if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+ return SDValue();
+ }
+ unsigned ABIAlignment = getTargetData()->
+ getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ // Leave aligned store alone.
+ if (ST->getAlignment() >= ABIAlignment) {
+ return SDValue();
+ }
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (ST->getAlignment() == 2) {
+ int SVOffset = ST->getSrcValueOffset();
+ SDValue Low = Value;
+ SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+ DAG.getConstant(16, MVT::i32));
+ SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
+ ST->getSrcValue(), SVOffset, MVT::i16,
+ ST->isVolatile(), 2);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
+ ST->getSrcValue(), SVOffset + 2,
+ MVT::i16, ST->isVolatile(), 2);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+ }
+
+ // Lower to a call to __misaligned_store(BasePtr, Value).
+ const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ Entry.Ty = IntPtrTy;
+ Entry.Node = BasePtr;
+ Args.push_back(Entry);
+
+ Entry.Node = Value;
+ Args.push_back(Entry);
+
+ std::pair<SDValue, SDValue> CallResult =
+ LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false,
+ false, false, 0, CallingConv::C, false,
+ /*isReturnValueUsed=*/true,
+ DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
+ Args, DAG, dl);
+
+ return CallResult.second;
+}
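
Stores mirror the loads: with 2-byte alignment the word is split into two truncating halfword stores; anything less aligned becomes a call to __misaligned_store. The split path as a little-endian C++ sketch (names are illustrative):

    #include <cstdint>
    void store_align2(uint16_t *p, uint32_t value) {
      p[0] = static_cast<uint16_t>(value);        // low half at BasePtr
      p[1] = static_cast<uint16_t>(value >> 16);  // high half at BasePtr + 2
    }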
+
SDValue XCoreTargetLowering::
ExpandADDSUB(SDNode *N, SelectionDAG &DAG)
{
assert(N->getValueType(0) == MVT::i64 &&
(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Unknown operand to lower!");
- assert(!Subtarget.isXS1A() && "Cannot custom lower ADD/SUB on xs1a");
DebugLoc dl = N->getDebugLoc();
// Extract components
@@ -353,12 +545,12 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG)
SDValue XCoreTargetLowering::
LowerVAARG(SDValue Op, SelectionDAG &DAG)
{
- assert(0 && "unimplemented");
+ llvm_unreachable("unimplemented");
// FIX Arguments passed by reference need an extra dereference.
SDNode *Node = Op.getNode();
DebugLoc dl = Node->getDebugLoc();
const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
- MVT VT = Node->getValueType(0);
+ EVT VT = Node->getValueType(0);
SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
Node->getOperand(1), V, 0);
// Increment the pointer, VAList, to the next vararg
@@ -398,35 +590,33 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
-//
-// The lower operations present on calling convention works on this order:
-// LowerCALL (virt regs --> phys regs, virt regs --> stack)
-// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
-// LowerRET (virt regs --> phys regs)
-// LowerCALL (phys regs --> virt regs)
-//
//===----------------------------------------------------------------------===//
#include "XCoreGenCallingConv.inc"
//===----------------------------------------------------------------------===//
-// CALL Calling Convention Implementation
+// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
-/// XCore custom CALL implementation
-SDValue XCoreTargetLowering::
-LowerCALL(SDValue Op, SelectionDAG &DAG)
-{
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- unsigned CallingConv = TheCall->getCallingConv();
+/// XCore call implementation
+SDValue
+XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+
// For now, only CallingConv::C implemented
- switch (CallingConv)
+ switch (CallConv)
{
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::Fast:
case CallingConv::C:
- return LowerCCCCallTo(Op, DAG, CallingConv);
+ return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+ Outs, Ins, dl, DAG, InVals);
}
}
@@ -434,24 +624,25 @@ LowerCALL(SDValue Op, SelectionDAG &DAG)
/// regs to (physical regs)/(stack frame), CALLSEQ_START and
/// CALLSEQ_END are emitted.
/// TODO: isTailCall, sret.
-SDValue XCoreTargetLowering::
-LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC)
-{
- CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
- SDValue Chain = TheCall->getChain();
- SDValue Callee = TheCall->getCallee();
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
+SDValue
+XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
// The ABI dictates there should be one stack slot available to the callee
// on function entry (for saving lr).
CCInfo.AllocateStack(4, 4);
- CCInfo.AnalyzeCallOperands(TheCall, CC_XCore);
+ CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -465,13 +656,11 @@ LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC)
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
-
- // Arguments start after the 5 first operands of ISD::CALL
- SDValue Arg = TheCall->getArg(i);
+ SDValue Arg = Outs[i].Val;
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: assert(0 && "Unknown loc info!");
+ default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
@@ -554,59 +743,58 @@ LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC)
// Handle result values, copying them out of physregs into vregs that we
// return.
- return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
- Op.getResNo());
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
}
-/// LowerCallResult - Lower the result values of an ISD::CALL into the
-/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
-/// being lowered. Returns a SDNode with the same number of values as the
-/// ISD::CALL.
-SDNode *XCoreTargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
- unsigned CallingConv, SelectionDAG &DAG) {
- bool isVarArg = TheCall->isVarArg();
- DebugLoc dl = TheCall->getDebugLoc();
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- CCInfo.AnalyzeCallResult(TheCall, RetCC_XCore);
- SmallVector<SDValue, 8> ResultVals;
+ CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
RVLocs[i].getValVT(), InFlag).getValue(1);
InFlag = Chain.getValue(2);
- ResultVals.push_back(Chain.getValue(0));
+ InVals.push_back(Chain.getValue(0));
}
- ResultVals.push_back(Chain);
-
- // Merge everything together with a MERGE_VALUES node.
- return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
- &ResultVals[0], ResultVals.size()).getNode();
+ return Chain;
}
//===----------------------------------------------------------------------===//
-// FORMAL_ARGUMENTS Calling Convention Implementation
+// Formal Arguments Calling Convention Implementation
//===----------------------------------------------------------------------===//
-/// XCore custom FORMAL_ARGUMENTS implementation
-SDValue XCoreTargetLowering::
-LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
-{
- unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch(CC)
+/// XCore formal arguments implementation
+SDValue
+XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ switch (CallConv)
{
default:
- assert(0 && "Unsupported calling convention");
+ llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
- return LowerCCCArguments(Op, DAG);
+ return LowerCCCArguments(Chain, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
}
}
@@ -614,27 +802,28 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG)
/// virtual registers and generate load operations for
/// arguments placed on the stack.
/// TODO: sret
-SDValue XCoreTargetLowering::
-LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
-{
+SDValue
+XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- SDValue Root = Op.getOperand(0);
- bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
- unsigned CC = MF.getFunction()->getCallingConv();
- DebugLoc dl = Op.getDebugLoc();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeFormalArguments(Op.getNode(), CC_XCore);
+ CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
unsigned StackSlotSize = XCoreFrameInfo::stackSlotSize();
- SmallVector<SDValue, 16> ArgValues;
-
unsigned LRSaveSize = StackSlotSize;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -643,18 +832,21 @@ LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
if (VA.isRegLoc()) {
// Arguments passed in registers
- MVT RegVT = VA.getLocVT();
- switch (RegVT.getSimpleVT()) {
+ EVT RegVT = VA.getLocVT();
+ switch (RegVT.getSimpleVT().SimpleTy) {
default:
- cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
- << RegVT.getSimpleVT()
- << "\n";
- abort();
+ {
+#ifndef NDEBUG
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+ llvm_unreachable(0);
+ }
case MVT::i32:
unsigned VReg = RegInfo.createVirtualRegister(
XCore::GRRegsRegisterClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
- ArgValues.push_back(DAG.getCopyFromReg(Root, dl, VReg, RegVT));
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
}
} else {
// sanity check
@@ -662,9 +854,9 @@ LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
if (ObjSize > StackSlotSize) {
- cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
- << VA.getLocVT().getSimpleVT()
- << "\n";
+ errs() << "LowerFormalArguments Unhandled argument type: "
+ << (unsigned)VA.getLocVT().getSimpleVT().SimpleTy
+ << "\n";
}
// Create the frame index object for this incoming parameter...
int FI = MFI->CreateFixedObject(ObjSize,
@@ -673,7 +865,7 @@ LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
// Create the SelectionDAG nodes corresponding to a load
//from this parameter
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
- ArgValues.push_back(DAG.getLoad(VA.getLocVT(), dl, Root, FIN, NULL, 0));
+ InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, NULL, 0));
}
}
@@ -702,14 +894,14 @@ LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
unsigned VReg = RegInfo.createVirtualRegister(
XCore::GRRegsRegisterClass);
RegInfo.addLiveIn(ArgRegs[i], VReg);
- SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
// Move argument from virt reg -> stack
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
MemOps.push_back(Store);
}
if (!MemOps.empty())
- Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
} else {
// This will point to the next argument passed via stack.
XFI->setVarArgsFrameIndex(
@@ -717,34 +909,29 @@ LowerCCCArguments(SDValue Op, SelectionDAG &DAG)
}
}
- ArgValues.push_back(Root);
-
- // Return the new list of results.
- std::vector<MVT> RetVT(Op.getNode()->value_begin(),
- Op.getNode()->value_end());
- return DAG.getNode(ISD::MERGE_VALUES, dl, RetVT,
- &ArgValues[0], ArgValues.size());
+ return Chain;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-SDValue XCoreTargetLowering::
-LowerRET(SDValue Op, SelectionDAG &DAG)
-{
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) {
+
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
- unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
- bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
- DebugLoc dl = Op.getDebugLoc();
// CCState - Info about the registers and stack slot.
- CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
- // Analize return values of ISD::RET
- CCInfo.AnalyzeReturn(Op.getNode(), RetCC_XCore);
+  // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
// If this is the first return lowered for this function, add
// the regs to the liveout set for the function.
@@ -754,8 +941,6 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
}
- // The chain is always operand #0
- SDValue Chain = Op.getOperand(0);
SDValue Flag;
// Copy the result values into the output registers.
@@ -763,10 +948,8 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- // ISD::RET => ret chain, (regnum1,val1), ...
- // So i*2+1 index only the regnums
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- Op.getOperand(i*2+1), Flag);
+ Outs[i].Val, Flag);
// guarantee that all emitted copies are
// stuck together, avoiding something bad
@@ -788,7 +971,8 @@ LowerRET(SDValue Op, SelectionDAG &DAG)
MachineBasicBlock *
XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
assert((MI->getOpcode() == XCore::SELECT_CC) &&
@@ -816,9 +1000,18 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
- // Update machine-CFG edges by transferring all successors of the current
+ // Update machine-CFG edges by first adding all successors of the current
// block to the new block which will contain the Phi node for the select.
- sinkMBB->transferSuccessors(BB);
+ // Also inform sdisel of the edge changes.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I) {
+ EM->insert(std::make_pair(*I, sinkMBB));
+ sinkMBB->addSuccessor(*I);
+ }
+  // Next, remove all successors of the current block.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
@@ -844,6 +1037,56 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
}
//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::STORE: {
+ // Replace unaligned store of unaligned load with memmove.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!DCI.isBeforeLegalize() ||
+ allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+ ST->isVolatile() || ST->isIndexed()) {
+ break;
+ }
+ SDValue Chain = ST->getChain();
+
+ unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+ if (StoreBits % 8) {
+ break;
+ }
+ unsigned ABIAlignment = getTargetData()->getABITypeAlignment(
+ ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+ unsigned Alignment = ST->getAlignment();
+ if (Alignment >= ABIAlignment) {
+ break;
+ }
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+ if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+ LD->getAlignment() == Alignment &&
+ !LD->isVolatile() && !LD->isIndexed() &&
+ Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+ return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+ LD->getBasePtr(),
+ DAG.getConstant(StoreBits/8, MVT::i32),
+ Alignment, ST->getSrcValue(),
+ ST->getSrcValueOffset(), LD->getSrcValue(),
+ LD->getSrcValueOffset());
+ }
+ }
+ break;
+ }
+ }
+ return SDValue();
+}
+
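
The new STORE combine runs before legalization and looks for a misaligned store whose value is a single-use misaligned load of the same width, with no intervening side effects on the chain; the pair is then replaced by one memmove of StoreBits/8 bytes. In source terms (illustrative):

    #include <cstring>
    void copy_word(void *dst, const void *src) {
      // a misaligned 4-byte load feeding a misaligned 4-byte store
      // is rewritten as the equivalent:
      std::memmove(dst, src, 4);
    }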
+//===----------------------------------------------------------------------===//
// Addressing mode description hooks
//===----------------------------------------------------------------------===//
@@ -867,44 +1110,35 @@ static inline bool isImmUs4(int64_t val)
bool
XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
const Type *Ty) const {
- MVT VT = getValueType(Ty, true);
- // Get expected value type after legalization
- switch (VT.getSimpleVT()) {
- // Legal load / stores
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- break;
- // Expand i1 -> i8
- case MVT::i1:
- VT = MVT::i8;
- break;
- // Everything else is lowered to words
- default:
- VT = MVT::i32;
- break;
- }
+ // Be conservative with void
+ // FIXME: Can we be more aggressive?
+ if (Ty->getTypeID() == Type::VoidTyID)
+ return false;
+
+ const TargetData *TD = TM.getTargetData();
+ unsigned Size = TD->getTypeAllocSize(Ty);
if (AM.BaseGV) {
- return VT == MVT::i32 && !AM.HasBaseReg && AM.Scale == 0 &&
+ return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
AM.BaseOffs%4 == 0;
}
- switch (VT.getSimpleVT()) {
- default:
- return false;
- case MVT::i8:
+ switch (Size) {
+ case 1:
// reg + imm
if (AM.Scale == 0) {
return isImmUs(AM.BaseOffs);
}
+ // reg + reg
return AM.Scale == 1 && AM.BaseOffs == 0;
- case MVT::i16:
+ case 2:
+ case 3:
// reg + imm
if (AM.Scale == 0) {
return isImmUs2(AM.BaseOffs);
}
+ // reg + reg<<1
return AM.Scale == 2 && AM.BaseOffs == 0;
- case MVT::i32:
+ default:
// reg + imm
if (AM.Scale == 0) {
return isImmUs4(AM.BaseOffs);
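
The rewritten hook keys the legal addressing forms off the allocated size of the accessed type rather than its legalized MVT. Schematically (the scaled-register case for word accesses falls outside this hunk):

    // Size 1 (e.g. char):   reg + imm (isImmUs)   or  reg + reg
    // Size 2-3:             reg + imm (isImmUs2)  or  reg + (reg << 1)
    // Size >= 4 (words):    reg + imm (isImmUs4)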
@@ -922,7 +1156,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
std::vector<unsigned> XCoreTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const
+ EVT VT) const
{
if (Constraint.size() != 1)
return std::vector<unsigned>();
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 753ea81..ef8555e 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -79,7 +79,8 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB,
+ DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
virtual bool isLegalAddressingMode(const AddrMode &AM,
const Type *Ty) const;
@@ -92,18 +93,31 @@ namespace llvm {
const XCoreSubtarget &Subtarget;
// Lower Operand helpers
- SDValue LowerCCCArguments(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCCCCallTo(SDValue Op, SelectionDAG &DAG, unsigned CC);
- SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode*TheCall,
- unsigned CallingConv, SelectionDAG &DAG);
+ SDValue LowerCCCArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
SDValue getGlobalAddressWrapper(SDValue GA, GlobalValue *GV,
SelectionDAG &DAG);
// Lower Operand specifics
- SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
- SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
- SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
@@ -116,10 +130,35 @@ namespace llvm {
// Inline asm support
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ EVT VT) const;
// Expand specifics
SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG);
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals);
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG);
};
}
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index 504d202..e616fe6 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "XCoreGenInstrInfo.inc"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
namespace llvm {
namespace XCore {
@@ -36,7 +37,7 @@ namespace XCore {
using namespace llvm;
-XCoreInstrInfo::XCoreInstrInfo(void)
+XCoreInstrInfo::XCoreInstrInfo()
: TargetInstrInfoImpl(XCoreInsts, array_lengthof(XCoreInsts)),
RI(*this) {
}
@@ -115,30 +116,6 @@ XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
-/// isInvariantLoad - Return true if the specified instruction (which is marked
-/// mayLoad) is loading from a location whose value is invariant across the
-/// function. For example, loading a value from the constant pool or from
-/// from the argument area of a function if it does not change. This should
-/// only return true of *all* loads the instruction does are invariant (if it
-/// does multiple loads).
-bool
-XCoreInstrInfo::isInvariantLoad(const MachineInstr *MI) const {
- // Loads from constants pools and loads from invariant argument slots are
- // invariant
- int Opcode = MI->getOpcode();
- if (Opcode == XCore::LDWCP_ru6 || Opcode == XCore::LDWCP_lru6) {
- return MI->getOperand(1).isCPI();
- }
- int FrameIndex;
- if (isLoadFromStackSlot(MI, FrameIndex)) {
- const MachineFrameInfo &MFI =
- *MI->getParent()->getParent()->getFrameInfo();
- return MFI.isFixedObjectIndex(FrameIndex) &&
- MFI.isImmutableObjectIndex(FrameIndex);
- }
- return false;
-}
-
//===----------------------------------------------------------------------===//
// Branch Analysis
//===----------------------------------------------------------------------===//
@@ -186,7 +163,7 @@ static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc)
static inline unsigned GetCondBranchFromCond(XCore::CondCode CC)
{
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case XCore::COND_TRUE : return XCore::BRFT_lru6;
case XCore::COND_FALSE : return XCore::BRFF_lru6;
}
@@ -197,7 +174,7 @@ static inline unsigned GetCondBranchFromCond(XCore::CondCode CC)
static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
{
switch (CC) {
- default: assert(0 && "Illegal condition code!");
+ default: llvm_unreachable("Illegal condition code!");
case XCore::COND_TRUE : return XCore::COND_FALSE;
case XCore::COND_FALSE : return XCore::COND_TRUE;
}
@@ -402,14 +379,6 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0);
}
-void XCoreInstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
- bool isKill, SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const
-{
- assert(0 && "unimplemented\n");
-}
-
void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, int FrameIndex,
@@ -422,14 +391,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0);
}
-void XCoreInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const
-{
- assert(0 && "unimplemented\n");
-}
-
bool XCoreInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 0870886..24230ac 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -22,7 +22,7 @@ namespace llvm {
class XCoreInstrInfo : public TargetInstrInfoImpl {
const XCoreRegisterInfo RI;
public:
- XCoreInstrInfo(void);
+ XCoreInstrInfo();
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@@ -52,8 +52,6 @@ public:
virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const;
- virtual bool isInvariantLoad(const MachineInstr *MI) const;
-
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -76,21 +74,11 @@ public:
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC) const;
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 65cd4fe..4b9ea7a 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -23,18 +23,6 @@
include "XCoreInstrFormats.td"
//===----------------------------------------------------------------------===//
-// Feature predicates.
-//===----------------------------------------------------------------------===//
-
-// HasXS1A - This predicate is true when the target processor supports XS1A
-// instructions.
-def HasXS1A : Predicate<"Subtarget.isXS1A()">;
-
-// HasXS1B - This predicate is true when the target processor supports XS1B
-// instructions.
-def HasXS1B : Predicate<"Subtarget.isXS1B()">;
-
-//===----------------------------------------------------------------------===//
// XCore specific DAG Nodes.
//
@@ -95,6 +83,12 @@ def neg_xform : SDNodeXForm<imm, [{
return getI32Imm(-value);
}]>;
+def bpwsub_xform : SDNodeXForm<imm, [{
+ // Transformation function: 32-imm
+ uint32_t value = N->getZExtValue();
+ return getI32Imm(32-value);
+}]>;
+
def div4neg_xform : SDNodeXForm<imm, [{
// Transformation function: -imm/4
uint32_t value = N->getZExtValue();
@@ -136,9 +130,6 @@ def immU20 : PatLeaf<(imm), [{
return (uint32_t)N->getZExtValue() < (1 << 20);
}]>;
-// FIXME check subtarget. Currently we check if the immediate
-// is in the common subset of legal immediate values for both
-// XS1A and XS1B.
def immMskBitp : PatLeaf<(imm), [{
uint32_t value = (uint32_t)N->getZExtValue();
if (!isMask_32(value)) {
@@ -151,9 +142,6 @@ def immMskBitp : PatLeaf<(imm), [{
|| msksize == 32;
}]>;
-// FIXME check subtarget. Currently we check if the immediate
-// is in the common subset of legal immediate values for both
-// XS1A and XS1B.
def immBitp : PatLeaf<(imm), [{
uint32_t value = (uint32_t)N->getZExtValue();
return (value >= 1 && value <= 8)
@@ -162,6 +150,14 @@ def immBitp : PatLeaf<(imm), [{
|| value == 32;
}]>;
+def immBpwSubBitp : PatLeaf<(imm), [{
+ uint32_t value = (uint32_t)N->getZExtValue();
+ return (value >= 24 && value <= 31)
+ || value == 16
+ || value == 8
+ || value == 0;
+}]>;
+
def lda16f : PatFrag<(ops node:$addr, node:$offset),
(add node:$addr, (shl node:$offset, 1))>;
def lda16b : PatFrag<(ops node:$addr, node:$offset),
@@ -469,7 +465,7 @@ def ST8_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
}
// Four operand long
-let Predicates = [HasXS1B], Constraints = "$src1 = $dst1,$src2 = $dst2" in {
+let Constraints = "$src1 = $dst1,$src2 = $dst2" in {
def MACCU_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
(ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
GRRegs:$src4),
@@ -485,7 +481,6 @@ def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
// Five operand long
-let Predicates = [HasXS1B] in {
def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
(ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
"ladd $dst1, $dst2, $src1, $src2, $src3",
@@ -500,7 +495,6 @@ def LDIV_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
(ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
"ldiv $dst1, $dst2, $src1, $src2, $src3",
[]>;
-}
// Six operand long
@@ -510,13 +504,6 @@ def LMUL_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2),
"lmul $dst1, $dst2, $src1, $src2, $src3, $src4",
[]>;
-let Predicates = [HasXS1A] in
-def MACC_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2),
- (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
- GRRegs:$src4),
- "macc $dst1, $dst2, $src1, $src2, $src3, $src4",
- []>;
-
// Register - U6
//let Uses = [DP] in ...
@@ -664,13 +651,12 @@ def BRFU_lu6 : _FLU6<
}
//let Uses = [CP] in ...
-let Predicates = [HasXS1B], Defs = [R11], neverHasSideEffects = 1,
- isReMaterializable = 1 in
+let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
def LDAWCP_u6: _FRU6<(outs), (ins MEMii:$a),
"ldaw r11, cp[$a]",
[]>;
-let Predicates = [HasXS1B], Defs = [R11], isReMaterializable = 1 in
+let Defs = [R11], isReMaterializable = 1 in
def LDAWCP_lu6: _FLRU6<
(outs), (ins MEMii:$a),
"ldaw r11, cp[$a]",
@@ -821,7 +807,7 @@ def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
(LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
-def : Pat<(zextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
(LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
@@ -989,3 +975,21 @@ def : Pat<(mul GRRegs:$src, -3),
def : Pat<(sra GRRegs:$src, 31),
(ASHR_l2rus GRRegs:$src, 32)>;
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+ (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+ (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+ (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+ (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+ (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
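
Two idioms are matched by the new patterns: sign tests via a 32-bit arithmetic shift (on XCore, ashr x, 32 yields 0 or -1, the sign mask), and the shift-pair sign extension, which bpwsub_xform turns into a sext of width 32 - imm. Equivalent C++ (illustrative; function names are not from this patch):

    #include <cstdint>
    int32_t sext16(int32_t x) {
      // matched by the last pattern as: sext x, 16  (imm = 16, width 32-16)
      return static_cast<int32_t>(static_cast<uint32_t>(x) << 16) >> 16;
    }
    bool is_negative(int32_t x) {
      return x < 0;  // selected via ashr x, 32 feeding brft/eq/select
    }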
diff --git a/lib/Target/XCore/XCoreMCAsmInfo.cpp b/lib/Target/XCore/XCoreMCAsmInfo.cpp
new file mode 100644
index 0000000..dffdda9
--- /dev/null
+++ b/lib/Target/XCore/XCoreMCAsmInfo.cpp
@@ -0,0 +1,31 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+using namespace llvm;
+
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, const StringRef &TT) {
+ SupportsDebugInformation = true;
+ Data16bitsDirective = "\t.short\t";
+ Data32bitsDirective = "\t.long\t";
+ Data64bitsDirective = 0;
+ ZeroDirective = "\t.space\t";
+ CommentString = "#";
+
+ PrivateGlobalPrefix = ".L";
+ AscizDirective = ".asciiz";
+ WeakDefDirective = "\t.weak\t";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ // Debug
+ HasLEB128 = true;
+ AbsoluteDebugSectionOffsets = true;
+}
+
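
Illustrative assembler output these settings correspond to (hand-written, not produced by this patch):

    // .short 1234     <- Data16bitsDirective
    // .long  56789    <- Data32bitsDirective
    // .space 8        <- ZeroDirective (there is no 64-bit data directive)
    // # line comments use '#' (CommentString)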
diff --git a/lib/Target/XCore/XCoreMCAsmInfo.h b/lib/Target/XCore/XCoreMCAsmInfo.h
new file mode 100644
index 0000000..01f8e48
--- /dev/null
+++ b/lib/Target/XCore/XCoreMCAsmInfo.h
@@ -0,0 +1,29 @@
+//=====-- XCoreMCAsmInfo.h - XCore asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETASMINFO_H
+#define XCORETARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+ class Target;
+ class StringRef;
+ class XCoreMCAsmInfo : public MCAsmInfo {
+ public:
+ explicit XCoreMCAsmInfo(const Target &T, const StringRef &TT);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 82cd92d..136a035 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -30,6 +30,8 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -142,9 +144,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
if (!isU6 && !isImmU16(Amount)) {
// FIX could emit multiple instructions in this case.
- cerr << "eliminateCallFramePseudoInstr size too big: "
- << Amount << "\n";
- abort();
+#ifndef NDEBUG
+ errs() << "eliminateCallFramePseudoInstr size too big: "
+ << Amount << "\n";
+#endif
+ llvm_unreachable(0);
}
MachineInstr *New;
@@ -167,8 +171,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+unsigned
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
DebugLoc dl = MI.getDebugLoc();
@@ -187,12 +193,13 @@ void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int StackSize = MF.getFrameInfo()->getStackSize();
#ifndef NDEBUG
- DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n";
- DOUT << "<--------->\n";
- MI.print(DOUT);
- DOUT << "FrameIndex : " << FrameIndex << "\n";
- DOUT << "FrameOffset : " << Offset << "\n";
- DOUT << "StackSize : " << StackSize << "\n";
+ DEBUG(errs() << "\nFunction : "
+ << MF.getFunction()->getName() << "\n");
+ DEBUG(errs() << "<--------->\n");
+ DEBUG(MI.print(errs()));
+ DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
+ DEBUG(errs() << "FrameOffset : " << Offset << "\n");
+ DEBUG(errs() << "StackSize : " << StackSize << "\n");
#endif
Offset += StackSize;
@@ -203,10 +210,7 @@ void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(Offset%4 == 0 && "Misaligned stack offset");
- #ifndef NDEBUG
- DOUT << "Offset : " << Offset << "\n";
- DOUT << "<--------->\n";
- #endif
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
Offset/=4;
@@ -224,63 +228,65 @@ void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
bool isUs = isImmUs(Offset);
unsigned FramePtr = XCore::R10;
- MachineInstr *New = 0;
if (!isUs) {
if (!RS) {
- cerr << "eliminateFrameIndex Frame size too big: " << Offset << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "eliminateFrameIndex Frame size too big: " << Offset;
+ llvm_report_error(Msg.str());
}
unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II,
SPAdj);
loadConstant(MBB, II, ScratchReg, Offset, dl);
switch (MI.getOpcode()) {
case XCore::LDWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
.addReg(FramePtr)
.addReg(ScratchReg, RegState::Kill);
break;
case XCore::STWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::STW_3r))
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_3r))
.addReg(Reg, getKillRegState(isKill))
.addReg(FramePtr)
.addReg(ScratchReg, RegState::Kill);
break;
case XCore::LDAWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
.addReg(FramePtr)
.addReg(ScratchReg, RegState::Kill);
break;
default:
- assert(0 && "Unexpected Opcode\n");
+ llvm_unreachable("Unexpected Opcode");
}
} else {
switch (MI.getOpcode()) {
case XCore::LDWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
.addReg(FramePtr)
.addImm(Offset);
break;
case XCore::STWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
.addReg(Reg, getKillRegState(isKill))
.addReg(FramePtr)
.addImm(Offset);
break;
case XCore::LDAWFI:
- New = BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
.addReg(FramePtr)
.addImm(Offset);
break;
default:
- assert(0 && "Unexpected Opcode\n");
+ llvm_unreachable("Unexpected Opcode");
}
}
} else {
bool isU6 = isImmU6(Offset);
if (!isU6 && !isImmU16(Offset)) {
- // FIXME could make this work for LDWSP, LDAWSP.
- cerr << "eliminateFrameIndex Frame size too big: " << Offset << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "eliminateFrameIndex Frame size too big: " << Offset;
+ llvm_report_error(Msg.str());
}
switch (MI.getOpcode()) {
@@ -302,11 +308,12 @@ void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
.addImm(Offset);
break;
default:
- assert(0 && "Unexpected Opcode\n");
+ llvm_unreachable("Unexpected Opcode");
}
}
// Erase old instruction.
MBB.erase(II);
+ return 0;
}
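
When the folded-in frame offset does not fit an immediate, the code above scavenges a register, materializes the word offset with loadConstant (Offset was divided by 4 earlier), and selects the three-register form. Schematically, for a load:

    // ldc  rS, <offset-in-words>   ; scavenged register
    // ldw  rD, r10[rS]             ; LDW_3r: load at fp + 4*rS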
void
@@ -354,8 +361,10 @@ loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// TODO use mkmsk if possible.
if (!isImmU16(Value)) {
// TODO use constant pool.
- cerr << "loadConstant value too big " << Value << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "loadConstant value too big " << Value;
+ llvm_report_error(Msg.str());
}
int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
@@ -368,8 +377,10 @@ storeToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Offset/=4;
bool isU6 = isImmU6(Offset);
if (!isU6 && !isImmU16(Offset)) {
- cerr << "storeToStack offset too big " << Offset << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "storeToStack offset too big " << Offset;
+ llvm_report_error(Msg.str());
}
int Opcode = isU6 ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
BuildMI(MBB, I, dl, TII.get(Opcode))
@@ -384,8 +395,10 @@ loadFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Offset/=4;
bool isU6 = isImmU6(Offset);
if (!isU6 && !isImmU16(Offset)) {
- cerr << "loadFromStack offset too big " << Offset << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "loadFromStack offset too big " << Offset;
+ llvm_report_error(Msg.str());
}
int Opcode = isU6 ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
BuildMI(MBB, I, dl, TII.get(Opcode), DstReg)
@@ -414,8 +427,10 @@ void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const {
if (!isU6 && !isImmU16(FrameSize)) {
// FIXME could emit multiple instructions.
- cerr << "emitPrologue Frame size too big: " << FrameSize << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "emitPrologue Frame size too big: " << FrameSize;
+ llvm_report_error(Msg.str());
}
bool emitFrameMoves = needsFrameMoves(MF);
@@ -538,8 +553,10 @@ void XCoreRegisterInfo::emitEpilogue(MachineFunction &MF,
if (!isU6 && !isImmU16(FrameSize)) {
// FIXME could emit multiple instructions.
- cerr << "emitEpilogue Frame size too big: " << FrameSize << "\n";
- abort();
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "emitEpilogue Frame size too big: " << FrameSize;
+ llvm_report_error(Msg.str());
}
if (FrameSize) {
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 00b7caa..a7df510 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -57,8 +57,9 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS = NULL) const;
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, int *Value = NULL,
+ RegScavenger *RS = NULL) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index dc53da4..78a6fa5 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -13,16 +13,8 @@
#include "XCoreSubtarget.h"
#include "XCore.h"
-#include "XCoreGenSubtarget.inc"
using namespace llvm;
-XCoreSubtarget::XCoreSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS)
- : IsXS1A(false),
- IsXS1B(false)
+XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &FS)
{
- std::string CPU = "xs1a-generic";
-
- // Parse features string.
- ParseSubtargetFeatures(FS, CPU);
}
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index ff6475b..f8be3ec 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -20,21 +20,14 @@
#include <string>
namespace llvm {
-class Module;
class XCoreSubtarget : public TargetSubtarget {
- bool IsXS1A;
- bool IsXS1B;
public:
/// This constructor initializes the data members to match that
- /// of the specified module.
+ /// of the specified triple.
///
- XCoreSubtarget(const TargetMachine &TM, const Module &M,
- const std::string &FS);
-
- bool isXS1A() const { return IsXS1A; }
- bool isXS1B() const { return IsXS1B; }
+ XCoreSubtarget(const std::string &TT, const std::string &FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index b72225f..75f2055 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -10,38 +10,20 @@
//
//===----------------------------------------------------------------------===//
-#include "XCoreTargetAsmInfo.h"
+#include "XCoreMCAsmInfo.h"
#include "XCoreTargetMachine.h"
#include "XCore.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-/// XCoreTargetMachineModule - Note that this is used on hosts that
-/// cannot link in a library unless there are references into the
-/// library. In particular, it seems that it is not possible to get
-/// things to work on Win32 without this. Though it is unused, do not
-/// remove it.
-extern "C" int XCoreTargetMachineModule;
-int XCoreTargetMachineModule = 0;
-
-namespace {
- // Register the target.
- RegisterTarget<XCoreTargetMachine> X("xcore", "XCore");
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeXCoreTarget() { }
-
-const TargetAsmInfo *XCoreTargetMachine::createTargetAsmInfo() const {
- return new XCoreTargetAsmInfo(*this);
-}
-
/// XCoreTargetMachine ctor - Create an ILP32 architecture model
///
-XCoreTargetMachine::XCoreTargetMachine(const Module &M, const std::string &FS)
- : Subtarget(*this, M, FS),
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS),
DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
"i16:16:32-i32:32:32-i64:32:32"),
InstrInfo(),
@@ -49,26 +31,14 @@ XCoreTargetMachine::XCoreTargetMachine(const Module &M, const std::string &FS)
TLInfo(*this) {
}
-unsigned XCoreTargetMachine::getModuleMatchQuality(const Module &M) {
- std::string TT = M.getTargetTriple();
- if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "xcore-")
- return 20;
-
- // Otherwise we don't match.
- return 0;
-}
-
bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
PM.add(createXCoreISelDag(*this));
return false;
}
-bool XCoreTargetMachine::addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose,
- raw_ostream &Out) {
- // Output assembly language.
- PM.add(createXCoreCodePrinterPass(Out, *this, Verbose));
- return false;
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTarget() {
+ RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
+ RegisterAsmInfo<XCoreMCAsmInfo> Y(TheXCoreTarget);
}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 2385aed..b0b1464 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -23,20 +23,15 @@
namespace llvm {
-class Module;
-
class XCoreTargetMachine : public LLVMTargetMachine {
XCoreSubtarget Subtarget;
const TargetData DataLayout; // Calculates type size & alignment
XCoreInstrInfo InstrInfo;
XCoreFrameInfo FrameInfo;
XCoreTargetLowering TLInfo;
-
-protected:
- virtual const TargetAsmInfo *createTargetAsmInfo() const;
-
public:
- XCoreTargetMachine(const Module &M, const std::string &FS);
+ XCoreTargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const XCoreFrameInfo *getFrameInfo() const { return &FrameInfo; }
@@ -49,13 +44,9 @@ public:
return &InstrInfo.getRegisterInfo();
}
virtual const TargetData *getTargetData() const { return &DataLayout; }
- static unsigned getModuleMatchQuality(const Module &M);
// Pass Pipeline Configuration
virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
- virtual bool addAssemblyEmitter(PassManagerBase &PM,
- CodeGenOpt::Level OptLevel,
- bool Verbose, raw_ostream &Out);
};
} // end namespace llvm
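
With getModuleMatchQuality gone, clients no longer probe each target against
a Module; they look the target up by triple and construct the machine through
the Target's factory hooks. A minimal sketch of the new client-side flow
(the triple string and the error handling are illustrative assumptions):

    #include "llvm/Target/TargetRegistry.h"
    #include "llvm/Target/TargetMachine.h"
    #include <string>

    llvm::TargetMachine *createXCoreTM() {
      std::string Err;
      const llvm::Target *T =
          llvm::TargetRegistry::lookupTarget("xcore-unknown-unknown", Err);
      if (!T)
        return 0; // Err explains the failure, e.g. target not linked in.
      // The second argument is the subtarget feature string (empty here);
      // both strings end up in the two-argument ctor added in this patch.
      return T->createTargetMachine("xcore-unknown-unknown", "");
    }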
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 0000000..7de3b55
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,67 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "MCSectionXCore.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+ DataSection =
+ MCSectionXCore::Create(".dp.data", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE |
+ MCSectionXCore::SHF_DP_SECTION,
+ SectionKind::getDataRel(), false, getContext());
+ BSSSection =
+ MCSectionXCore::Create(".dp.bss", MCSectionELF::SHT_NOBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE |
+ MCSectionXCore::SHF_DP_SECTION,
+ SectionKind::getBSS(), false, getContext());
+
+ MergeableConst4Section =
+ MCSectionXCore::Create(".cp.rodata.cst4", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+ MCSectionXCore::SHF_CP_SECTION,
+ SectionKind::getMergeableConst4(), false,
+ getContext());
+ MergeableConst8Section =
+ MCSectionXCore::Create(".cp.rodata.cst8", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+ MCSectionXCore::SHF_CP_SECTION,
+ SectionKind::getMergeableConst8(), false,
+ getContext());
+ MergeableConst16Section =
+ MCSectionXCore::Create(".cp.rodata.cst16", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+ MCSectionXCore::SHF_CP_SECTION,
+ SectionKind::getMergeableConst16(), false,
+ getContext());
+
+ // TLS globals are lowered in the backend to arrays indexed by the current
+ // thread id. After lowering they require no special handling by the linker
+ // and can be placed in the standard data / bss sections.
+ TLSDataSection = DataSection;
+ TLSBSSSection = BSSSection;
+
+ ReadOnlySection =
+ MCSectionXCore::Create(".cp.rodata", MCSectionELF::SHT_PROGBITS,
+ MCSectionELF::SHF_ALLOC |
+ MCSectionXCore::SHF_CP_SECTION,
+ SectionKind::getReadOnlyWithRel(), false,
+ getContext());
+
+ // Dynamic linking is not supported. Data with relocations is placed in the
+ // same section as data without relocations.
+ DataRelSection = DataRelLocalSection = DataSection;
+ DataRelROSection = DataRelROLocalSection = ReadOnlySection;
+}
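
To make the TLS comment above concrete, here is a sketch of what the
lowering amounts to at the source level. The 8-slot array size and the
intrinsic name are assumptions for illustration; only the
array-indexed-by-thread-id shape comes from the comment in this file.

    #include <cstdint>

    // Stand-in for the XCore instruction that returns the id of the
    // executing hardware thread (hypothetical name).
    extern "C" uint32_t xcore_get_thread_id();

    // A thread-local "int counter;" becomes, conceptually, one slot per
    // hardware thread in the ordinary .dp.data section:
    static int counter[8]; // assuming 8 threads per core

    int &tls_counter() {
      return counter[xcore_get_thread_id()];
    }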
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 0000000..7efb990
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,26 @@
+//===-- llvm/Target/XCoreTargetObjectFile.h - XCore Object Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+#define LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+ class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+
+ void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+ // TODO: Classify globals as xcore wishes.
+ };
+} // end namespace llvm
+
+#endif
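
For context, the lowering code is the usual consumer of this class: it hands
a freshly allocated instance to its TargetLowering base, which takes
ownership and calls Initialize once an MCContext exists. A minimal sketch of
that wiring under those assumptions (the XCore ctor itself is not in this
diff, and DemoLowering is a hypothetical stand-in for XCoreTargetLowering):

    #include "XCoreTargetObjectFile.h"

    namespace llvm {
    class DemoLowering {
      TargetLoweringObjectFile *TLOF; // owned; freed when lowering dies
    public:
      DemoLowering() : TLOF(new XCoreTargetObjectFile()) {}
      ~DemoLowering() { delete TLOF; }
    };
    } // end namespace llvm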