diff options
Diffstat (limited to 'lib/Target')
131 files changed, 8956 insertions, 2469 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 6fe7c2c..8e537d8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -643,6 +643,13 @@ ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. + if (DestRC == ARM::tGPRRegisterClass) + DestRC = ARM::GPRRegisterClass; + if (SrcRC == ARM::tGPRRegisterClass) + SrcRC = ARM::GPRRegisterClass; + if (DestRC != SrcRC) { if (DestRC->getSize() != SrcRC->getSize()) return false; @@ -697,6 +704,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MFI.getObjectSize(FI), Align); + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. + if (RC == ARM::tGPRRegisterClass) + RC = ARM::GPRRegisterClass; + if (RC == ARM::GPRRegisterClass) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) .addReg(SrcReg, getKillRegState(isKill)) @@ -745,6 +757,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MFI.getObjectSize(FI), Align); + // tGPR is used sometimes in ARM instructions that need to avoid using + // certain registers. Just treat it as GPR here. 
+ if (RC == ARM::tGPRRegisterClass) + RC = ARM::GPRRegisterClass; + if (RC == ARM::GPRRegisterClass) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); @@ -1020,9 +1037,8 @@ ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { return MI; } -bool ARMBaseInstrInfo::isIdentical(const MachineInstr *MI0, - const MachineInstr *MI1, - const MachineRegisterInfo *MRI) const { +bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const { int Opcode = MI0->getOpcode(); if (Opcode == ARM::t2LDRpci || Opcode == ARM::t2LDRpci_pic || @@ -1051,7 +1067,7 @@ bool ARMBaseInstrInfo::isIdentical(const MachineInstr *MI0, return ACPV0->hasSameValue(ACPV1); } - return TargetInstrInfoImpl::isIdentical(MI0, MI1, MRI); + return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); } /// getInstrPredicate - If instruction is predicated, returns its predicate diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 0d9d4a7..e095acc 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -289,8 +289,8 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; - virtual bool isIdentical(const MachineInstr *MI, const MachineInstr *Other, - const MachineRegisterInfo *MRI) const; + virtual bool produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const; }; static inline diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 91e3550..8044966 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -513,7 +513,7 @@ cannotEliminateFrame(const MachineFunction &MF) const { } /// estimateStackSize - Estimate and return the size of the frame. 
-static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) { +static unsigned estimateStackSize(MachineFunction &MF) { const MachineFrameInfo *FFI = MF.getFrameInfo(); int Offset = 0; for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { @@ -671,8 +671,16 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } + // If any of the stack slot references may be out of range of an immediate + // offset, make sure a register (or a spill slot) is available for the + // register scavenger. Note that if we're indexing off the frame pointer, the + // effective stack size is 4 bytes larger since the FP points to the stack + // slot of the previous FP. + bool BigStack = RS && + estimateStackSize(MF) + (hasFP(MF) ? 4 : 0) >= estimateRSStackSizeLimit(MF); + bool ExtraCSSpill = false; - if (!CanEliminateFrame || cannotEliminateFrame(MF)) { + if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. @@ -727,51 +735,43 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // callee-saved register or reserve a special spill slot to facilitate // register scavenging. Thumb1 needs a spill slot for stack pointer // adjustments also, even when the frame itself is small. - if (RS && !ExtraCSSpill) { - MachineFrameInfo *MFI = MF.getFrameInfo(); - // If any of the stack slot references may be out of range of an - // immediate offset, make sure a register (or a spill slot) is - // available for the register scavenger. Note that if we're indexing - // off the frame pointer, the effective stack size is 4 bytes larger - // since the FP points to the stack slot of the previous FP. - if (estimateStackSize(MF, MFI) + (hasFP(MF) ? 4 : 0) - >= estimateRSStackSizeLimit(MF)) { - // If any non-reserved CS register isn't spilled, just spill one or two - // extra. That should take care of it! 
- unsigned NumExtras = TargetAlign / 4; - SmallVector<unsigned, 2> Extras; - while (NumExtras && !UnspilledCS1GPRs.empty()) { - unsigned Reg = UnspilledCS1GPRs.back(); - UnspilledCS1GPRs.pop_back(); + if (BigStack && !ExtraCSSpill) { + // If any non-reserved CS register isn't spilled, just spill one or two + // extra. That should take care of it! + unsigned NumExtras = TargetAlign / 4; + SmallVector<unsigned, 2> Extras; + while (NumExtras && !UnspilledCS1GPRs.empty()) { + unsigned Reg = UnspilledCS1GPRs.back(); + UnspilledCS1GPRs.pop_back(); + if (!isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + // For non-Thumb1 functions, also check for hi-reg CS registers + if (!AFI->isThumb1OnlyFunction()) { + while (NumExtras && !UnspilledCS2GPRs.empty()) { + unsigned Reg = UnspilledCS2GPRs.back(); + UnspilledCS2GPRs.pop_back(); if (!isReservedReg(MF, Reg)) { Extras.push_back(Reg); NumExtras--; } } - // For non-Thumb1 functions, also check for hi-reg CS registers - if (!AFI->isThumb1OnlyFunction()) { - while (NumExtras && !UnspilledCS2GPRs.empty()) { - unsigned Reg = UnspilledCS2GPRs.back(); - UnspilledCS2GPRs.pop_back(); - if (!isReservedReg(MF, Reg)) { - Extras.push_back(Reg); - NumExtras--; - } - } - } - if (Extras.size() && NumExtras == 0) { - for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MF.getRegInfo().setPhysRegUsed(Extras[i]); - AFI->setCSRegisterIsSpilled(Extras[i]); - } - } else if (!AFI->isThumb1OnlyFunction()) { - // note: Thumb1 functions spill to R12, not the stack. - // Reserve a slot closest to SP or frame pointer. 
- const TargetRegisterClass *RC = ARM::GPRRegisterClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + } + if (Extras.size() && NumExtras == 0) { + for (unsigned i = 0, e = Extras.size(); i != e; ++i) { + MF.getRegInfo().setPhysRegUsed(Extras[i]); + AFI->setCSRegisterIsSpilled(Extras[i]); } + } else if (!AFI->isThumb1OnlyFunction()) { + // note: Thumb1 functions spill to R12, not the stack. Reserve a slot + // closest to SP or frame pointer. + const TargetRegisterClass *RC = ARM::GPRRegisterClass; + MachineFrameInfo *MFI = MF.getFrameInfo(); + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); } } } @@ -1085,6 +1085,15 @@ hasReservedCallFrame(MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } +// canSimplifyCallFramePseudos - If there is a reserved call frame, the +// call frame pseudos can be simplified. Unlike most targets, having a FP +// is not sufficient here since we still may reference some objects via SP +// even when FP is available in Thumb2 mode. +bool ARMBaseRegisterInfo:: +canSimplifyCallFramePseudos(MachineFunction &MF) const { + return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); +} + static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -1119,13 +1128,14 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && - "This eliminateCallFramePseudoInstr does not suppor Thumb1!"); + "This eliminateCallFramePseudoInstr does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); // Replace the pseudo instruction with a new instruction... unsigned Opc = Old->getOpcode(); - ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm(); - // FIXME: Thumb2 version of ADJCALLSTACKUP and ADJCALLSTACKDOWN? 
+ int PIdx = Old->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. unsigned PredReg = Old->getOperand(2).getReg(); @@ -1149,7 +1159,6 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); @@ -1160,12 +1169,12 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } int FrameIndex = MI.getOperand(i).getIndex(); - int Offset = MFI->getObjectOffset(FrameIndex) + MFI->getStackSize() + SPAdj; unsigned FrameReg; - Offset = getFrameIndexReference(MF, FrameIndex, FrameReg); + int Offset = getFrameIndexReference(MF, FrameIndex, FrameReg); if (FrameReg != ARM::SP) SPAdj = 0; + Offset += SPAdj; // Modify MI as necessary to handle as much of 'Offset' as possible bool Done = false; @@ -1256,7 +1265,7 @@ emitPrologue(MachineFunction &MF) const { MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && - "This emitPrologue does not suppor Thumb1!"); + "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); unsigned NumBytes = MFI->getStackSize(); @@ -1417,7 +1426,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && - "This emitEpilogue does not suppor Thumb1!"); + "This emitEpilogue does not support Thumb1!"); bool isARM = 
!AFI->isThumbFunction(); unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 33ba21d..64f6ff1 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -138,6 +138,7 @@ public: virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; virtual bool hasReservedCallFrame(MachineFunction &MF) const; + virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index df4ae70..013e00a 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -58,8 +58,6 @@ public: return "ARM Instruction Selection"; } - virtual void InstructionSelect(); - /// getI32Imm - Return a target constant of type i32 with the specified /// value. inline SDValue getI32Imm(unsigned Imm) { @@ -203,11 +201,6 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { } -void ARMDAGToDAGISel::InstructionSelect() { - SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op, SDValue N, SDValue &BaseReg, diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index adf1644..6a2c6bb 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -294,6 +294,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::SELECT_CC); } computeRegisterProperties(); @@ -544,6 +545,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; 
+ case ARMISD::FMAX: return "ARMISD::FMAX"; + case ARMISD::FMIN: return "ARMISD::FMIN"; } } @@ -921,7 +924,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); - SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32); + SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -970,8 +973,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); @@ -984,8 +985,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); @@ -1283,8 +1282,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl, - DAG.GetOrdering(Chain.getNode())); + DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); return CallResult.first; } @@ -3856,23 +3854,106 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC +/// to match f32 max/min patterns to use NEON vmax/vmin instructions. 
+static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + // If the target supports NEON, try to use vmax/vmin instructions for f32 + // selects like "x < y ? x : y". Unless the FiniteOnlyFPMath option is set, + // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is + // a NaN; only do the transformation when it matches that behavior. + + // For now only do this when using NEON for FP operations; if using VFP, it + // is not obvious that the benefit outweighs the cost of switching to the + // NEON pipeline. + if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || + N->getValueType(0) != MVT::f32) + return SDValue(); + + SDValue CondLHS = N->getOperand(0); + SDValue CondRHS = N->getOperand(1); + SDValue LHS = N->getOperand(2); + SDValue RHS = N->getOperand(3); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + unsigned Opcode = 0; + bool IsReversed; + if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { + IsReversed = false; // x CC y ? x : y + } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { + IsReversed = true ; // x CC y ? y : x + } else { + return SDValue(); + } + + bool IsUnordered; + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin + // will return -0, so vmin can only be used for unsafe math or if one of + // the operands is known to be nonzero. 
+ if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; + break; + + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax + // will return +0, so vmax can only be used for unsafe math or if one of + // the operands is known to be nonzero. + if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? 
ARMISD::FMIN : ARMISD::FMAX; + break; + } + + if (!Opcode) + return SDValue(); + return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::ADD: return PerformADDCombine(N, DCI); - case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::ADD: return PerformADDCombine(N, DCI); + case ISD::SUB: return PerformSUBCombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); - case ISD::INTRINSIC_WO_CHAIN: - return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: - case ISD::SRL: - return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: - return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); } return SDValue(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 3c5df45..f8f8adc 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -131,7 +131,11 @@ namespace llvm { VREV16, // reverse elements within 16-bit halfwords VZIP, // zip (interleave) VUZP, // unzip (deinterleave) - VTRN // transpose + VTRN, // transpose + + // Floating-point max and min: + FMAX, + FMIN }; } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index db60458..76595fa 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -708,6 +708,20 @@ class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-25} = 0b000; } +class AI3lddpr<dag oops, dag iops, Format f, 
InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} + // Pre-indexed stores class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin, @@ -723,6 +737,19 @@ class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-25} = 0b000; } +class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 1; // W bit + let Inst{24} = 1; // P bit + let Inst{27-25} = 0b000; +} // Post-indexed loads class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin, @@ -734,7 +761,7 @@ class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{6} = 0; // S bit let Inst{7} = 1; let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit + let Inst{21} = 0; // W bit let Inst{24} = 0; // P bit let Inst{27-25} = 0b000; } @@ -747,7 +774,7 @@ class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{6} = 1; // S bit let Inst{7} = 1; let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit + let Inst{21} = 0; // W bit let Inst{24} = 0; // P bit let Inst{27-25} = 0b000; } @@ -760,7 +787,20 @@ class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{6} = 1; // S bit let Inst{7} = 1; let Inst{20} = 1; // L bit - let Inst{21} = 1; // W bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3lddpo<dag oops, dag iops, Format f, 
InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 0; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit let Inst{24} = 0; // P bit let Inst{27-25} = 0b000; } @@ -775,7 +815,20 @@ class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{6} = 0; // S bit let Inst{7} = 1; let Inst{20} = 0; // L bit - let Inst{21} = 1; // W bit + let Inst{21} = 0; // W bit + let Inst{24} = 0; // P bit + let Inst{27-25} = 0b000; +} +class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin, + opc, asm, cstr, pattern> { + let Inst{4} = 1; + let Inst{5} = 1; // H bit + let Inst{6} = 1; // S bit + let Inst{7} = 1; + let Inst{20} = 0; // L bit + let Inst{21} = 0; // W bit let Inst{24} = 0; // P bit let Inst{27-25} = 0b000; } @@ -1150,6 +1203,19 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre, let Inst{8} = 1; // The W bit. } +// Helper class for disassembly only +// A6.3.16 & A6.3.17 +// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. +class T2I_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + // Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. 
class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsThumb1Only, HasV5T]; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 1c6f78a..87c6f6e 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -130,8 +130,6 @@ def IsThumb2 : Predicate<"Subtarget->isThumb2()">; def IsARM : Predicate<"!Subtarget->isThumb()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; -def CarryDefIsUnused : Predicate<"!N->hasAnyUseOfValue(1)">; -def CarryDefIsUsed : Predicate<"N->hasAnyUseOfValue(1)">; // FIXME: Eventually this will be just "hasV6T2Ops". def UseMovt : Predicate<"Subtarget->useMovt()">; @@ -176,7 +174,7 @@ def imm16_31 : PatLeaf<(i32 imm), [{ return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32; }]>; -def so_imm_neg : +def so_imm_neg : PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1; }], so_imm_neg_XFORM>; @@ -194,7 +192,7 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{ /// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield /// e.g., 0xf000ffff def bf_inv_mask_imm : Operand<i32>, - PatLeaf<(imm), [{ + PatLeaf<(imm), [{ uint32_t v = (uint32_t)N->getZExtValue(); if (v == 0xffffffff) return 0; @@ -227,7 +225,7 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; }], hi16>; -/// imm0_65535 predicate - True if the 32-bit immediate is in the range +/// imm0_65535 predicate - True if the 32-bit immediate is in the range /// [0.65535]. 
def imm0_65535 : PatLeaf<(i32 imm), [{ return (uint32_t)N->getZExtValue() < 65536; @@ -236,6 +234,21 @@ def imm0_65535 : PatLeaf<(i32 imm), [{ class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; +/// adde and sube predicates - True based on whether the carry flag output +/// will be needed or not. +def adde_dead_carry : + PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS), + [{return !N->hasAnyUseOfValue(1);}]>; +def sube_dead_carry : + PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS), + [{return !N->hasAnyUseOfValue(1);}]>; +def adde_live_carry : + PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS), + [{return N->hasAnyUseOfValue(1);}]>; +def sube_live_carry : + PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS), + [{return N->hasAnyUseOfValue(1);}]>; + //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -501,6 +514,22 @@ multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> { } } +multiclass AI_unary_rrot_np<bits<8> opcod, string opc> { + def r : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src), + IIC_iUNAr, opc, "\t$dst, $src", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + let Inst{19-16} = 0b1111; + } + def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot), + IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{19-16} = 0b1111; + } +} + /// AI_bin_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. 
multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> { @@ -510,13 +539,29 @@ multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> { Requires<[IsARM, HasV6]> { let Inst{11-10} = 0b00; } - def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, + i32imm:$rot), IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", [(set GPR:$dst, (opnode GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)))]>, Requires<[IsARM, HasV6]>; } +// For disassembly only. +multiclass AI_bin_rrot_np<bits<8> opcod, string opc> { + def rr : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), + IIC_iALUr, opc, "\t$dst, $LHS, $RHS", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{11-10} = 0b00; + } + def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, + i32imm:$rot), + IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]>; +} + /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. 
let Uses = [CPSR] in { multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, @@ -524,13 +569,13 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, - Requires<[IsARM, CarryDefIsUnused]> { + Requires<[IsARM]> { let Inst{25} = 1; } def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, - Requires<[IsARM, CarryDefIsUnused]> { + Requires<[IsARM]> { let isCommutable = Commutable; let Inst{11-4} = 0b00000000; let Inst{25} = 0; @@ -538,7 +583,7 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iALUsr, opc, "\t$dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, - Requires<[IsARM, CarryDefIsUnused]> { + Requires<[IsARM]> { let Inst{25} = 0; } } @@ -549,16 +594,14 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iALUi, !strconcat(opc, "\t$dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, - Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + Requires<[IsARM]> { let Inst{20} = 1; let Inst{25} = 1; } def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, !strconcat(opc, "\t$dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, - Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + Requires<[IsARM]> { let Inst{11-4} = 0b00000000; let Inst{20} = 1; let Inst{25} = 0; @@ -566,8 +609,7 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$dst, $a, $b"), [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, - 
Requires<[IsARM, CarryDefIsUsed]> { - let Defs = [CPSR]; + Requires<[IsARM]> { let Inst{20} = 1; let Inst{25} = 0; } @@ -593,13 +635,16 @@ PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, "${instid:label} ${cpidx:cpentry}", []>; -let Defs = [SP], Uses = [SP] in { +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? +let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKUP : PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, "@ ADJCALLSTACKUP $amt1", [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; -def ADJCALLSTACKDOWN : +def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, "@ ADJCALLSTACKDOWN $amt", [(ARMcallseq_start timm:$amt)]>; @@ -633,6 +678,14 @@ def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", let Inst{7-0} = 0b00000011; } +def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel", + "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01101000; + let Inst{7-4} = 0b1011; +} + def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM, HasV6T2]> { @@ -664,6 +717,34 @@ def CPS : AXI<(outs),(ins i32imm:$opt), MiscFrm, NoItinerary, "cps${opt:cps}", let Inst{5} = 0; } +// Preload signals the memory system of possible future data/instruction access. +// These are for disassembly only. 
+multiclass APreLoad<bit data, bit read, string opc> { + + def i : AXI<(outs), (ins GPR:$base, i32imm:$imm), MiscFrm, NoItinerary, + !strconcat(opc, "\t[$base, $imm]"), []> { + let Inst{31-26} = 0b111101; + let Inst{25} = 0; // 0 for immediate form + let Inst{24} = data; + let Inst{22} = read; + let Inst{21-20} = 0b01; + } + + def r : AXI<(outs), (ins addrmode2:$addr), MiscFrm, NoItinerary, + !strconcat(opc, "\t$addr"), []> { + let Inst{31-26} = 0b111101; + let Inst{25} = 1; // 1 for register form + let Inst{24} = data; + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{4} = 0; + } +} + +defm PLD : APreLoad<1, 1, "pld">; +defm PLDW : APreLoad<1, 0, "pldw">; +defm PLI : APreLoad<0, 1, "pli">; + def SETENDBE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tbe", [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> { @@ -761,7 +842,7 @@ def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), "(${label}_${id}-(", "${:private}PCRELL${:uid}+8))\n"), !strconcat("${:private}PCRELL${:uid}:\n\t", - "add$p\t$dst, pc, #${:private}PCRELV${:uid}")), + "add$p\t$dst, pc, #${:private}PCRELV${:uid}")), []> { let Inst{25} = 1; } @@ -771,7 +852,7 @@ def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), // let isReturn = 1, isTerminator = 1, isBarrier = 1 in - def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, + def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, "bx", "\tlr", [(ARMretflag)]> { let Inst{3-0} = 0b1110; let Inst{7-4} = 0b0001; @@ -828,9 +909,10 @@ let isCall = 1, } // ARMv4T - def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops), + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. 
+ def BX : ABXIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", - [(ARMcall_nolink GPR:$func)]>, + [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, IsNotDarwin]> { let Inst{7-4} = 0b0001; let Inst{19-8} = 0b111111111111; @@ -865,9 +947,10 @@ let isCall = 1, } // ARMv4T - def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops), + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", - [(ARMcall_nolink GPR:$func)]>, Requires<[IsARM, IsDarwin]> { + [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, IsDarwin]> { let Inst{7-4} = 0b0001; let Inst{19-8} = 0b111111111111; let Inst{27-20} = 0b00010010; @@ -917,7 +1000,7 @@ let isBranch = 1, isTerminator = 1 in { } // isBarrier = 1 // FIXME: should be able to write a pattern for ARMBrcond, but can't use - // a two-value operand where a dag node expects two operands. :( + // a two-value operand where a dag node expects two operands. 
:( def Bcc : ABI<0b1010, (outs), (ins brtarget:$target), IIC_Br, "b", "\t$target", [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>; @@ -931,25 +1014,61 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", let Inst{7-4} = 0b0010; } -// Supervisor call (software interrupt) -- for disassembly only +// Secure Monitor Call is a system instruction -- for disassembly only +def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0110; + let Inst{7-4} = 0b0111; +} + +// Supervisor Call (Software Interrupt) -- for disassembly only let isCall = 1 in { def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", [/* For disassembly only; pattern left blank */]>; } +// Store Return State is a system instruction -- for disassembly only +def SRSW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), + NoItinerary, "srs${addr:submode}\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b110; // W = 1 +} + +def SRS : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode), + NoItinerary, "srs${addr:submode}\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b100; // W = 0 +} + +// Return From Exception is a system instruction -- for disassembly only +def RFEW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), + NoItinerary, "rfe${addr:submode}\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b011; // W = 1 +} + +def RFE : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base), + NoItinerary, "rfe${addr:submode}\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-28} = 0b1111; + let Inst{22-20} = 0b001; // W = 0 +} + //===----------------------------------------------------------------------===// // Load / 
store Instructions. // // Load -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def LDR : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, "ldr", "\t$dst, $addr", [(set GPR:$dst, (load addrmode2:$addr))]>; // Special LDR for loads from non-pc-relative constpools. -let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, "ldr", "\t$dst, $addr", []>; @@ -958,7 +1077,7 @@ def LDRH : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadr, "ldrh", "\t$dst, $addr", [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; -def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, +def LDRB : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr, "ldrb", "\t$dst, $addr", [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; @@ -1017,9 +1136,22 @@ def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb), def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", []>; + +// For disassembly only +def LDRD_PRE : AI3lddpr<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), + (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadr, + "ldrd", "\t$dst1, $dst2, $addr!", "$addr.base = $base_wb", []>, + Requires<[IsARM, HasV5TE]>; + +// For disassembly only +def LDRD_POST : AI3lddpo<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadr, + "ldrd", "\t$dst1, $dst2, [$base], $offset", "$base = $base_wb", []>, + Requires<[IsARM, HasV5TE]>; + } -// LDRT and LDRBT are for disassembly only. +// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. 
def LDRT : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru, @@ -1028,8 +1160,26 @@ def LDRT : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb), } def LDRBT : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, - "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru, + "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRSBT : AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am2offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRHT : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base, am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{21} = 1; // overwrite +} + +def LDRSHT : AI3ldshpo<(outs GPR:$dst, GPR:$base_wb), + (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru, + "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { let Inst{21} = 1; // overwrite } @@ -1039,8 +1189,8 @@ def STR : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, [(store GPR:$src, addrmode2:$addr)]>; // Stores with truncate -def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, IIC_iStorer, - "strh", "\t$src, $addr", +def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, + IIC_iStorer, "strh", "\t$src, $addr", [(truncstorei16 GPR:$src, addrmode3:$addr)]>; def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer, @@ -1055,51 +1205,65 @@ def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), // Indexed stores def STR_PRE : AI2stwpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base, am2offset:$offset), + (ins GPR:$src, GPR:$base, am2offset:$offset), StFrm, IIC_iStoreru, "str", "\t$src, [$base, $offset]!", "$base = 
$base_wb", [(set GPR:$base_wb, (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>; def STR_POST : AI2stwpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), + (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, IIC_iStoreru, "str", "\t$src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_store GPR:$src, GPR:$base, am2offset:$offset))]>; def STRH_PRE : AI3sthpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), + (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm, IIC_iStoreru, "strh", "\t$src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>; def STRH_POST: AI3sthpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), + (ins GPR:$src, GPR:$base,am3offset:$offset), StMiscFrm, IIC_iStoreru, "strh", "\t$src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$src, GPR:$base, am3offset:$offset))]>; def STRB_PRE : AI2stbpr<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), + (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, IIC_iStoreru, "strb", "\t$src, [$base, $offset]!", "$base = $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, GPR:$base, am2offset:$offset))]>; def STRB_POST: AI2stbpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), + (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, IIC_iStoreru, "strb", "\t$src, [$base], $offset", "$base = $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$src, GPR:$base, am2offset:$offset))]>; -// STRT and STRBT are for disassembly only. 
+// For disassembly only +def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), + (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strd", "\t$src1, $src2, [$base, $offset]!", + "$base = $base_wb", []>; + +// For disassembly only +def STRD_POST: AI3stdpo<(outs GPR:$base_wb), + (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strd", "\t$src1, $src2, [$base], $offset", + "$base = $base_wb", []>; + +// STRT, STRBT, and STRHT are for disassembly only. def STRT : AI2stwpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), + (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, IIC_iStoreru, "strt", "\t$src, [$base], $offset", "$base = $base_wb", [/* For disassembly only; pattern left blank */]> { @@ -1107,13 +1271,21 @@ def STRT : AI2stwpo<(outs GPR:$base_wb), } def STRBT : AI2stbpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am2offset:$offset), + (ins GPR:$src, GPR:$base,am2offset:$offset), StFrm, IIC_iStoreru, "strbt", "\t$src, [$base], $offset", "$base = $base_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite } +def STRHT: AI3sthpo<(outs GPR:$base_wb), + (ins GPR:$src, GPR:$base,am3offset:$offset), + StMiscFrm, IIC_iStoreru, + "strht", "\t$src, [$base], $offset", "$base = $base_wb", + [/* For disassembly only; pattern left blank */]> { + let Inst{21} = 1; // overwrite +} + //===----------------------------------------------------------------------===// // Load / store multiple Instructions. 
// @@ -1141,7 +1313,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, let Inst{25} = 0; } -def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), +def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, IIC_iMOVsr, "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { let Inst{25} = 0; @@ -1154,7 +1326,7 @@ def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi, } let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), +def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), DPFrm, IIC_iMOVi, "movw", "\t$dst, $src", [(set GPR:$dst, imm0_65535:$src)]>, @@ -1168,7 +1340,7 @@ def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm), DPFrm, IIC_iMOVi, "movt", "\t$dst, $imm", [(set GPR:$dst, - (or (and GPR:$src, 0xffff), + (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>, UnaryDP, Requires<[IsARM, HasV6T2]> { let Inst{20} = 0; @@ -1187,7 +1359,7 @@ def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi, // due to flag operands. let Defs = [CPSR] in { -def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, +def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi, "movs", "\t$dst, $src, lsr #1", [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP; def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, @@ -1211,7 +1383,11 @@ defm SXTAB : AI_bin_rrot<0b01101010, defm SXTAH : AI_bin_rrot<0b01101011, "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; -// TODO: SXT(A){B|H}16 +// For disassembly only +defm SXTB16 : AI_unary_rrot_np<0b01101000, "sxtb16">; + +// For disassembly only +defm SXTAB16 : AI_bin_rrot_np<0b01101000, "sxtab16">; // Zero extenders @@ -1235,9 +1411,9 @@ defm UXTAH : AI_bin_rrot<0b01101111, "uxtah", } // This isn't safe in general, the add is two 16-bit units, not a 32-bit add. 
-//defm UXTAB16 : xxx<"uxtab16", 0xff00ff>; +// For disassembly only +defm UXTAB16 : AI_bin_rrot_np<0b01101100, "uxtab16">; -// TODO: UXT(A){B|H}16 def SBFX : I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), @@ -1273,13 +1449,13 @@ defm SUBS : AI1_bin_s_irs<0b0010, "subs", BinOpFrag<(subc node:$LHS, node:$RHS)>>; defm ADC : AI1_adde_sube_irs<0b0101, "adc", - BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>; + BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", - BinOpFrag<(sube node:$LHS, node:$RHS)>>; + BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", - BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>; + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", - BinOpFrag<(sube node:$LHS, node:$RHS)>>; + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; // These don't define reg/reg forms, because they are handled above. def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, @@ -1313,14 +1489,14 @@ def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, let Uses = [CPSR] in { def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iALUi, "rsc", "\t$dst, $a, $b", - [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]> { + [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, + Requires<[IsARM]> { let Inst{25} = 1; } def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b", - [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]> { + [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, + Requires<[IsARM]> { let Inst{25} = 0; } } @@ -1329,15 +1505,15 @@ def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), let Defs = [CPSR], Uses = [CPSR] in { def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, 
IIC_iALUi, "rscs\t$dst, $a, $b", - [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]> { + [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>, + Requires<[IsARM]> { let Inst{20} = 1; let Inst{25} = 1; } def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iALUsr, "rscs\t$dst, $a, $b", - [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, - Requires<[IsARM, CarryDefIsUnused]> { + [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, + Requires<[IsARM]> { let Inst{20} = 1; let Inst{25} = 0; } @@ -1358,10 +1534,9 @@ def : ARMPat<(add GPR:$src, so_imm_neg:$imm), // (mul X, 2^n+1) -> (add (X << n), X) // (mul X, 2^n-1) -> (rsb X, (X << n)) -// Saturating adds/subtracts -- for disassembly only - +// ARM Arithmetic Instruction -- for disassembly only // GPR:$dst = GPR:$a op GPR:$b -class AQI<bits<8> op27_20, bits<4> op7_4, string opc> +class AAI<bits<8> op27_20, bits<4> op7_4, string opc> : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b", [/* For disassembly only; pattern left blank */]> { @@ -1369,22 +1544,116 @@ class AQI<bits<8> op27_20, bits<4> op7_4, string opc> let Inst{7-4} = op7_4; } -def QADD : AQI<0b00010000, 0b0101, "qadd">; -def QADD16 : AQI<0b01100010, 0b0001, "qadd16">; -def QADD8 : AQI<0b01100010, 0b1001, "qadd8">; -def QASX : AQI<0b01100010, 0b0011, "qasx">; -def QDADD : AQI<0b00010100, 0b0101, "qdadd">; -def QDSUB : AQI<0b00010110, 0b0101, "qdsub">; -def QSAX : AQI<0b01100010, 0b0101, "qsax">; -def QSUB : AQI<0b00010010, 0b0101, "qsub">; -def QSUB16 : AQI<0b01100010, 0b0111, "qsub16">; -def QSUB8 : AQI<0b01100010, 0b1111, "qsub8">; -def UQADD16 : AQI<0b01100110, 0b0001, "uqadd16">; -def UQADD8 : AQI<0b01100110, 0b1001, "uqadd8">; -def UQASX : AQI<0b01100110, 0b0011, "uqasx">; -def UQSAX : AQI<0b01100110, 0b0101, "uqsax">; -def UQSUB16 : AQI<0b01100110, 0b0111, "uqsub16">; -def UQSUB8 : AQI<0b01100110, 0b1111, "uqsub8">; +// Saturating add/subtract -- for 
disassembly only + +def QADD : AAI<0b00010000, 0b0101, "qadd">; +def QADD16 : AAI<0b01100010, 0b0001, "qadd16">; +def QADD8 : AAI<0b01100010, 0b1001, "qadd8">; +def QASX : AAI<0b01100010, 0b0011, "qasx">; +def QDADD : AAI<0b00010100, 0b0101, "qdadd">; +def QDSUB : AAI<0b00010110, 0b0101, "qdsub">; +def QSAX : AAI<0b01100010, 0b0101, "qsax">; +def QSUB : AAI<0b00010010, 0b0101, "qsub">; +def QSUB16 : AAI<0b01100010, 0b0111, "qsub16">; +def QSUB8 : AAI<0b01100010, 0b1111, "qsub8">; +def UQADD16 : AAI<0b01100110, 0b0001, "uqadd16">; +def UQADD8 : AAI<0b01100110, 0b1001, "uqadd8">; +def UQASX : AAI<0b01100110, 0b0011, "uqasx">; +def UQSAX : AAI<0b01100110, 0b0101, "uqsax">; +def UQSUB16 : AAI<0b01100110, 0b0111, "uqsub16">; +def UQSUB8 : AAI<0b01100110, 0b1111, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def SASX : AAI<0b01100001, 0b0011, "sasx">; +def SADD16 : AAI<0b01100001, 0b0001, "sadd16">; +def SADD8 : AAI<0b01100001, 0b1001, "sadd8">; +def SSAX : AAI<0b01100001, 0b0101, "ssax">; +def SSUB16 : AAI<0b01100001, 0b0111, "ssub16">; +def SSUB8 : AAI<0b01100001, 0b1111, "ssub8">; +def UASX : AAI<0b01100101, 0b0011, "uasx">; +def UADD16 : AAI<0b01100101, 0b0001, "uadd16">; +def UADD8 : AAI<0b01100101, 0b1001, "uadd8">; +def USAX : AAI<0b01100101, 0b0101, "usax">; +def USUB16 : AAI<0b01100101, 0b0111, "usub16">; +def USUB8 : AAI<0b01100101, 0b1111, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def SHASX : AAI<0b01100011, 0b0011, "shasx">; +def SHADD16 : AAI<0b01100011, 0b0001, "shadd16">; +def SHADD8 : AAI<0b01100011, 0b1001, "shadd8">; +def SHSAX : AAI<0b01100011, 0b0101, "shsax">; +def SHSUB16 : AAI<0b01100011, 0b0111, "shsub16">; +def SHSUB8 : AAI<0b01100011, 0b1111, "shsub8">; +def UHASX : AAI<0b01100111, 0b0011, "uhasx">; +def UHADD16 : AAI<0b01100111, 0b0001, "uhadd16">; +def UHADD8 : AAI<0b01100111, 0b1001, "uhadd8">; +def UHSAX : AAI<0b01100111, 0b0101, "uhsax">; +def UHSUB16 : AAI<0b01100111, 0b0111, 
"uhsub16">; +def UHSUB8 : AAI<0b01100111, 0b1111, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only + +def USAD8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), + MulFrm /* for convenience */, NoItinerary, "usad8", + "\t$dst, $a, $b", []>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01111000; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0001; +} +def USADA8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + MulFrm /* for convenience */, NoItinerary, "usada8", + "\t$dst, $a, $b, $acc", []>, + Requires<[IsARM, HasV6]> { + let Inst{27-20} = 0b01111000; + let Inst{7-4} = 0b0001; +} + +// Signed/Unsigned saturate -- for disassembly only + +def SSATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110101; + let Inst{6-4} = 0b001; +} + +def SSATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110101; + let Inst{6-4} = 0b101; +} + +def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, + NoItinerary, "ssat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b01101010; + let Inst{7-4} = 0b0011; +} + +def USATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110111; + let Inst{6-4} = 0b001; +} + +def USATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), + DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-21} = 0b0110111; + let Inst{6-4} = 0b101; +} + +def USAT16 : 
AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, + NoItinerary, "usat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{27-20} = 0b01101110; + let Inst{7-4} = 0b0011; +} //===----------------------------------------------------------------------===// // Bitwise Instructions. @@ -1408,6 +1677,17 @@ def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), let Inst{6-0} = 0b0011111; } +// A8.6.18 BFI - Bitfield insert (Encoding A1) +// Added for disassembler with the pattern field purposely left blank. +def BFI : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfi", "\t$dst, $src, $imm", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-21} = 0b0111110; + let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 +} + def MVNr : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, "mvn", "\t$dst, $src", [(set GPR:$dst, (not GPR:$src))]>, UnaryDP { @@ -1420,7 +1700,7 @@ def MVNs : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, let Inst{25} = 0; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, +def MVNi : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, IIC_iMOVi, "mvn", "\t$dst, $imm", [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP { let Inst{25} = 1; @@ -1483,6 +1763,14 @@ def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), let Inst{15-12} = 0b1111; } +def SMMULR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMUL32, "smmulr", "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0011; // R = 1 + let Inst{15-12} = 0b1111; +} + def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c", [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), 
GPR:$c))]>, @@ -1490,6 +1778,12 @@ def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), let Inst{7-4} = 0b0001; } +def SMMLAR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmlar", "\t$dst, $a, $b, $c", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b0011; // R = 1 +} def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c", @@ -1498,6 +1792,13 @@ def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), let Inst{7-4} = 0b1101; } +def SMMLSR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), + IIC_iMAC32, "smmlsr", "\t$dst, $a, $b, $c", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV6]> { + let Inst{7-4} = 0b1111; // R = 1 +} + multiclass AI_smul<string opc, PatFrag opnode> { def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b", @@ -1569,7 +1870,7 @@ multiclass AI_smla<string opc, PatFrag opnode> { def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16)))))]>, + (sra GPR:$b, (i32 16)))))]>, Requires<[IsARM, HasV5TE]> { let Inst{5} = 0; let Inst{6} = 1; @@ -1648,7 +1949,54 @@ def SMLALTT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), let Inst{6} = 1; } -// TODO: Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD +// Helper class for AI_smld -- for disassembly only +class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> { + let Inst{4} = 1; + let Inst{5} = swap; + let Inst{6} = sub; + let Inst{7} = 0; + let Inst{21-20} = 0b00; + let 
Inst{22} = long; + let Inst{27-23} = 0b01110; +} + +multiclass AI_smld<bit sub, string opc> { + + def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b, $acc">; + + def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), + NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b, $acc">; + + def LD : AMulDualI<1, sub, 0, (outs GPR:$ldst,GPR:$hdst), (ins GPR:$a,GPR:$b), + NoItinerary, !strconcat(opc, "ld"), "\t$ldst, $hdst, $a, $b">; + + def LDX : AMulDualI<1, sub, 1, (outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b), + NoItinerary, !strconcat(opc, "ldx"),"\t$ldst, $hdst, $a, $b">; + +} + +defm SMLA : AI_smld<0, "smla">; +defm SMLS : AI_smld<1, "smls">; + +multiclass AI_sdml<bit sub, string opc> { + + def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b"> { + let Inst{15-12} = 0b1111; + } + + def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b"> { + let Inst{15-12} = 0b1111; + } + +} + +defm SMUA : AI_sdml<0, "smua">; +defm SMUS : AI_sdml<1, "smus">; //===----------------------------------------------------------------------===// // Misc. Arithmetic Instructions. 
@@ -1706,7 +2054,7 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, LSL $shamt", + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), (and (shl GPR:$src2, (i32 imm:$shamt)), 0xFFFF0000)))]>, @@ -1723,7 +2071,7 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, ASR $shamt", + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), (and (sra GPR:$src2, imm16_31:$shamt), 0xFFFF)))]>, Requires<[IsARM, HasV6]> { @@ -1769,7 +2117,7 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use -// a two-value operand where a dag node expects two operands. :( +// a two-value operand where a dag node expects two operands. :( def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm, IIC_iCMOVr, "mov", "\t$dst, $true", [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, @@ -1807,6 +2155,7 @@ def Int_MemBarrierV7 : AInoP<(outs), (ins), Requires<[IsARM, HasV7]> { let Inst{31-4} = 0xf57ff05; // FIXME: add support for options other than a full system DMB + // See DMB disassembly-only variants below. let Inst{3-0} = 0b1111; } @@ -1817,6 +2166,7 @@ def Int_SyncBarrierV7 : AInoP<(outs), (ins), Requires<[IsARM, HasV7]> { let Inst{31-4} = 0xf57ff04; // FIXME: add support for options other than a full system DSB + // See DSB disassembly-only variants below. 
let Inst{3-0} = 0b1111; } @@ -1839,6 +2189,64 @@ def Int_SyncBarrierV6 : AInoP<(outs), (ins GPR:$zero), } } +// Helper class for multiclass MemB -- for disassembly only +class AMBI<string opc, string asm> + : AInoP<(outs), (ins), MiscFrm, NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{31-20} = 0xf57; +} + +multiclass MemB<bits<4> op7_4, string opc> { + + def st : AMBI<opc, "\tst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1110; + } + + def ish : AMBI<opc, "\tish"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1011; + } + + def ishst : AMBI<opc, "\tishst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1010; + } + + def nsh : AMBI<opc, "\tnsh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0111; + } + + def nshst : AMBI<opc, "\tnshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0110; + } + + def osh : AMBI<opc, "\tosh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0011; + } + + def oshst : AMBI<opc, "\toshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0010; + } +} + +// These DMB variants are for disassembly only. +defm DMB : MemB<0b0101, "dmb">; + +// These DSB variants are for disassembly only. +defm DSB : MemB<0b0100, "dsb">; + +// ISB has only full system option -- for disassembly only +def ISBsy : AMBI<"isb", ""> { + let Inst{7-4} = 0b0110; + let Inst{3-0} = 0b1111; +} + let usesCustomInserter = 1 in { let Uses = [CPSR] in { def ATOMIC_LOAD_ADD_I8 : PseudoInst< @@ -1978,6 +2386,14 @@ def STREXD : AIstrex<0b01, (outs GPR:$success), []>; } +// Clear-Exclusive is for disassembly only. +def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{31-20} = 0xf57; + let Inst{7-4} = 0b0001; +} + // SWP/SWPB are deprecated in V6/V7 and for disassembly only. 
let mayLoad = 1 in { def SWP : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary, @@ -2049,7 +2465,7 @@ let Defs = // Two piece so_imms. let isReMaterializable = 1 in -def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), +def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), Pseudo, IIC_iMOVi, "mov", "\t$dst, $src", [(set GPR:$dst, so_imm2part:$src)]>, @@ -2074,7 +2490,7 @@ def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS), // FIXME: Remove this when we can do generalized remat. let isReMaterializable = 1 in def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi, - "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", + "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", [(set GPR:$dst, (i32 imm:$src))]>, Requires<[IsARM, HasV6T2]>; @@ -2201,6 +2617,102 @@ def CDP2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, let Inst{4} = 0; } +class ACI<dag oops, dag iops, string opc, string asm> + : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, NoItinerary, + opc, asm, "", [/* For disassembly only; pattern left blank */]> { + let Inst{27-25} = 0b110; +} + +multiclass LdStCop<bits<4> op31_28, bit load, string opc> { + + def _OFFSET : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "\tp$cop, cr$CRd, $addr"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _PRE : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "\tp$cop, cr$CRd, $addr!"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 1; // W = 1 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _POST : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), + opc, "\tp$cop, cr$CRd, [$base], $offset"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{21} = 1; // W = 1 + 
let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def _OPTION : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, i32imm:$option), + opc, "\tp$cop, cr$CRd, [$base], $option"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 0; // D = 0 + let Inst{20} = load; + } + + def L_OFFSET : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "l\tp$cop, cr$CRd, $addr"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_PRE : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), + opc, "l\tp$cop, cr$CRd, $addr!"> { + let Inst{31-28} = op31_28; + let Inst{24} = 1; // P = 1 + let Inst{21} = 1; // W = 1 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_POST : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), + opc, "l\tp$cop, cr$CRd, [$base], $offset"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{21} = 1; // W = 1 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } + + def L_OPTION : ACI<(outs), + (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, nohash_imm:$option), + opc, "l\tp$cop, cr$CRd, [$base], $option"> { + let Inst{31-28} = op31_28; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{21} = 0; // W = 0 + let Inst{22} = 1; // D = 1 + let Inst{20} = load; + } +} + +defm LDC : LdStCop<{?,?,?,?}, 1, "ldc">; +defm LDC2 : LdStCop<0b1111, 1, "ldc2">; +defm STC : LdStCop<{?,?,?,?}, 0, "stc">; +defm STC2 : LdStCop<0b1111, 0, "stc2">; + def MCR : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1, GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2), NoItinerary, "mcr", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2", @@ -2282,14 +2794,28 @@ def MRSsys : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary,"mrs","\t$dst, spsr", } // 
FIXME: mask is ignored for the time being. -def MSR : ABI<0b0001,(outs),(ins GPR:$src), NoItinerary, "mrs", "\tcpsr, $src", +def MSR : ABI<0b0001,(outs),(ins GPR:$src), NoItinerary, "msr", "\tcpsr, $src", [/* For disassembly only; pattern left blank */]> { let Inst{23-20} = 0b0010; let Inst{7-4} = 0b0000; } // FIXME: mask is ignored for the time being. -def MSRsys : ABI<0b0001,(outs),(ins GPR:$src),NoItinerary,"mrs","\tspsr, $src", +def MSRi : ABI<0b0011,(outs),(ins so_imm:$a), NoItinerary, "msr", "\tcpsr, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0010; + let Inst{7-4} = 0b0000; +} + +// FIXME: mask is ignored for the time being. +def MSRsys : ABI<0b0001,(outs),(ins GPR:$src),NoItinerary,"msr","\tspsr, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{23-20} = 0b0110; + let Inst{7-4} = 0b0000; +} + +// FIXME: mask is ignored for the time being. +def MSRsysi : ABI<0b0011,(outs),(ins so_imm:$a),NoItinerary,"msr","\tspsr, $a", [/* For disassembly only; pattern left blank */]> { let Inst{23-20} = 0b0110; let Inst{7-4} = 0b0000; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index e2be7ba..3aa0810 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -83,11 +83,17 @@ def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; +def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; +def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; + 
//===----------------------------------------------------------------------===// // NEON operand definitions //===----------------------------------------------------------------------===// @@ -123,9 +129,7 @@ def h64imm : Operand<i64> { let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { def VLDMD : NI<(outs), (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), - IIC_fpLoadm, - "vldm", "${addr:submode} ${addr:base}, $dst1", - []> { + IIC_fpLoadm, "vldm", "${addr:submode} ${addr:base}, $dst1", []> { let Inst{27-25} = 0b110; let Inst{20} = 1; let Inst{11-9} = 0b101; @@ -133,9 +137,7 @@ def VLDMD : NI<(outs), def VLDMS : NI<(outs), (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), - IIC_fpLoadm, - "vldm", "${addr:submode} ${addr:base}, $dst1", - []> { + IIC_fpLoadm, "vldm", "${addr:submode} ${addr:base}, $dst1", []> { let Inst{27-25} = 0b110; let Inst{20} = 1; let Inst{11-9} = 0b101; @@ -144,10 +146,9 @@ def VLDMS : NI<(outs), */ // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), - IIC_fpLoadm, - "vldmia", "$addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { +def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, + "vldmia", "$addr, ${dst:dregpair}", + [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { let Inst{27-25} = 0b110; let Inst{24} = 0; // P bit let Inst{23} = 1; // U bit @@ -156,10 +157,9 @@ def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), } // Use vstmia to store a Q register as a D register pair. 
-def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), - IIC_fpStorem, - "vstmia", "$addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), addrmode4:$addr)]> { +def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, + "vstmia", "$addr, ${src:dregpair}", + [(store (v2f64 QPR:$src), addrmode4:$addr)]> { let Inst{27-25} = 0b110; let Inst{24} = 0; // P bit let Inst{23} = 1; // U bit @@ -191,6 +191,29 @@ def VLD1q32 : VLD1Q<0b1000, "vld1", "32", v4i32, int_arm_neon_vld1>; def VLD1qf : VLD1Q<0b1000, "vld1", "32", v4f32, int_arm_neon_vld1>; def VLD1q64 : VLD1Q<0b1100, "vld1", "64", v2i64, int_arm_neon_vld1>; +// These (dreg triple/quadruple) are for disassembly only. +class VLD1D3<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0, 0b10, 0b0110, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$addr), IIC_VLD1, OpcodeStr, Dt, + "\\{$dst1, $dst2, $dst3\\}, $addr", "", + [/* For disassembly only; pattern left blank */]>; +class VLD1D4<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, OpcodeStr, Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", + [/* For disassembly only; pattern left blank */]>; + +def VLD1d8T : VLD1D3<0b0000, "vld1", "8">; +def VLD1d16T : VLD1D3<0b0100, "vld1", "16">; +def VLD1d32T : VLD1D3<0b1000, "vld1", "32">; +//def VLD1d64T : VLD1D3<0b1100, "vld1", "64">; + +def VLD1d8Q : VLD1D4<0b0000, "vld1", "8">; +def VLD1d16Q : VLD1D4<0b0100, "vld1", "16">; +def VLD1d32Q : VLD1D4<0b1000, "vld1", "32">; +//def VLD1d64Q : VLD1D4<0b1100, "vld1", "64">; + + let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { // VLD2 : Vector Load (multiple 2-element structures) @@ -216,6 +239,16 @@ def VLD2q8 : VLD2Q<0b0000, "vld2", "8">; def VLD2q16 : VLD2Q<0b0100, "vld2", "16">; def VLD2q32 : VLD2Q<0b1000, "vld2", "32">; +// These (double-spaced dreg pair) are for disassembly only. 
+class VLD2Ddbl<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0,0b10,0b1001,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD2, + OpcodeStr, Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + +def VLD2d8D : VLD2Ddbl<0b0000, "vld2", "8">; +def VLD2d16D : VLD2Ddbl<0b0100, "vld2", "16">; +def VLD2d32D : VLD2Ddbl<0b1000, "vld2", "32">; + // VLD3 : Vector Load (multiple 3-element structures) class VLD3D<bits<4> op7_4, string OpcodeStr, string Dt> : NLdSt<0,0b10,0b0100,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), @@ -285,105 +318,64 @@ def VLD4q32b : VLD4WB<0b1000, "vld4", "32">; class VLD2LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b10,op11_8,{?,?,?,?}, (outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), - IIC_VLD2, - OpcodeStr, Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", + IIC_VLD2, OpcodeStr, Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2", []>; // vld2 to single-spaced registers. def VLD2LNd8 : VLD2LN<0b0001, "vld2", "8">; -def VLD2LNd16 : VLD2LN<0b0101, "vld2", "16"> { - let Inst{5} = 0; -} -def VLD2LNd32 : VLD2LN<0b1001, "vld2", "32"> { - let Inst{6} = 0; -} +def VLD2LNd16 : VLD2LN<0b0101, "vld2", "16"> { let Inst{5} = 0; } +def VLD2LNd32 : VLD2LN<0b1001, "vld2", "32"> { let Inst{6} = 0; } // vld2 to double-spaced even registers. -def VLD2LNq16a: VLD2LN<0b0101, "vld2", "16"> { - let Inst{5} = 1; -} -def VLD2LNq32a: VLD2LN<0b1001, "vld2", "32"> { - let Inst{6} = 1; -} +def VLD2LNq16a: VLD2LN<0b0101, "vld2", "16"> { let Inst{5} = 1; } +def VLD2LNq32a: VLD2LN<0b1001, "vld2", "32"> { let Inst{6} = 1; } // vld2 to double-spaced odd registers. 
-def VLD2LNq16b: VLD2LN<0b0101, "vld2", "16"> { - let Inst{5} = 1; -} -def VLD2LNq32b: VLD2LN<0b1001, "vld2", "32"> { - let Inst{6} = 1; -} +def VLD2LNq16b: VLD2LN<0b0101, "vld2", "16"> { let Inst{5} = 1; } +def VLD2LNq32b: VLD2LN<0b1001, "vld2", "32"> { let Inst{6} = 1; } // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b10,op11_8,{?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - nohash_imm:$lane), IIC_VLD3, - OpcodeStr, Dt, + nohash_imm:$lane), IIC_VLD3, OpcodeStr, Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; // vld3 to single-spaced registers. -def VLD3LNd8 : VLD3LN<0b0010, "vld3", "8"> { - let Inst{4} = 0; -} -def VLD3LNd16 : VLD3LN<0b0110, "vld3", "16"> { - let Inst{5-4} = 0b00; -} -def VLD3LNd32 : VLD3LN<0b1010, "vld3", "32"> { - let Inst{6-4} = 0b000; -} +def VLD3LNd8 : VLD3LN<0b0010, "vld3", "8"> { let Inst{4} = 0; } +def VLD3LNd16 : VLD3LN<0b0110, "vld3", "16"> { let Inst{5-4} = 0b00; } +def VLD3LNd32 : VLD3LN<0b1010, "vld3", "32"> { let Inst{6-4} = 0b000; } // vld3 to double-spaced even registers. -def VLD3LNq16a: VLD3LN<0b0110, "vld3", "16"> { - let Inst{5-4} = 0b10; -} -def VLD3LNq32a: VLD3LN<0b1010, "vld3", "32"> { - let Inst{6-4} = 0b100; -} +def VLD3LNq16a: VLD3LN<0b0110, "vld3", "16"> { let Inst{5-4} = 0b10; } +def VLD3LNq32a: VLD3LN<0b1010, "vld3", "32"> { let Inst{6-4} = 0b100; } // vld3 to double-spaced odd registers. 
-def VLD3LNq16b: VLD3LN<0b0110, "vld3", "16"> { - let Inst{5-4} = 0b10; -} -def VLD3LNq32b: VLD3LN<0b1010, "vld3", "32"> { - let Inst{6-4} = 0b100; -} +def VLD3LNq16b: VLD3LN<0b0110, "vld3", "16"> { let Inst{5-4} = 0b10; } +def VLD3LNq32b: VLD3LN<0b1010, "vld3", "32"> { let Inst{6-4} = 0b100; } // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b10,op11_8,{?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, - nohash_imm:$lane), IIC_VLD4, - OpcodeStr, Dt, + nohash_imm:$lane), IIC_VLD4, OpcodeStr, Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; // vld4 to single-spaced registers. def VLD4LNd8 : VLD4LN<0b0011, "vld4", "8">; -def VLD4LNd16 : VLD4LN<0b0111, "vld4", "16"> { - let Inst{5} = 0; -} -def VLD4LNd32 : VLD4LN<0b1011, "vld4", "32"> { - let Inst{6} = 0; -} +def VLD4LNd16 : VLD4LN<0b0111, "vld4", "16"> { let Inst{5} = 0; } +def VLD4LNd32 : VLD4LN<0b1011, "vld4", "32"> { let Inst{6} = 0; } // vld4 to double-spaced even registers. -def VLD4LNq16a: VLD4LN<0b0111, "vld4", "16"> { - let Inst{5} = 1; -} -def VLD4LNq32a: VLD4LN<0b1011, "vld4", "32"> { - let Inst{6} = 1; -} +def VLD4LNq16a: VLD4LN<0b0111, "vld4", "16"> { let Inst{5} = 1; } +def VLD4LNq32a: VLD4LN<0b1011, "vld4", "32"> { let Inst{6} = 1; } // vld4 to double-spaced odd registers. 
-def VLD4LNq16b: VLD4LN<0b0111, "vld4", "16"> { - let Inst{5} = 1; -} -def VLD4LNq32b: VLD4LN<0b1011, "vld4", "32"> { - let Inst{6} = 1; -} +def VLD4LNq16b: VLD4LN<0b0111, "vld4", "16"> { let Inst{5} = 1; } +def VLD4LNq32b: VLD4LN<0b1011, "vld4", "32"> { let Inst{6} = 1; } // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) @@ -418,6 +410,31 @@ def VST1qf : VST1Q<0b1000, "vst1", "32", v4f32, int_arm_neon_vst1>; def VST1q64 : VST1Q<0b1100, "vst1", "64", v2i64, int_arm_neon_vst1>; } // hasExtraSrcRegAllocReq +// These (dreg triple/quadruple) are for disassembly only. +class VST1D3<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, + OpcodeStr, Dt, + "\\{$src1, $src2, $src3\\}, $addr", "", + [/* For disassembly only; pattern left blank */]>; +class VST1D4<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, OpcodeStr, Dt, + "\\{$src1, $src2, $src3, $src4\\}, $addr", "", + [/* For disassembly only; pattern left blank */]>; + +def VST1d8T : VST1D3<0b0000, "vst1", "8">; +def VST1d16T : VST1D3<0b0100, "vst1", "16">; +def VST1d32T : VST1D3<0b1000, "vst1", "32">; +//def VST1d64T : VST1D3<0b1100, "vst1", "64">; + +def VST1d8Q : VST1D4<0b0000, "vst1", "8">; +def VST1d16Q : VST1D4<0b0100, "vst1", "16">; +def VST1d32Q : VST1D4<0b1000, "vst1", "32">; +//def VST1d64Q : VST1D4<0b1100, "vst1", "64">; + + let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { // VST2 : Vector Store (multiple 2-element structures) @@ -428,8 +445,7 @@ class VST2D<bits<4> op7_4, string OpcodeStr, string Dt> class VST2Q<bits<4> op7_4, string OpcodeStr, string Dt> : NLdSt<0,0b00,0b0011,op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, - OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, 
$addr", + IIC_VST, OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", []>; def VST2d8 : VST2D<0b0000, "vst2", "8">; @@ -443,6 +459,16 @@ def VST2q8 : VST2Q<0b0000, "vst2", "8">; def VST2q16 : VST2Q<0b0100, "vst2", "16">; def VST2q32 : VST2Q<0b1000, "vst2", "32">; +// These (double-spaced dreg pair) are for disassembly only. +class VST2Ddbl<bits<4> op7_4, string OpcodeStr, string Dt> + : NLdSt<0, 0b00, 0b1001, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + OpcodeStr, Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST2d8D : VST2Ddbl<0b0000, "vst2", "8">; +def VST2d16D : VST2Ddbl<0b0100, "vst2", "16">; +def VST2d32D : VST2Ddbl<0b1000, "vst2", "32">; + // VST3 : Vector Store (multiple 3-element structures) class VST3D<bits<4> op7_4, string OpcodeStr, string Dt> : NLdSt<0,0b00,0b0100,op7_4, (outs), @@ -476,14 +502,12 @@ def VST3q32b : VST3WB<0b1000, "vst3", "32">; class VST4D<bits<4> op7_4, string OpcodeStr, string Dt> : NLdSt<0,0b00,0b0000,op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, - OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", + IIC_VST, OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", []>; class VST4WB<bits<4> op7_4, string OpcodeStr, string Dt> : NLdSt<0,0b00,0b0001,op7_4, (outs GPR:$wb), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, - OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", + IIC_VST, OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "$addr.addr = $wb", []>; def VST4d8 : VST4D<0b0000, "vst4", "8">; @@ -511,104 +535,63 @@ def VST4q32b : VST4WB<0b1000, "vst4", "32">; // VST2LN : Vector Store (single 2-element structure from one lane) class VST2LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), - IIC_VST, - OpcodeStr, Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", - "", []>; + (ins 
addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VST, OpcodeStr, Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", + "", []>; // vst2 to single-spaced registers. def VST2LNd8 : VST2LN<0b0001, "vst2", "8">; -def VST2LNd16 : VST2LN<0b0101, "vst2", "16"> { - let Inst{5} = 0; -} -def VST2LNd32 : VST2LN<0b1001, "vst2", "32"> { - let Inst{6} = 0; -} +def VST2LNd16 : VST2LN<0b0101, "vst2", "16"> { let Inst{5} = 0; } +def VST2LNd32 : VST2LN<0b1001, "vst2", "32"> { let Inst{6} = 0; } // vst2 to double-spaced even registers. -def VST2LNq16a: VST2LN<0b0101, "vst2", "16"> { - let Inst{5} = 1; -} -def VST2LNq32a: VST2LN<0b1001, "vst2", "32"> { - let Inst{6} = 1; -} +def VST2LNq16a: VST2LN<0b0101, "vst2", "16"> { let Inst{5} = 1; } +def VST2LNq32a: VST2LN<0b1001, "vst2", "32"> { let Inst{6} = 1; } // vst2 to double-spaced odd registers. -def VST2LNq16b: VST2LN<0b0101, "vst2", "16"> { - let Inst{5} = 1; -} -def VST2LNq32b: VST2LN<0b1001, "vst2", "32"> { - let Inst{6} = 1; -} +def VST2LNq16b: VST2LN<0b0101, "vst2", "16"> { let Inst{5} = 1; } +def VST2LNq32b: VST2LN<0b1001, "vst2", "32"> { let Inst{6} = 1; } // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - nohash_imm:$lane), IIC_VST, - OpcodeStr, Dt, - "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST, OpcodeStr, Dt, + "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; // vst3 to single-spaced registers. 
-def VST3LNd8 : VST3LN<0b0010, "vst3", "8"> { - let Inst{4} = 0; -} -def VST3LNd16 : VST3LN<0b0110, "vst3", "16"> { - let Inst{5-4} = 0b00; -} -def VST3LNd32 : VST3LN<0b1010, "vst3", "32"> { - let Inst{6-4} = 0b000; -} +def VST3LNd8 : VST3LN<0b0010, "vst3", "8"> { let Inst{4} = 0; } +def VST3LNd16 : VST3LN<0b0110, "vst3", "16"> { let Inst{5-4} = 0b00; } +def VST3LNd32 : VST3LN<0b1010, "vst3", "32"> { let Inst{6-4} = 0b000; } // vst3 to double-spaced even registers. -def VST3LNq16a: VST3LN<0b0110, "vst3", "16"> { - let Inst{5-4} = 0b10; -} -def VST3LNq32a: VST3LN<0b1010, "vst3", "32"> { - let Inst{6-4} = 0b100; -} +def VST3LNq16a: VST3LN<0b0110, "vst3", "16"> { let Inst{5-4} = 0b10; } +def VST3LNq32a: VST3LN<0b1010, "vst3", "32"> { let Inst{6-4} = 0b100; } // vst3 to double-spaced odd registers. -def VST3LNq16b: VST3LN<0b0110, "vst3", "16"> { - let Inst{5-4} = 0b10; -} -def VST3LNq32b: VST3LN<0b1010, "vst3", "32"> { - let Inst{6-4} = 0b100; -} +def VST3LNq16b: VST3LN<0b0110, "vst3", "16"> { let Inst{5-4} = 0b10; } +def VST3LNq32b: VST3LN<0b1010, "vst3", "32"> { let Inst{6-4} = 0b100; } // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN<bits<4> op11_8, string OpcodeStr, string Dt> : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, - nohash_imm:$lane), IIC_VST, - OpcodeStr, Dt, + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST, OpcodeStr, Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", - "", []>; + "", []>; // vst4 to single-spaced registers. def VST4LNd8 : VST4LN<0b0011, "vst4", "8">; -def VST4LNd16 : VST4LN<0b0111, "vst4", "16"> { - let Inst{5} = 0; -} -def VST4LNd32 : VST4LN<0b1011, "vst4", "32"> { - let Inst{6} = 0; -} +def VST4LNd16 : VST4LN<0b0111, "vst4", "16"> { let Inst{5} = 0; } +def VST4LNd32 : VST4LN<0b1011, "vst4", "32"> { let Inst{6} = 0; } // vst4 to double-spaced even registers. 
-def VST4LNq16a: VST4LN<0b0111, "vst4", "16"> { - let Inst{5} = 1; -} -def VST4LNq32a: VST4LN<0b1011, "vst4", "32"> { - let Inst{6} = 1; -} +def VST4LNq16a: VST4LN<0b0111, "vst4", "16"> { let Inst{5} = 1; } +def VST4LNq32a: VST4LN<0b1011, "vst4", "32"> { let Inst{6} = 1; } // vst4 to double-spaced odd registers. -def VST4LNq16b: VST4LN<0b0111, "vst4", "16"> { - let Inst{5} = 1; -} -def VST4LNq32b: VST4LN<0b1011, "vst4", "32"> { - let Inst{6} = 1; -} +def VST4LNq16b: VST4LN<0b0111, "vst4", "16"> { let Inst{5} = 1; } +def VST4LNq32b: VST4LN<0b1011, "vst4", "32"> { let Inst{6} = 1; } } // mayStore = 1, hasExtraSrcRegAllocReq = 1 @@ -656,34 +639,26 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{ // Instruction Classes //===----------------------------------------------------------------------===// -// Basic 2-register operations, both double- and quad-register. +// Basic 2-register operations: single-, double- and quad-register. +class N2VS<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>; class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> + bits<2> op17_16, bits<5> op11_7, bit 
op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; -// Basic 2-register operations, scalar single-precision. -class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), - IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>; - -class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst> - : NEONFPPat<(ResTy (OpNode SPR:$a)), - (EXTRACT_SUBREG - (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)), - arm_ssubreg_0)>; - // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, @@ -700,21 +675,6 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; -// Basic 2-register intrinsics, scalar single-precision -class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), itin, - OpcodeStr, Dt, "$dst, $src", "", []>; - -class N2VDIntsPat<SDNode OpNode, NeonI Inst> - : NEONFPPat<(f32 (OpNode SPR:$a)), - (EXTRACT_SUBREG - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)), - arm_ssubreg_0)>; - // Narrow 2-register intrinsics. 
class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, @@ -742,15 +702,22 @@ class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, InstrItinClass itin, string OpcodeStr, string Dt> : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2), - (ins QPR:$src1, QPR:$src2), itin, - OpcodeStr, Dt, "$dst1, $dst2", + (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst1, $dst2", "$src1 = $dst1, $src2 = $dst2", []>; -// Basic 3-register operations, both double- and quad-register. +// Basic 3-register operations: single-, double- and quad-register. +class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { + let isCommutable = Commutable; +} + class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, - SDNode OpNode, bit Commutable> + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", @@ -763,9 +730,9 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, - OpcodeStr, "$dst, $src1, $src2", "", - [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + OpcodeStr, "$dst, $src1, $src2", "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy 
DPR:$src2))))]>{ let isCommutable = Commutable; } class N3VDSL<bits<2> op21_20, bits<4> op11_8, @@ -776,27 +743,23 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_VFP2:$src2), imm:$lane)))))]>{ let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16D, - OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + IIC_VMULi16D, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { let isCommutable = 0; } class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, - SDNode OpNode, bit Commutable> + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", @@ -805,12 +768,11 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, } class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, - ValueType ResTy, ValueType OpTy, - SDNode OpNode, bit Commutable> + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, - OpcodeStr, "$dst, $src1, $src2", "", - [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + OpcodeStr, "$dst, $src1, $src2", "", + [(set QPR:$dst, (ResTy 
(OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ let isCommutable = Commutable; } class N3VQSL<bits<2> op21_20, bits<4> op11_8, @@ -825,13 +787,11 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, imm:$lane)))))]> { let isCommutable = 0; } -class N3VQSL16<bits<2> op21_20, bits<4> op11_8, - string OpcodeStr, string Dt, +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16Q, - OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + IIC_VMULi16Q, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -839,27 +799,10 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, let isCommutable = 0; } -// Basic 3-register operations, scalar single-precision -class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode OpNode, bit Commutable> - : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, - OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { - let isCommutable = Commutable; -} -class N3VDsPat<SDNode OpNode, NeonI Inst> - : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), - (EXTRACT_SUBREG - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)), - arm_ssubreg_0)>; - // Basic 3-register intrinsics, both double- and quad-register. 
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, - Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", @@ -891,8 +834,7 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, - Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", @@ -924,7 +866,15 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, let isCommutable = 0; } -// Multiply-Add/Sub operations, both double- and quad-register. +// Multiply-Add/Sub operations: single-, double- and quad-register. 
+class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; + class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> @@ -976,8 +926,8 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$src2, - (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3), - imm:$lane)))))))]>; + (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3), + imm:$lane)))))))]>; class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, @@ -989,25 +939,8 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (MulOp QPR:$src2, - (ResTy (NEONvduplane (OpTy DPR_8:$src3), - imm:$lane)))))))]>; - -// Multiply-Add/Sub operations, scalar single-precision -class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode OpNode> - : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, - OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; - -class N3VDMulOpsPat<SDNode MulNode, SDNode OpNode, NeonI Inst> - : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), - (EXTRACT_SUBREG - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, 
arm_ssubreg_0)), - arm_ssubreg_0)>; + (ResTy (NEONvduplane (OpTy DPR_8:$src3), + imm:$lane)))))))]>; // Neon 3-argument intrinsics, both double- and quad-register. // The destination register is also used as the first source operand register. @@ -1050,9 +983,9 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, (OpTy DPR:$src2), (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3), imm:$lane)))))]>; -class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, @@ -1063,7 +996,6 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass iti (OpTy (NEONvduplane (OpTy DPR_8:$src3), imm:$lane)))))]>; - // Narrowing 3-register intrinsics. 
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, @@ -1095,9 +1027,9 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), imm:$lane)))))]>; -class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", @@ -1249,6 +1181,45 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // S = single int (32 bit) elements // D = double int (64 bit) elements +// Neon 2-register vector operations -- for disassembly only. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, string opc, string Dt, + string asm> { + // 64-bit vector types. + def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", []>; + def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", []>; + def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", []>; + def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + opc, "f32", asm, "", []> { + let Inst{10} = 1; // overwrite F = 1 + } + + // 128-bit vector types. 
+ def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", []>; + def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", []>; + def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", []>; + def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + opc, "f32", asm, "", []> { + let Inst{10} = 1; // overwrite F = 1 + } +} + // Neon 3-register vector operations. // First with only element sizes of 8, 16 and 32 bits: @@ -1262,22 +1233,22 @@ multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, OpNode, Commutable>; def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, - OpcodeStr, !strconcat(Dt, "16"), - v4i16, v4i16, OpNode, Commutable>; + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, OpNode, Commutable>; def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32, - OpcodeStr, !strconcat(Dt, "32"), - v2i32, v2i32, OpNode, Commutable>; + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, OpNode, Commutable>; // 128-bit vector types. 
def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16, - OpcodeStr, !strconcat(Dt, "8"), - v16i8, v16i8, OpNode, Commutable>; + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, OpNode, Commutable>; def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16, - OpcodeStr, !strconcat(Dt, "16"), - v8i16, v8i16, OpNode, Commutable>; + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, OpNode, Commutable>; def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32, - OpcodeStr, !strconcat(Dt, "32"), - v4i32, v4i32, OpNode, Commutable>; + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, OpNode, Commutable>; } multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, string Dt, SDNode ShOp> { @@ -1372,7 +1343,7 @@ multiclass N3VIntSL_HS<bits<4> op11_8, def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>; def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16, - OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32, OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>; } @@ -1386,8 +1357,8 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16, - OpcodeStr, !strconcat(Dt, "8"), - v8i8, v8i8, IntOp, Commutable>; + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp, Commutable>; def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp, Commutable>; @@ -1402,11 +1373,11 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32, - OpcodeStr, !strconcat(Dt, "64"), - v1i64, v1i64, IntOp, Commutable>; + OpcodeStr, !strconcat(Dt, "64"), + v1i64, 
v1i64, IntOp, Commutable>; def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32, - OpcodeStr, !strconcat(Dt, "64"), - v2i64, v2i64, IntOp, Commutable>; + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp, Commutable>; } @@ -1511,9 +1482,11 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8, def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>; def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16, - OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, mul, ShOp>; + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, + mul, ShOp>; def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32, - OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, mul, ShOp>; + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, + mul, ShOp>; } // Neon 3-argument intrinsics, @@ -1522,19 +1495,19 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, Intrinsic IntOp> { // 64-bit vector types. def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D, - OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, - OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; + OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D, - OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; + OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; // 128-bit vector types. 
def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q, - OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; + OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q, - OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; + OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q, - OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; + OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; } @@ -1576,17 +1549,17 @@ multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - itinD, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; + itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - itinD, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; + itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; // 128-bit vector types. 
def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; + itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; + itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, - itinQ, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; + itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; } @@ -1846,29 +1819,31 @@ def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8", def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32", - v2f32, v2f32, fmul, 1>; + v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32", - v4f32, v4f32, fmul, 1>; -defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; -def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; -def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>; + v4f32, v4f32, fmul, 1>; +defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; +def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; +def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, + v2f32, fmul>; + def : Pat<(v8i16 (mul (v8i16 QPR:$src1), (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i16_reg imm:$lane))), + (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (mul (v4i32 QPR:$src1), (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 
(fmul (v4f32 QPR:$src1), (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), (v4f32 (VMULslfq (v4f32 QPR:$src1), (v2f32 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; // VQDMULH : Vector Saturating Doubling Multiply Returning High Half @@ -1883,14 +1858,14 @@ def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), imm:$lane)))), (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i16_reg imm:$lane))), + (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half @@ -1905,14 +1880,14 @@ def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), imm:$lane)))), (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), (v4i16 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i16_reg imm:$lane))), + (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), (v2i32 (EXTRACT_SUBREG QPR:$src2, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) @@ -1950,30 +1925,28 @@ def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", v4f32, v2f32, fmul, fadd>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), - (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), - (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), - (v8i16 QPR:$src2), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLAslv8i16 
(v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i16_reg imm:$lane))), + (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (add (v4i32 QPR:$src1), - (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), - (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), - (v4i32 QPR:$src2), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLAslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) @@ -2003,30 +1976,27 @@ def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", v4f32, v2f32, fmul, fsub>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), - (mul (v8i16 QPR:$src2), - (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), - (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), - (v8i16 QPR:$src2), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i16_reg imm:$lane))), + (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (sub (v4i32 QPR:$src1), - (mul (v4i32 QPR:$src2), - (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), - (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), - (v4i32 QPR:$src2), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), 
(v2i32 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), - (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), - (v4f32 (VMLSslfq (v4f32 QPR:$src1), - (v4f32 QPR:$src2), + (fmul (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, - (DSubReg_i32_reg imm:$lane))), + (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) @@ -2088,6 +2058,10 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, NEONvceq, 1>; def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; +// For disassembly only. +defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", + "$dst, $src, #0">; + // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>; @@ -2097,6 +2071,13 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; +// For disassembly only. +defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", + "$dst, $src, #0">; +// For disassembly only. +defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", + "$dst, $src, #0">; + // VCGT : Vector Compare Greater Than defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vcgt", "s", NEONvcgt, 0>; @@ -2106,6 +2087,13 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, NEONvcgt, 0>; def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; +// For disassembly only. 
+defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", + "$dst, $src, #0">; +// For disassembly only. +defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", + "$dst, $src, #0">; + // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; @@ -2247,9 +2235,9 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>; // Vector Maximum and Minimum. // VMAX : Vector Maximum -defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, +defm VMAXs : N3VInt_QHS<0,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>; -defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, +defm VMAXu : N3VInt_QHS<1,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32", v2f32, v2f32, int_arm_neon_vmaxs, 1>; @@ -2257,9 +2245,9 @@ def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>; // VMIN : Vector Minimum -defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, +defm VMINs : N3VInt_QHS<0,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>; -defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, +defm VMINu : N3VInt_QHS<1,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32", v2f32, v2f32, int_arm_neon_vmins, 1>; @@ -2401,16 +2389,17 @@ def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", v2i64, v2i32, NEONvshlli>; // VSHRN : Vector Shift Right and Narrow -defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; 
+defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", + NEONvshrn>; // VRSHL : Vector Rounding Shift defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts, 0>; + IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts,0>; defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu, 0>; + IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu,0>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; -defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", @@ -2418,14 +2407,14 @@ defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", // VQSHL : Vector Saturating Shift defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts, 0>; + IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts,0>; defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu, 0>; + IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu,0>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; -defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, 
IIC_VSHLi4D, "vqshlu", "s", NEONvqshlsu>; +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D, "vqshlu","s",NEONvqshlsu>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -2438,10 +2427,10 @@ defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, +defm VQRSHLs : N3VInt_QHSD<0,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqrshl", "s", int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, +defm VQRSHLu : N3VInt_QHSD<1,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; @@ -2508,7 +2497,7 @@ def VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>; def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>; // VNEG : Vector Negate (floating-point) -def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, +def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD, "vneg", "f32", "$dst, $src", "", [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>; @@ -2547,6 +2536,14 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiQ, "vcnt", "8", v16i8, v16i8, int_arm_neon_vcnt>; +// Vector Swap -- for disassembly only. +def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, + (outs DPR:$dst), (ins DPR:$src), NoItinerary, + "vswp", "$dst, $src", "", []>; +def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, + (outs QPR:$dst), (ins QPR:$src), NoItinerary, + "vswp", "$dst, $src", "", []>; + // Vector Move Operations. 
// VMOV : Vector Move (Register) @@ -2678,10 +2675,10 @@ def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane))>; def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2), - (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1), DPR_VFP2)), + (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)), (SSubReg_f32_reg imm:$src2))>; def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2), - (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1), QPR_VFP2)), + (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)), (SSubReg_f32_reg imm:$src2))>; //def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2), // (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; @@ -2849,11 +2846,13 @@ def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)), (INSERT_SUBREG QPR:$src, - (i64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))), + (i64 (EXTRACT_SUBREG QPR:$src, + (DSubReg_f64_reg imm:$lane))), (DSubReg_f64_other_reg imm:$lane))>; def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)), (INSERT_SUBREG QPR:$src, - (f64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))), + (f64 (EXTRACT_SUBREG QPR:$src, + (DSubReg_f64_reg imm:$lane))), (DSubReg_f64_other_reg imm:$lane))>; // VMOVN : Vector Narrowing Move @@ -3092,70 +3091,110 @@ def VTBX4 // NEON instructions for single-precision FP math //===----------------------------------------------------------------------===// +class N2VSPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst> + : NEONFPPat<(ResTy (OpNode SPR:$a)), + (EXTRACT_SUBREG (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), + SPR:$a, arm_ssubreg_0)), + arm_ssubreg_0)>; + +class N3VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), + (EXTRACT_SUBREG (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$b, 
arm_ssubreg_0)), + arm_ssubreg_0)>; + +class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$acc, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$b, arm_ssubreg_0)), + arm_ssubreg_0)>; + // These need separate instructions because they must use DPR_VFP2 register // class which have SPR sub-registers. // Vector Add Operations used for single-precision FP let neverHasSideEffects = 1 in -def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd", "f32", v2f32, v2f32, fadd,1>; -def : N3VDsPat<fadd, VADDfd_sfp>; +def VADDfd_sfp : N3VS<0,0,0b00,0b1101,0, "vadd", "f32", v2f32, v2f32, fadd, 1>; +def : N3VSPat<fadd, VADDfd_sfp>; // Vector Sub Operations used for single-precision FP let neverHasSideEffects = 1 in -def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub", "f32", v2f32, v2f32, fsub,0>; -def : N3VDsPat<fsub, VSUBfd_sfp>; +def VSUBfd_sfp : N3VS<0,0,0b10,0b1101,0, "vsub", "f32", v2f32, v2f32, fsub, 0>; +def : N3VSPat<fsub, VSUBfd_sfp>; // Vector Multiply Operations used for single-precision FP let neverHasSideEffects = 1 in -def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul", "f32", v2f32, v2f32, fmul,1>; -def : N3VDsPat<fmul, VMULfd_sfp>; +def VMULfd_sfp : N3VS<1,0,0b00,0b1101,1, "vmul", "f32", v2f32, v2f32, fmul, 1>; +def : N3VSPat<fmul, VMULfd_sfp>; // Vector Multiply-Accumulate/Subtract used for single-precision FP // vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so // we want to avoid them for now. e.g., alternating vmla/vadd instructions. 
//let neverHasSideEffects = 1 in -//def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32,fmul,fadd>; -//def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>; +//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", +// v2f32, fmul, fadd>; +//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; //let neverHasSideEffects = 1 in -//def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32,fmul,fsub>; -//def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>; +//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", +// v2f32, fmul, fsub>; +//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; // Vector Absolute used for single-precision FP let neverHasSideEffects = 1 in -def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, - IIC_VUNAD, "vabs", "f32", - v2f32, v2f32, int_arm_neon_vabs>; -def : N2VDIntsPat<fabs, VABSfd_sfp>; +def VABSfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01110, 0, 0, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, + "vabs", "f32", "$dst, $src", "", []>; +def : N2VSPat<fabs, f32, v2f32, VABSfd_sfp>; // Vector Negate used for single-precision FP let neverHasSideEffects = 1 in -def VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, - "vneg", "f32", "$dst, $src", "", []>; -def : N2VDIntsPat<fneg, VNEGf32d_sfp>; +def VNEGfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD, + "vneg", "f32", "$dst, $src", "", []>; +def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; + +// Vector Maximum used for single-precision FP +let neverHasSideEffects = 1 in +def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + "vmax", "f32", "$dst, $src1, $src2", "", []>; +def : N3VSPat<NEONfmax, VMAXfd_sfp>; + +// Vector Minimum used for single-precision FP +let neverHasSideEffects = 1 in +def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, 
(outs DPR_VFP2:$dst), + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + "vmin", "f32", "$dst, $src1, $src2", "", []>; +def : N3VSPat<NEONfmin, VMINfd_sfp>; // Vector Convert between single-precision FP and integer let neverHasSideEffects = 1 in -def VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", - v2i32, v2f32, fp_to_sint>; -def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>; +def VCVTf2sd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v2i32, v2f32, fp_to_sint>; +def : N2VSPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>; let neverHasSideEffects = 1 in -def VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", - v2i32, v2f32, fp_to_uint>; -def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>; +def VCVTf2ud_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v2i32, v2f32, fp_to_uint>; +def : N2VSPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>; let neverHasSideEffects = 1 in -def VCVTs2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", - v2f32, v2i32, sint_to_fp>; -def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>; +def VCVTs2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v2f32, v2i32, sint_to_fp>; +def : N2VSPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>; let neverHasSideEffects = 1 in -def VCVTu2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", - v2f32, v2i32, uint_to_fp>; -def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>; +def VCVTu2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v2f32, v2i32, uint_to_fp>; +def : N2VSPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 64142ad..1c77f27 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -120,7 +120,10 @@ def t_addrmode_sp : Operand<i32>, // 
Miscellaneous Instructions. // -let Defs = [SP], Uses = [SP] in { +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? +let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def tADJCALLSTACKUP : PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, "@ tADJCALLSTACKUP $amt1", @@ -132,6 +135,55 @@ PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>; } +def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00000000; +} + +def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00010000; +} + +def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00100000; +} + +def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b00110000; +} + +def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = 0b01000000; +} + +def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101101> { + let Inst{9-5} = 0b10010; + let Inst{3} = 1; +} + +def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle", + [/* For disassembly only; pattern left blank */]>, + T1Encoding<0b101101> { + let Inst{9-5} = 0b10010; + let Inst{3} = 0; +} + // The i32imm operand $val can be 
used by a debugger to store more information // about the breakpoint. def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val", @@ -140,6 +192,19 @@ def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val", let Inst{9-8} = 0b10; } +// Change Processor State is a system instruction -- for disassembly only. +// The singleton $opt operand contains the following information: +// opt{4-0} = mode ==> don't care +// opt{5} = changemode ==> 0 (false for 16-bit Thumb instr) +// opt{8-6} = AIF from Inst{2-0} +// opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable +// +// The opt{4-0} and opt{5} sub-fields are to accommodate 32-bit Thumb and ARM +// CPS which has more options. +def tCPS : T1I<(outs), (ins i32imm:$opt), NoItinerary, "cps${opt:cps}", + [/* For disassembly only; pattern left blank */]>, + T1Misc<0b0110011>; + // For both thumb1 and thumb2. let isNotDuplicable = 1 in def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, @@ -208,7 +273,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { let Inst{6-3} = 0b1110; // Rm = lr } // Alternative return instruction used by vararg functions. 
- def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target", []>, + def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target",[]>, T1Special<{1,1,0,?}>; // A6.2.3 & A8.6.25 } @@ -236,20 +301,20 @@ let isCall = 1, D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, "bl\t${func:call}", [(ARMtcall tglobaladdr:$func)]>, Requires<[IsThumb, IsNotDarwin]>; // ARMv5T and above, also used for Thumb2 def tBLXi : TIx2<0b11110, 0b11, 0, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, "blx\t${func:call}", [(ARMcall tglobaladdr:$func)]>, Requires<[IsThumb, HasV5T, IsNotDarwin]>; // Also used for Thumb2 - def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, + def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, "blx\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T, IsNotDarwin]>, @@ -257,7 +322,7 @@ let isCall = 1, // ARMv4T def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?, - (outs), (ins tGPR:$func, variable_ops), IIC_Br, + (outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb1Only, IsNotDarwin]>; @@ -271,20 +336,20 @@ let isCall = 1, D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in { // Also used for Thumb2 def tBLr9 : TIx2<0b11110, 0b11, 1, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, "bl\t${func:call}", [(ARMtcall tglobaladdr:$func)]>, Requires<[IsThumb, IsDarwin]>; // ARMv5T and above, also used for Thumb2 def tBLXi_r9 : TIx2<0b11110, 0b11, 0, - (outs), (ins i32imm:$func, variable_ops), IIC_Br, + (outs), (ins i32imm:$func, variable_ops), IIC_Br, "blx\t${func:call}", [(ARMcall tglobaladdr:$func)]>, Requires<[IsThumb, HasV5T, IsDarwin]>; // Also used for Thumb2 - def 
tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, + def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, "blx\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T, IsDarwin]>, @@ -292,7 +357,7 @@ let isCall = 1, // ARMv4T def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?, - (outs), (ins tGPR:$func, variable_ops), IIC_Br, + (outs), (ins tGPR:$func, variable_ops), IIC_Br, "mov\tlr, pc\n\tbx\t$func", [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb1Only, IsDarwin]>; @@ -307,7 +372,7 @@ let isBranch = 1, isTerminator = 1 in { // Far jump let Defs = [LR] in - def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br, + def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br, "bl\t$target\t@ far jump",[]>; def tBR_JTr : T1JTI<(outs), @@ -340,16 +405,34 @@ let isBranch = 1, isTerminator = 1 in { T1Misc<{1,0,?,1,?,?,?}>; } +// A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only +// A8.6.16 B: Encoding T1 +// If Inst{11-8} == 0b1111 then SEE SVC +let isCall = 1 in { +def tSVC : T1pI<(outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", []>, + Encoding16 { + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1111; +} +} + +// A8.6.16 B: Encoding T1 -- for disassembly only +// If Inst{11-8} == 0b1110 then UNDEFINED +def tTRAP : T1I<(outs), (ins), IIC_Br, "trap", []>, Encoding16 { + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1110; +} + //===----------------------------------------------------------------------===// // Load Store Instructions. 
// -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in -def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, +let canFoldAsLoad = 1, isReMaterializable = 1 in +def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, "ldr", "\t$dst, $addr", [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>, T1LdSt<0b100>; -def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, +def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, "ldr", "\t$dst, $addr", []>, T1LdSt4Imm<{1,?,?}>; @@ -399,15 +482,14 @@ def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi, // Load tconstpool // FIXME: Use ldr.n to work around a Darwin assembler bug. -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, "ldr", ".n\t$dst, $addr", [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>, T1Encoding<{0,1,0,0,1,?}>; // A6.2 & A8.6.59 // Special LDR for loads from non-pc-relative constpools. -let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1 in def tLDRcp : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi, "ldr", "\t$dst, $addr", []>, T1LdStSP<{1,?,?}>; @@ -769,7 +851,7 @@ def tUXTH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr, T1Misc<{0,0,1,0,1,0,?}>; -// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation. +// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation. // Expanded after instruction selection into a branch sequence. let usesCustomInserter = 1 in // Expanded after instruction selection. 
def tMOVCCr_pseudo : diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 55c7aa2..316567d 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -13,7 +13,7 @@ // IT block predicate field def it_pred : Operand<i32> { - let PrintMethod = "printPredicateOperand"; + let PrintMethod = "printMandatoryPredicateOperand"; } // IT block condition mask @@ -53,10 +53,10 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11]. def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; + return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; }]>; -// t2_so_imm_not - Match an immediate that is a complement +// t2_so_imm_not - Match an immediate that is a complement // of a t2_so_imm. def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ @@ -114,13 +114,13 @@ def imm0_4095 : Operand<i32>, return (uint32_t)N->getZExtValue() < 4096; }]>; -def imm0_4095_neg : PatLeaf<(i32 imm), [{ - return (uint32_t)(-N->getZExtValue()) < 4096; -}], imm_neg_XFORM>; +def imm0_4095_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 4096; +}], imm_neg_XFORM>; def imm0_255_neg : PatLeaf<(i32 imm), [{ return (uint32_t)(-N->getZExtValue()) < 255; -}], imm_neg_XFORM>; +}], imm_neg_XFORM>; // Define Thumb2 specific addressing modes. @@ -208,7 +208,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a // binary operation that produces a value. These are predicable and can be /// changed to modify CPSR. 
-multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode, +multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0, string wide =""> { // shifted imm def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, @@ -368,15 +368,16 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, } /// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns -/// for a binary operation that produces a value and use and define the carry +/// for a binary operation that produces a value and use the carry /// bit. It's not predicable. let Uses = [CPSR] in { -multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { +multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { // shifted imm def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, "\t$dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUnused]> { + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -387,7 +388,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Comm def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, opc, ".w\t$dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUnused]> { + Requires<[IsThumb2]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -401,19 +402,23 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Comm def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, opc, ".w\t$dst, $lhs, $rhs", [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUnused]> { + Requires<[IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; let Inst{20} = 0; // The S bit. 
} - // Carry setting variants +} + +// Carry setting variants +let Defs = [CPSR] in { +multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { // shifted imm - def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, - !strconcat(opc, "s\t$dst, $lhs, $rhs"), - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUsed]> { - let Defs = [CPSR]; + def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + opc, "\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -421,11 +426,10 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Comm let Inst{15} = 0; } // register - def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, - !strconcat(opc, "s.w\t$dst, $lhs, $rhs"), - [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUsed]> { - let Defs = [CPSR]; + def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, + Requires<[IsThumb2]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -436,11 +440,10 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Comm let Inst{5-4} = 0b00; // type } // shifted register - def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, - !strconcat(opc, "s.w\t$dst, $lhs, $rhs"), - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, - Requires<[IsThumb2, CarryDefIsUsed]> { - let Defs = [CPSR]; + def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + opc, ".w\t$dst, $lhs, $rhs", + [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -448,6 +451,7 @@ 
multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Comm } } } +} /// T2I_rbin_s_is - Same as T2I_rbin_is except sets 's' bit. let Defs = [CPSR] in { @@ -626,19 +630,6 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> { } } -/// T2I_picld - Defines the PIC load pattern. -class T2I_picld<string opc, PatFrag opnode> : - T2I<(outs GPR:$dst), (ins addrmodepc:$addr), IIC_iLoadi, - !strconcat("\n${addr:label}:\n\t", opc), "\t$dst, $addr", - [(set GPR:$dst, (opnode addrmodepc:$addr))]>; - -/// T2I_picst - Defines the PIC store pattern. -class T2I_picst<string opc, PatFrag opnode> : - T2I<(outs), (ins GPR:$src, addrmodepc:$addr), IIC_iStorer, - !strconcat("\n${addr:label}:\n\t", opc), "\t$src, $addr", - [(opnode GPR:$src, addrmodepc:$addr)]>; - - /// T2I_unary_rrot - A unary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { @@ -666,6 +657,31 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { } } +// DO variant - disassembly only, no pattern + +multiclass T2I_unary_rrot_DO<bits<3> opcod, string opc> { + def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + opc, "\t$dst, $src", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + opc, "\t$dst, $src, ror $rot", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + /// T2I_bin_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. 
multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { @@ -692,6 +708,29 @@ multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { } } +// DO variant - disassembly only, no pattern + +multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { + def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, + opc, "\t$dst, $LHS, $RHS", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = 0b00; // rotate + } + def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = {?,?}; // rotate + } +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -734,7 +773,7 @@ def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), let Inst{19-16} = 0b1101; // Rn = sp let Inst{15} = 0; } -def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), +def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), IIC_iALUi, "addw", "\t$dst, $sp, $imm", []> { let Inst{31-27} = 0b11110; let Inst{25} = 1; @@ -787,6 +826,25 @@ def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), let Inst{15} = 0; } +// Signed and unsigned division, for disassembly only +def t2SDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, + "sdiv", "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011100; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +def t2UDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, + "udiv", "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011101; + let Inst{20} = 0b1; + 
let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + // Pseudo instruction that will expand into a t2SUBrSPi + a copy. let usesCustomInserter = 1 in { // Expanded after instruction selection. def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), @@ -803,7 +861,7 @@ def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), // // Load -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in defm t2LDR : T2I_ld<0, 0b10, "ldr", UnOpFrag<(load node:$Src)>>; // Loads with zero extension @@ -926,9 +984,9 @@ def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb), } // Store -defm t2STR : T2I_st<0b10, "str", BinOpFrag<(store node:$LHS, node:$RHS)>>; -defm t2STRB : T2I_st<0b00, "strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; -defm t2STRH : T2I_st<0b01, "strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; +defm t2STR :T2I_st<0b10,"str", BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm t2STRB:T2I_st<0b00,"strb",BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; +defm t2STRH:T2I_st<0b01,"strh",BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; // Store doubleword let mayLoad = 1, hasExtraSrcRegAllocReq = 1 in @@ -989,7 +1047,7 @@ def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), - IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> { + IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' @@ -1001,7 +1059,7 @@ def t2LDM : T2XI<(outs), let mayStore = 1, hasExtraSrcRegAllocReq = 1 in def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops), - IIC_iStorem, "stm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> { + IIC_iStorem, 
"stm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; let Inst{24-23} = {?, ?}; // IA: '01', DB: '10' @@ -1074,13 +1132,15 @@ defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; defm t2SXTH : T2I_unary_rrot<0b000, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; +defm t2SXTB16 : T2I_unary_rrot_DO<0b010, "sxtb16">; defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; defm t2SXTAH : T2I_bin_rrot<0b000, "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; +defm t2SXTAB16 : T2I_bin_rrot_DO<0b010, "sxtab16">; -// TODO: SXT(A){B|H}16 +// TODO: SXT(A){B|H}16 - done for disassembly only // Zero extenders @@ -1101,6 +1161,7 @@ defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; defm t2UXTAH : T2I_bin_rrot<0b001, "uxtah", BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +defm t2UXTAB16 : T2I_bin_rrot_DO<0b011, "uxtab16">; } //===----------------------------------------------------------------------===// @@ -1119,9 +1180,13 @@ defm t2SUBS : T2I_bin_s_irs <0b1101, "sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>; defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", - BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>; + BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", - BinOpFrag<(sube node:$LHS, node:$RHS)>>; + BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; +defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc", + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; // RSB defm t2RSB : T2I_rbin_is <0b1110, "rsb", @@ -1138,6 +1203,155 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +// Select Bytes -- for disassembly only + +def t2SEL : T2I<(outs 
GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel", + "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b010; + let Inst{23} = 0b1; + let Inst{22-20} = 0b010; + let Inst{15-12} = 0b1111; + let Inst{7} = 0b1; + let Inst{6-4} = 0b000; +} + +// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) +// And Miscellaneous operations -- for disassembly only +class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc> + : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, opc, + "\t$dst, $a, $b", [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0101; + let Inst{22-20} = op22_20; + let Inst{15-12} = 0b1111; + let Inst{7-4} = op7_4; +} + +// Saturating add/subtract -- for disassembly only + +def t2QADD : T2I_pam<0b000, 0b1000, "qadd">; +def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; +def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; +def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; +def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd">; +def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub">; +def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; +def t2QSUB : T2I_pam<0b000, 0b1010, "qsub">; +def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; +def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; +def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; +def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">; +def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">; +def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">; +def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">; +def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def t2SASX : T2I_pam<0b010, 0b0000, "sasx">; +def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">; +def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">; +def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">; +def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">; +def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">; +def t2UASX : T2I_pam<0b010, 0b0100, "uasx">; +def t2UADD16 : T2I_pam<0b001, 
0b0100, "uadd16">; +def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">; +def t2USAX : T2I_pam<0b110, 0b0100, "usax">; +def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">; +def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">; +def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">; +def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">; +def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">; +def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">; +def t2SHSUB8 : T2I_pam<0b100, 0b0010, "shsub8">; +def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">; +def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">; +def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">; +def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">; +def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">; +def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only + +def t2USAD8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + NoItinerary, "usad8", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2USADA8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), NoItinerary, "usada8", + "\t$dst, $a, $b, $acc", []>; + +// Signed/Unsigned saturate -- for disassembly only + +def t2SSATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 0; // sh = '0' +} + +def t2SSATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' +} + +def 
t2SSAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, + "ssat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' +} + +def t2USATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 0; // sh = '0' +} + +def t2USATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), + NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' +} + +def t2USAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, + "usat16", "\t$dst, $bit_pos, $a", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' +} //===----------------------------------------------------------------------===// // Shift and rotate Instructions. 
@@ -1342,6 +1556,8 @@ def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, } } // neverHasSideEffects +// Rounding variants of the below included for disassembly only + // Most significant word multiply def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, "smmul", "\t$dst, $a, $b", @@ -1353,6 +1569,15 @@ def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } +def t2SMMULR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + "smmulr", "\t$dst, $a, $b", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c", [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]> { @@ -1363,6 +1588,14 @@ def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } +def t2SMMLAR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmlar", "\t$dst, $a, $b, $c", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c", @@ -1374,6 +1607,15 @@ def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } +def t2SMMLSR : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, + "smmlsr", "\t$dst, $a, $b, $c", []> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{15-12} = {?, ?, ?, ?}; // Ra + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + multiclass T2I_smul<string opc, 
PatFrag opnode> { def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b", @@ -1466,7 +1708,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16)))))]> { + (sra GPR:$b, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1490,7 +1732,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), - (sra GPR:$b, (i32 16)))))]> { + (sra GPR:$b, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1502,7 +1744,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sext_inreg GPR:$b, i16)), (i32 16))))]> { + (sext_inreg GPR:$b, i16)), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1514,7 +1756,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sra GPR:$b, (i32 16))), (i32 16))))]> { + (sra GPR:$b, (i32 16))), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1527,16 +1769,70 @@ multiclass T2I_smla<string opc, PatFrag opnode> { defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; -// TODO: Halfword multiple accumulate 
long: SMLAL<x><y> -// TODO: Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD - +// Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only +def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; +def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", + [/* For disassembly only; pattern left blank */]>; + +// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD +// These are for disassembly only. 
+ +def t2SMUAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMUSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> { + let Inst{15-12} = 0b1111; +} +def t2SMLAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlad", + "\t$dst, $a, $b, $acc", []>; +def t2SMLADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smladx", + "\t$dst, $a, $b, $acc", []>; +def t2SMLSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsd", + "\t$dst, $a, $b, $acc", []>; +def t2SMLSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), + (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsdx", + "\t$dst, $a, $b, $acc", []>; +def t2SMLALD : T2I_mac<1, 0b100, 0b1100, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlald", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaldx", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLSLD : T2I_mac<1, 0b101, 0b1100, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsld", + "\t$ldst, $hdst, $a, $b", []>; +def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs GPR:$ldst,GPR:$hdst), + (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsldx", + "\t$ldst, $hdst, $a, $b", []>; //===----------------------------------------------------------------------===// // Misc. Arithmetic Instructions. 
// -class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> +class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> : T2I<oops, iops, itin, opc, asm, pattern> { let Inst{31-27} = 0b11111; let Inst{26-22} = 0b01010; @@ -1572,7 +1868,7 @@ def t2REVSH : T2I_misc<0b01, 0b11, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, (shl GPR:$src, (i32 8))), i16))]>; def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, LSL $shamt", + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), (and (shl GPR:$src2, (i32 imm:$shamt)), 0xFFFF0000)))]> { @@ -1590,7 +1886,7 @@ def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>; def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, ASR $shamt", + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), (and (sra GPR:$src2, imm16_31:$shamt), 0xFFFF)))]> { @@ -1643,7 +1939,7 @@ defm t2TEQ : T2I_cmp_irs<0b0100, "teq", // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use -// a two-value operand where a dag node expects two operands. :( +// a two-value operand where a dag node expects two operands. 
:( def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr, "mov", ".w\t$dst, $true", [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, @@ -1723,6 +2019,66 @@ def t2Int_SyncBarrierV7 : AInoP<(outs), (ins), } } +// Helper class for multiclass T2MemB -- for disassembly only +class T2I_memb<string opc, string asm> + : T2I<(outs), (ins), NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-20} = 0xf3b; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +multiclass T2MemB<bits<4> op7_4, string opc> { + + def st : T2I_memb<opc, "\tst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1110; + } + + def ish : T2I_memb<opc, "\tish"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1011; + } + + def ishst : T2I_memb<opc, "\tishst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b1010; + } + + def nsh : T2I_memb<opc, "\tnsh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0111; + } + + def nshst : T2I_memb<opc, "\tnshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0110; + } + + def osh : T2I_memb<opc, "\tosh"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0011; + } + + def oshst : T2I_memb<opc, "\toshst"> { + let Inst{7-4} = op7_4; + let Inst{3-0} = 0b0010; + } +} + +// These DMB variants are for disassembly only. +defm t2DMB : T2MemB<0b0101, "dmb">; + +// These DSB variants are for disassembly only. +defm t2DSB : T2MemB<0b0100, "dsb">; + +// ISB has only full system option -- for disassembly only +def t2ISBsy : T2I_memb<"isb", ""> { + let Inst{7-4} = 0b0110; + let Inst{3-0} = 0b1111; +} + class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern, bits<4> rt2 = 0b1111> @@ -1789,6 +2145,16 @@ def t2STREXD : T2I_strex<0b11, (outs GPR:$success), {?, ?, ?, ?}>; } +// Clear-Exclusive is for disassembly only. 
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsARM, HasV7]> { + let Inst{31-20} = 0xf3b; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{7-4} = 0b0010; +} + //===----------------------------------------------------------------------===// // TLS Instructions // @@ -1906,6 +2272,24 @@ def t2TBH : let Inst{15-8} = 0b11110000; let Inst{7-4} = 0b0001; // H form } + +// Generic versions of the above two instructions, for disassembly only + +def t2TBBgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, + "tbb", "\t[$a, $b]", []>{ + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0000; // B form +} + +def t2TBHgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br, + "tbh", "\t[$a, $b, lsl #1]", []> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{15-8} = 0b11110000; + let Inst{7-4} = 0b0001; // H form +} } // isNotDuplicable, isIndirectBranch } // isBranch, isTerminator, isBarrier @@ -1931,6 +2315,119 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), let Inst{15-8} = 0b10111111; } +// Branch and Exchange Jazelle -- for disassembly only +// Rm = Inst{19-16} +def t2BXJ : T2I<(outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111100; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Change Processor State is a system instruction -- for disassembly only. 
+// The singleton $opt operand contains the following information: +// opt{4-0} = mode from Inst{4-0} +// opt{5} = changemode from Inst{17} +// opt{8-6} = AIF from Inst{8-6} +// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable +def t2CPS : T2XI<(outs),(ins i32imm:$opt), NoItinerary, "cps${opt:cps}", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111010; + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// A6.3.4 Branches and miscellaneous control +// Table A6-14 Change Processor State, and hint instructions +// Helper class for disassembly only. +class T2I_hint<bits<8> op7_0, string opc, string asm> + : T2I<(outs), (ins), NoItinerary, opc, asm, + [/* For disassembly only; pattern left blank */]> { + let Inst{31-20} = 0xf3a; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-8} = 0b000; + let Inst{7-0} = op7_0; +} + +def t2NOP : T2I_hint<0b00000000, "nop", ".w">; +def t2YIELD : T2I_hint<0b00000001, "yield", ".w">; +def t2WFE : T2I_hint<0b00000010, "wfe", ".w">; +def t2WFI : T2I_hint<0b00000011, "wfi", ".w">; +def t2SEV : T2I_hint<0b00000100, "sev", ".w">; + +def t2DBG : T2I<(outs),(ins i32imm:$opt), NoItinerary, "dbg", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-20} = 0xf3a; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-8} = 0b000; + let Inst{7-4} = 0b1111; +} + +// Secure Monitor Call is a system instruction -- for disassembly only +// Option = Inst{19-16} +def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26-20} = 0b1111111; + let Inst{15-12} = 0b1000; +} + +// Store Return State is a system instruction -- for disassembly only +def t2SRSDBW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + 
let Inst{26-20} = 0b0000010; // W = 1 +} + +def t2SRSDB : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000000; // W = 0 +} + +def t2SRSIAW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011010; // W = 1 +} + +def t2SRSIA : T2I<(outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011000; // W = 0 +} + +// Return From Exception is a system instruction -- for disassembly only +def t2RFEDBW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfedb", "\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000011; // W = 1 +} + +def t2RFEDB : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeab", "\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000001; // W = 0 +} + +def t2RFEIAW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base!", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011011; // W = 1 +} + +def t2RFEIA : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0011001; // W = 0 +} + //===----------------------------------------------------------------------===// // Non-Instruction Patterns // @@ -1970,9 +2467,59 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // Pseudo instruction that combines ldr from constpool and add pc. This should // be expanded into two instructions late to allow if-conversion and // scheduling. 
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), NoItinerary, "@ ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb2]>; + +//===----------------------------------------------------------------------===// +// Move between special register and ARM core register -- for disassembly only +// + +// Rd = Instr{11-8} +def t2MRS : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11111; + let Inst{20} = 0; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// Rd = Instr{11-8} +def t2MRSsys : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11111; + let Inst{20} = 1; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// FIXME: mask is ignored for the time being. +// Rn = Inst{19-16} +def t2MSR : T2I<(outs), (ins GPR:$src), NoItinerary, "msr", "\tcpsr, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11100; + let Inst{20} = 0; // The R bit. + let Inst{15-14} = 0b10; + let Inst{12} = 0; +} + +// FIXME: mask is ignored for the time being. +// Rn = Inst{19-16} +def t2MSRsys : T2I<(outs), (ins GPR:$src), NoItinerary, "msr", "\tspsr, $src", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-21} = 0b11100; + let Inst{20} = 1; // The R bit. 
+ let Inst{15-14} = 0b10; + let Inst{12} = 0; +} diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index e516593..7c117ed 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -54,7 +54,7 @@ def vfp_f64imm : Operand<f64>, // Load / store Instructions. // -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr), IIC_fpLoad64, "vldr", ".64\t$dst, $addr", [(set DPR:$dst, (load addrmode5:$addr))]>; diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index bef5a06..8c0b720 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -60,7 +60,7 @@ extern "C" { // whole compilation callback doesn't exist as far as the caller is // concerned, so we can't just preserve the callee saved regs. "stmdb sp!, {r0, r1, r2, r3, lr}\n" -#ifndef __SOFTFP__ +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" #endif // The LR contains the address of the stub function on entry. @@ -83,7 +83,7 @@ extern "C" { // 6-20 | D0..D7 | Saved VFP registers // +--------+ // -#ifndef __SOFTFP__ +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) // Restore VFP caller-saved registers. "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" #endif diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4e2d181..5b4f02d 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -748,11 +748,19 @@ static bool isMemoryOp(const MachineInstr *MI) { if (MMO->isVolatile()) return false; - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is not. + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. 
if (MMO->getAlignment() < 4) return false; } + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && + MI->getOperand(0).isUndef()) + return false; + int Opcode = MI->getOpcode(); switch (Opcode) { default: break; diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index f60cc33..d6d595c 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -122,6 +122,7 @@ namespace { void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum); void printPredicateOperand(const MachineInstr *MI, int OpNum); + void printMandatoryPredicateOperand(const MachineInstr *MI, int OpNum); void printSBitModifierOperand(const MachineInstr *MI, int OpNum); void printPCLabel(const MachineInstr *MI, int OpNum); void printRegisterList(const MachineInstr *MI, int OpNum); @@ -786,6 +787,12 @@ void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int OpNum) { O << ARMCondCodeToString(CC); } +void ARMAsmPrinter::printMandatoryPredicateOperand(const MachineInstr *MI, + int OpNum) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum){ unsigned Reg = MI->getOperand(OpNum).getReg(); if (Reg) { diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp index d7d8e09..a2084b0 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -325,6 +325,12 @@ void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum) { O << ARMCondCodeToString(CC); } +void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, + unsigned OpNum) { + 
ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum){ if (MI->getOperand(OpNum).getReg()) { assert(MI->getOperand(OpNum).getReg() == ARM::CPSR && diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h index 23a7f05..b7964c9 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -71,6 +71,7 @@ public: void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum) {} void printPredicateOperand(const MCInst *MI, unsigned OpNum); + void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum); void printSBitModifierOperand(const MCInst *MI, unsigned OpNum); void printRegisterList(const MCInst *MI, unsigned OpNum); void printCPInstOperand(const MCInst *MI, unsigned OpNum, diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 9efb5a1..57b65cf 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -10,6 +10,8 @@ Reimplement 'select' in terms of 'SEL'. * Implement pre/post increment support. (e.g. PR935) * Implement smarter constant generation for binops with large immediates. +A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX + //===---------------------------------------------------------------------===// Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. 
By the diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index d6630ce..b61ce29 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -450,9 +450,9 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset -= AFI->getGPRCalleeSavedArea1Offset(); else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) Offset -= AFI->getGPRCalleeSavedArea2Offset(); - else if (hasFP(MF)) { - assert(SPAdj == 0 && "Unexpected"); - // There is alloca()'s in this function, must reference off the frame + else if (MF.getFrameInfo()->hasVarSizedObjects()) { + assert(SPAdj == 0 && hasFP(MF) && "Unexpected"); + // There are alloca()'s in this function, must reference off the frame // pointer instead. FrameReg = getFrameRegister(MF); Offset -= AFI->getFramePtrSpillOffset(); diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp index d6b17c2..5303d85 100644 --- a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp +++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp @@ -159,10 +159,6 @@ namespace { // target-specific node if it hasn't already been changed. SDNode *Select(SDNode *N); - /// InstructionSelect - This callback is invoked by - /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "Alpha DAG->DAG Pattern Instruction Selection"; } @@ -222,20 +218,11 @@ SDNode *AlphaDAGToDAGISel::getGlobalRetAddr() { return CurDAG->getRegister(GlobalRetAddr, TLI.getPointerTy()).getNode(); } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void AlphaDAGToDAGISel::InstructionSelect() { - // Select target instructions for the DAG. 
- SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. SDNode *AlphaDAGToDAGISel::Select(SDNode *N) { - if (N->isMachineOpcode()) { + if (N->isMachineOpcode()) return NULL; // Already selected. - } DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td index 8917e86..341c4a7 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.td +++ b/lib/Target/Alpha/AlphaInstrInfo.td @@ -92,7 +92,7 @@ def immSExt16int : PatLeaf<(imm), [{ //(int)imm fits in a 16 bit sign extended ((int64_t)N->getZExtValue() << 32) >> 32; }], SExt16>; -def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm:$L), [{ +def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm), [{ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!RHS) return 0; uint64_t build = get_zapImm(N->getOperand(0), (uint64_t)RHS->getZExtValue()); @@ -602,9 +602,9 @@ def RPCC : MfcForm<0x18, 0xC000, "rpcc $RA", s_rpcc>; //Read process cycle count def MB : MfcPForm<0x18, 0x4000, "mb", s_imisc>; //memory barrier def WMB : MfcPForm<0x18, 0x4400, "wmb", s_imisc>; //write memory barrier -def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 1), (i64 imm:$dev)), +def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 1), (i64 imm)), (WMB)>; -def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 imm:$ss), (i64 imm:$dev)), +def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 imm), (i64 imm)), (MB)>; //Basic Floating point ops diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp index 2c9cc60..c8d71aa 100644 --- a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp +++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp @@ -41,7 +41,7 @@ namespace { BlackfinDAGToDAGISel(BlackfinTargetMachine &TM, CodeGenOpt::Level 
OptLevel) : SelectionDAGISel(TM, OptLevel) {} - virtual void InstructionSelect(); + virtual void PostprocessISelDAG(); virtual const char *getPassName() const { return "Blackfin DAG->DAG Pattern Instruction Selection"; @@ -72,13 +72,7 @@ FunctionPass *llvm::createBlackfinISelDag(BlackfinTargetMachine &TM, return new BlackfinDAGToDAGISel(TM, OptLevel); } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void BlackfinDAGToDAGISel::InstructionSelect() { - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - DEBUG(errs() << "Selected selection DAG before regclass fixup:\n"); - DEBUG(CurDAG->dump()); +void BlackfinDAGToDAGISel::PostprocessISelDAG() { FixRegisterClasses(*CurDAG); } diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index c1c1d80..10f873f 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -49,7 +49,6 @@ #include "llvm/System/Host.h" #include "llvm/Config/config.h" #include <algorithm> -#include <sstream> using namespace llvm; extern "C" void LLVMInitializeCBackendTarget() { @@ -153,26 +152,16 @@ namespace { return false; } - raw_ostream &printType(formatted_raw_ostream &Out, - const Type *Ty, + raw_ostream &printType(raw_ostream &Out, const Type *Ty, bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, const AttrListPtr &PAL = AttrListPtr()); - std::ostream &printType(std::ostream &Out, const Type *Ty, - bool isSigned = false, - const std::string &VariableName = "", - bool IgnoreName = false, - const AttrListPtr &PAL = AttrListPtr()); - raw_ostream &printSimpleType(formatted_raw_ostream &Out, - const Type *Ty, - bool isSigned, - const std::string &NameSoFar = ""); - std::ostream &printSimpleType(std::ostream &Out, const Type *Ty, - bool isSigned, + raw_ostream &printSimpleType(raw_ostream &Out, const Type *Ty, + bool isSigned, const std::string &NameSoFar = ""); - 
void printStructReturnPointerFunctionType(formatted_raw_ostream &Out, + void printStructReturnPointerFunctionType(raw_ostream &Out, const AttrListPtr &PAL, const PointerType *Ty); @@ -385,8 +374,8 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { // If this isn't a struct or array type, remove it from our set of types // to name. This simplifies emission later. - if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second) && - !isa<ArrayType>(I->second)) { + if (!I->second->isStructTy() && !I->second->isOpaqueTy() && + !I->second->isArrayTy()) { TST.remove(I); } else { // If this is not used, remove it from the symbol table. @@ -405,7 +394,7 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { unsigned RenameCounter = 0; for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end(); I != E; ++I) - if (isa<StructType>(*I) || isa<ArrayType>(*I)) { + if ((*I)->isStructTy() || (*I)->isArrayTy()) { while (M.addTypeName("unnamed"+utostr(RenameCounter), *I)) ++RenameCounter; Changed = true; @@ -454,11 +443,12 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { /// printStructReturnPointerFunctionType - This is like printType for a struct /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. 
-void CWriter::printStructReturnPointerFunctionType(formatted_raw_ostream &Out, +void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out, const AttrListPtr &PAL, const PointerType *TheTy) { const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType()); - std::stringstream FunctionInnards; + std::string tstr; + raw_string_ostream FunctionInnards(tstr); FunctionInnards << " (*) ("; bool PrintedType = false; @@ -470,7 +460,7 @@ void CWriter::printStructReturnPointerFunctionType(formatted_raw_ostream &Out, FunctionInnards << ", "; const Type *ArgTy = *I; if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(isa<PointerType>(ArgTy)); + assert(ArgTy->isPointerTy()); ArgTy = cast<PointerType>(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, @@ -484,63 +474,14 @@ void CWriter::printStructReturnPointerFunctionType(formatted_raw_ostream &Out, FunctionInnards << "void"; } FunctionInnards << ')'; - std::string tstr = FunctionInnards.str(); printType(Out, RetTy, - /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr); + /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); } raw_ostream & -CWriter::printSimpleType(formatted_raw_ostream &Out, const Type *Ty, - bool isSigned, +CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned, const std::string &NameSoFar) { - assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<VectorType>(Ty)) && - "Invalid type for printSimpleType"); - switch (Ty->getTypeID()) { - case Type::VoidTyID: return Out << "void " << NameSoFar; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits == 1) - return Out << "bool " << NameSoFar; - else if (NumBits <= 8) - return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar; - else if (NumBits <= 16) - return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar; - else if (NumBits <= 32) - return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar; - 
else if (NumBits <= 64) - return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar; - else { - assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); - return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar; - } - } - case Type::FloatTyID: return Out << "float " << NameSoFar; - case Type::DoubleTyID: return Out << "double " << NameSoFar; - // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is - // present matches host 'long double'. - case Type::X86_FP80TyID: - case Type::PPC_FP128TyID: - case Type::FP128TyID: return Out << "long double " << NameSoFar; - - case Type::VectorTyID: { - const VectorType *VTy = cast<VectorType>(Ty); - return printSimpleType(Out, VTy->getElementType(), isSigned, - " __attribute__((vector_size(" + - utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar); - } - - default: -#ifndef NDEBUG - errs() << "Unknown primitive type: " << *Ty << "\n"; -#endif - llvm_unreachable(0); - } -} - -std::ostream & -CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned, - const std::string &NameSoFar) { - assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<VectorType>(Ty)) && + assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) && "Invalid type for printSimpleType"); switch (Ty->getTypeID()) { case Type::VoidTyID: return Out << "void " << NameSoFar; @@ -587,120 +528,16 @@ CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned, // Pass the Type* and the variable name and this prints out the variable // declaration. // -raw_ostream &CWriter::printType(formatted_raw_ostream &Out, - const Type *Ty, +raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, const AttrListPtr &PAL) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<VectorType>(Ty)) { - printSimpleType(Out, Ty, isSigned, NameSoFar); - return Out; - } - - // Check to see if the type is named. 
- if (!IgnoreName || isa<OpaqueType>(Ty)) { - std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty); - if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar; - } - - switch (Ty->getTypeID()) { - case Type::FunctionTyID: { - const FunctionType *FTy = cast<FunctionType>(Ty); - std::stringstream FunctionInnards; - FunctionInnards << " (" << NameSoFar << ") ("; - unsigned Idx = 1; - for (FunctionType::param_iterator I = FTy->param_begin(), - E = FTy->param_end(); I != E; ++I) { - const Type *ArgTy = *I; - if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(isa<PointerType>(ArgTy)); - ArgTy = cast<PointerType>(ArgTy)->getElementType(); - } - if (I != FTy->param_begin()) - FunctionInnards << ", "; - printType(FunctionInnards, ArgTy, - /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), ""); - ++Idx; - } - if (FTy->isVarArg()) { - if (FTy->getNumParams()) - FunctionInnards << ", ..."; - } else if (!FTy->getNumParams()) { - FunctionInnards << "void"; - } - FunctionInnards << ')'; - std::string tstr = FunctionInnards.str(); - printType(Out, FTy->getReturnType(), - /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr); - return Out; - } - case Type::StructTyID: { - const StructType *STy = cast<StructType>(Ty); - Out << NameSoFar + " {\n"; - unsigned Idx = 0; - for (StructType::element_iterator I = STy->element_begin(), - E = STy->element_end(); I != E; ++I) { - Out << " "; - printType(Out, *I, false, "field" + utostr(Idx++)); - Out << ";\n"; - } - Out << '}'; - if (STy->isPacked()) - Out << " __attribute__ ((packed))"; - return Out; - } - - case Type::PointerTyID: { - const PointerType *PTy = cast<PointerType>(Ty); - std::string ptrName = "*" + NameSoFar; - - if (isa<ArrayType>(PTy->getElementType()) || - isa<VectorType>(PTy->getElementType())) - ptrName = "(" + ptrName + ")"; - - if (!PAL.isEmpty()) - // Must be a function ptr cast! 
- return printType(Out, PTy->getElementType(), false, ptrName, true, PAL); - return printType(Out, PTy->getElementType(), false, ptrName); - } - - case Type::ArrayTyID: { - const ArrayType *ATy = cast<ArrayType>(Ty); - unsigned NumElements = ATy->getNumElements(); - if (NumElements == 0) NumElements = 1; - // Arrays are wrapped in structs to allow them to have normal - // value semantics (avoiding the array "decay"). - Out << NameSoFar << " { "; - printType(Out, ATy->getElementType(), false, - "array[" + utostr(NumElements) + "]"); - return Out << "; }"; - } - - case Type::OpaqueTyID: { - std::string TyName = "struct opaque_" + itostr(OpaqueCounter++); - assert(TypeNames.find(Ty) == TypeNames.end()); - TypeNames[Ty] = TyName; - return Out << TyName << ' ' << NameSoFar; - } - default: - llvm_unreachable("Unhandled case in getTypeProps!"); - } - - return Out; -} - -// Pass the Type* and the variable name and this prints out the variable -// declaration. -// -std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty, - bool isSigned, const std::string &NameSoFar, - bool IgnoreName, const AttrListPtr &PAL) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<VectorType>(Ty)) { + if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) { printSimpleType(Out, Ty, isSigned, NameSoFar); return Out; } // Check to see if the type is named. 
- if (!IgnoreName || isa<OpaqueType>(Ty)) { + if (!IgnoreName || Ty->isOpaqueTy()) { std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty); if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar; } @@ -708,14 +545,15 @@ std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty, switch (Ty->getTypeID()) { case Type::FunctionTyID: { const FunctionType *FTy = cast<FunctionType>(Ty); - std::stringstream FunctionInnards; + std::string tstr; + raw_string_ostream FunctionInnards(tstr); FunctionInnards << " (" << NameSoFar << ") ("; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { const Type *ArgTy = *I; if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(isa<PointerType>(ArgTy)); + assert(ArgTy->isPointerTy()); ArgTy = cast<PointerType>(ArgTy)->getElementType(); } if (I != FTy->param_begin()) @@ -731,9 +569,8 @@ std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty, FunctionInnards << "void"; } FunctionInnards << ')'; - std::string tstr = FunctionInnards.str(); printType(Out, FTy->getReturnType(), - /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr); + /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); return Out; } case Type::StructTyID: { @@ -756,8 +593,8 @@ std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty, const PointerType *PTy = cast<PointerType>(Ty); std::string ptrName = "*" + NameSoFar; - if (isa<ArrayType>(PTy->getElementType()) || - isa<VectorType>(PTy->getElementType())) + if (PTy->getElementType()->isArrayTy() || + PTy->getElementType()->isVectorTy()) ptrName = "(" + ptrName + ")"; if (!PAL.isEmpty()) @@ -1144,7 +981,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) { Out << "(("; printType(Out, CPV->getType()); // sign doesn't matter Out << ")/*UNDEF*/"; - if (!isa<VectorType>(CPV->getType())) { + if (!CPV->getType()->isVectorTy()) { Out << "0)"; } else { Out << "{})"; @@ -1660,7 
+1497,7 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) { // If the operand was a pointer, convert to a large integer type. const Type* OpTy = Operand->getType(); - if (isa<PointerType>(OpTy)) + if (OpTy->isPointerTy()) OpTy = TD->getIntPtrType(Operand->getContext()); Out << "(("; @@ -2102,10 +1939,10 @@ bool CWriter::doInitialization(Module &M) { // complete. If the value is an aggregate, print out { 0 }, and let // the compiler figure out the rest of the zeros. Out << " = " ; - if (isa<StructType>(I->getInitializer()->getType()) || - isa<VectorType>(I->getInitializer()->getType())) { + if (I->getInitializer()->getType()->isStructTy() || + I->getInitializer()->getType()->isVectorTy()) { Out << "{ 0 }"; - } else if (isa<ArrayType>(I->getInitializer()->getType())) { + } else if (I->getInitializer()->getType()->isArrayTy()) { // As with structs and vectors, but with an extra set of braces // because arrays are wrapped in structs. Out << "{ { 0 } }"; @@ -2274,7 +2111,7 @@ void CWriter::printModuleTypes(const TypeSymbolTable &TST) { // Out << "/* Structure contents */\n"; for (I = TST.begin(); I != End; ++I) - if (isa<StructType>(I->second) || isa<ArrayType>(I->second)) + if (I->second->isStructTy() || I->second->isArrayTy()) // Only print out used types! printContainedStructs(I->second, StructPrinted); } @@ -2287,7 +2124,7 @@ void CWriter::printModuleTypes(const TypeSymbolTable &TST) { void CWriter::printContainedStructs(const Type *Ty, std::set<const Type*> &StructPrinted) { // Don't walk through pointers. - if (isa<PointerType>(Ty) || Ty->isPrimitiveType() || Ty->isIntegerTy()) + if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy()) return; // Print all contained types first. 
@@ -2295,7 +2132,7 @@ void CWriter::printContainedStructs(const Type *Ty, E = Ty->subtype_end(); I != E; ++I) printContainedStructs(*I, StructPrinted); - if (isa<StructType>(Ty) || isa<ArrayType>(Ty)) { + if (Ty->isStructTy() || Ty->isArrayTy()) { // Check to see if we have already printed this struct. if (StructPrinted.insert(Ty).second) { // Print structure type out. @@ -2328,7 +2165,8 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) { const FunctionType *FT = cast<FunctionType>(F->getFunctionType()); const AttrListPtr &PAL = F->getAttributes(); - std::stringstream FunctionInnards; + std::string tstr; + raw_string_ostream FunctionInnards(tstr); // Print out the name... FunctionInnards << GetValueName(F) << '('; @@ -2383,7 +2221,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) { if (PrintedArg) FunctionInnards << ", "; const Type *ArgTy = *I; if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(isa<PointerType>(ArgTy)); + assert(ArgTy->isPointerTy()); ArgTy = cast<PointerType>(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, @@ -2714,7 +2552,7 @@ void CWriter::visitPHINode(PHINode &I) { void CWriter::visitBinaryOperator(Instruction &I) { // binary instructions, shift instructions, setCond instructions. - assert(!isa<PointerType>(I.getType())); + assert(!I.getType()->isPointerTy()); // We must cast the results of binary operations which might be promoted. bool needsCast = false; @@ -3490,7 +3328,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead. 
if (isAddressExposed(Ptr)) { writeOperandInternal(Ptr, Static); - } else if (I != E && isa<StructType>(*I)) { + } else if (I != E && (*I)->isStructTy()) { // If we didn't already emit the first operand, see if we can print it as // P->f instead of "P[0].f" writeOperand(Ptr); @@ -3505,13 +3343,13 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, } for (; I != E; ++I) { - if (isa<StructType>(*I)) { + if ((*I)->isStructTy()) { Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); - } else if (isa<ArrayType>(*I)) { + } else if ((*I)->isArrayTy()) { Out << ".array["; writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); Out << ']'; - } else if (!isa<VectorType>(*I)) { + } else if (!(*I)->isVectorTy()) { Out << '['; writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); Out << ']'; @@ -3669,7 +3507,7 @@ void CWriter::visitInsertValueInst(InsertValueInst &IVI) { i != e; ++i) { const Type *IndexedTy = ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), b, i+1); - if (isa<ArrayType>(IndexedTy)) + if (IndexedTy->isArrayTy()) Out << ".array[" << *i << "]"; else Out << ".field" << *i; @@ -3690,7 +3528,7 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { i != e; ++i) { const Type *IndexedTy = ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), b, i+1); - if (isa<ArrayType>(IndexedTy)) + if (IndexedTy->isArrayTy()) Out << ".array[" << *i << "]"; else Out << ".field" << *i; @@ -3706,7 +3544,8 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel) { + CodeGenOpt::Level OptLevel, + bool DisableVerify) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; PM.add(createGCLoweringPass()); diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h index 715bbda..d178e7f 100644 --- 
a/lib/Target/CBackend/CTargetMachine.h +++ b/lib/Target/CBackend/CTargetMachine.h @@ -27,7 +27,8 @@ struct CTargetMachine : public TargetMachine { virtual bool addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel); + CodeGenOpt::Level OptLevel, + bool DisableVerify); virtual const TargetData *getTargetData() const { return 0; } }; diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td index 06eb149..47cb579 100644 --- a/lib/Target/CellSPU/SPU64InstrInfo.td +++ b/lib/Target/CellSPU/SPU64InstrInfo.td @@ -123,8 +123,8 @@ multiclass CompareLogicalGreaterThan64 { defm I64LGT: CompareLogicalGreaterThan64; def : Pat<(setugt R64C:$rA, R64C:$rB), I64LGTr64.Fragment>; -def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), - I64LGTv2i64.Fragment>; +//def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), +// I64LGTv2i64.Fragment>; // i64 setult: def : I64SETCCNegCond<setule, I64LGTr64>; @@ -201,8 +201,8 @@ multiclass CompareGreaterThan64 { defm I64GT: CompareLogicalGreaterThan64; def : Pat<(setgt R64C:$rA, R64C:$rB), I64GTr64.Fragment>; -def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), - I64GTv2i64.Fragment>; +//def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), +// I64GTv2i64.Fragment>; // i64 setult: def : I64SETCCNegCond<setle, I64GTr64>; diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 1ed06e3..396a921 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -294,15 +294,19 @@ namespace { ((vecVT == MVT::v2i64) && ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) || (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) || - (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)))) - return Select(bvNode); + (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)))) { + HandleSDNode Dummy(SDValue(bvNode, 0)); + 
if (SDNode *N = Select(bvNode)) + return N; + return Dummy.getValue().getNode(); + } // No, need to emit a constant pool spill: std::vector<Constant*> CV; for (size_t i = 0; i < bvNode->getNumOperands(); ++i) { ConstantSDNode *V = dyn_cast<ConstantSDNode > (bvNode->getOperand(i)); - CV.push_back(const_cast<ConstantInt *> (V->getConstantIntValue())); + CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue())); } Constant *CP = ConstantVector::get(CV); @@ -311,10 +315,15 @@ namespace { SDValue CGPoolOffset = SPU::LowerConstantPool(CPIdx, *CurDAG, SPUtli.getSPUTargetMachine()); - return SelectCode(CurDAG->getLoad(vecVT, dl, - CurDAG->getEntryNode(), CGPoolOffset, - PseudoSourceValue::getConstantPool(), 0, - false, false, Alignment).getNode()); + + HandleSDNode Dummy(CurDAG->getLoad(vecVT, dl, + CurDAG->getEntryNode(), CGPoolOffset, + PseudoSourceValue::getConstantPool(),0, + false, false, Alignment)); + CurDAG->ReplaceAllUsesWith(SDValue(bvNode, 0), Dummy.getValue()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); } /// Select - Convert the specified operand from a target-independent to a @@ -390,10 +399,6 @@ namespace { return false; } - /// InstructionSelect - This callback is invoked by - /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "Cell SPU DAG->DAG Pattern Instruction Selection"; } @@ -411,16 +416,6 @@ namespace { }; } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void -SPUDAGToDAGISel::InstructionSelect() -{ - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - /*! 
\arg Op The ISD instruction operand \arg N The address to be tested @@ -692,9 +687,8 @@ SPUDAGToDAGISel::Select(SDNode *N) { SDValue Ops[8]; DebugLoc dl = N->getDebugLoc(); - if (N->isMachineOpcode()) { + if (N->isMachineOpcode()) return NULL; // Already selected. - } if (Opc == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); @@ -759,43 +753,67 @@ SPUDAGToDAGISel::Select(SDNode *N) { } SDNode *shufMaskLoad = emitBuildVector(shufMask.getNode()); - SDNode *PromoteScalar = - SelectCode(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl, - Op0VecVT, Op0).getNode()); - + + HandleSDNode PromoteScalar(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl, + Op0VecVT, Op0)); + + SDValue PromScalar; + if (SDNode *N = SelectCode(PromoteScalar.getValue().getNode())) + PromScalar = SDValue(N, 0); + else + PromScalar = PromoteScalar.getValue(); + SDValue zextShuffle = CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, - SDValue(PromoteScalar, 0), - SDValue(PromoteScalar, 0), + PromScalar, PromScalar, SDValue(shufMaskLoad, 0)); - // N.B.: BIT_CONVERT replaces and updates the zextShuffle node, so we - // re-use it in the VEC2PREFSLOT selection without needing to explicitly - // call SelectCode (it's already done for us.) 
- SelectCode(CurDAG->getNode(ISD::BIT_CONVERT, dl, OpVecVT, zextShuffle).getNode()); - return SelectCode(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT, - zextShuffle).getNode()); + HandleSDNode Dummy2(zextShuffle); + if (SDNode *N = SelectCode(Dummy2.getValue().getNode())) + zextShuffle = SDValue(N, 0); + else + zextShuffle = Dummy2.getValue(); + HandleSDNode Dummy(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT, + zextShuffle)); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + SelectCode(Dummy.getValue().getNode()); + return Dummy.getValue().getNode(); } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { SDNode *CGLoad = emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode()); - return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT, - N->getOperand(0), N->getOperand(1), - SDValue(CGLoad, 0)).getNode()); + HandleSDNode Dummy(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { SDNode *CGLoad = emitBuildVector(getBorrowGenerateShufMask(*CurDAG, dl).getNode()); - return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT, - N->getOperand(0), N->getOperand(1), - SDValue(CGLoad, 0)).getNode()); + HandleSDNode Dummy(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { SDNode *CGLoad = emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode()); - return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT, - 
N->getOperand(0), N->getOperand(1), - SDValue(CGLoad, 0)).getNode()); + HandleSDNode Dummy(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); } else if (Opc == ISD::TRUNCATE) { SDValue Op0 = N->getOperand(0); if ((Op0.getOpcode() == ISD::SRA || Op0.getOpcode() == ISD::SRL) @@ -832,17 +850,14 @@ SPUDAGToDAGISel::Select(SDNode *N) { } } } else if (Opc == ISD::SHL) { - if (OpVT == MVT::i64) { + if (OpVT == MVT::i64) return SelectSHLi64(N, OpVT); - } } else if (Opc == ISD::SRL) { - if (OpVT == MVT::i64) { + if (OpVT == MVT::i64) return SelectSRLi64(N, OpVT); - } } else if (Opc == ISD::SRA) { - if (OpVT == MVT::i64) { + if (OpVT == MVT::i64) return SelectSRAi64(N, OpVT); - } } else if (Opc == ISD::FNEG && (OpVT == MVT::f64 || OpVT == MVT::v2f64)) { DebugLoc dl = N->getDebugLoc(); @@ -1224,13 +1239,15 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, ? 
shufmask.getNode() : emitBuildVector(shufmask.getNode())); - SDNode *shufNode = - Select(CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, + SDValue shufNode = + CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, SDValue(lhsNode, 0), SDValue(rhsNode, 0), - SDValue(shufMaskNode, 0)).getNode()); - - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, - SDValue(shufNode, 0)); + SDValue(shufMaskNode, 0)); + HandleSDNode Dummy(shufNode); + SDNode *SN = SelectCode(Dummy.getValue().getNode()); + if (SN == 0) SN = Dummy.getValue().getNode(); + + return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(SN, 0)); } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) { return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(emitBuildVector(i64vec.getNode()), 0)); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index b21eb37..e863ee3 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -118,8 +118,7 @@ namespace { TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, Op.getDebugLoc(), - DAG.GetOrdering(InChain.getNode())); + Callee, Args, DAG, Op.getDebugLoc()); return CallInfo.first; } diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index e3f2e9f..9c5893c 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -221,7 +221,7 @@ namespace { APFloat APF = APFloat(CFP->getValueAPF()); // copy if (CFP->getType() == Type::getFloatTy(CFP->getContext())) APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); - Out << "ConstantFP::get(getGlobalContext(), "; + Out << "ConstantFP::get(mod->getContext(), "; Out << "APFloat("; #if HAVE_PRINTF_A char Buffer[100]; @@ -346,21 +346,21 @@ namespace { // First, handle the primitive types .. 
easy if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { switch (Ty->getTypeID()) { - case Type::VoidTyID: return "Type::getVoidTy(getGlobalContext())"; + case Type::VoidTyID: return "Type::getVoidTy(mod->getContext())"; case Type::IntegerTyID: { unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); - return "IntegerType::get(getGlobalContext(), " + utostr(BitWidth) + ")"; + return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")"; } - case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(getGlobalContext())"; - case Type::FloatTyID: return "Type::getFloatTy(getGlobalContext())"; - case Type::DoubleTyID: return "Type::getDoubleTy(getGlobalContext())"; - case Type::LabelTyID: return "Type::getLabelTy(getGlobalContext())"; + case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(mod->getContext())"; + case Type::FloatTyID: return "Type::getFloatTy(mod->getContext())"; + case Type::DoubleTyID: return "Type::getDoubleTy(mod->getContext())"; + case Type::LabelTyID: return "Type::getLabelTy(mod->getContext())"; default: error("Invalid primitive type"); break; } // shouldn't be returned, but make it sensible - return "Type::getVoidTy(getGlobalContext())"; + return "Type::getVoidTy(mod->getContext())"; } // Now, see if we've seen the type before and return that @@ -514,7 +514,7 @@ namespace { TypeMap::const_iterator I = UnresolvedTypes.find(Ty); if (I == UnresolvedTypes.end()) { Out << "PATypeHolder " << typeName; - Out << "_fwd = OpaqueType::get(getGlobalContext());"; + Out << "_fwd = OpaqueType::get(mod->getContext());"; nl(Out); UnresolvedTypes[Ty] = typeName; } @@ -615,7 +615,7 @@ namespace { } case Type::OpaqueTyID: { Out << "OpaqueType* " << typeName; - Out << " = OpaqueType::get(getGlobalContext());"; + Out << " = OpaqueType::get(mod->getContext());"; nl(Out); break; } @@ -751,7 +751,7 @@ namespace { if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { std::string constValue = CI->getValue().toString(10, true); Out << "ConstantInt* " << constName - 
<< " = ConstantInt::get(getGlobalContext(), APInt(" + << " = ConstantInt::get(mod->getContext(), APInt(" << cast<IntegerType>(CI->getType())->getBitWidth() << ", StringRef(\"" << constValue << "\"), 10));"; } else if (isa<ConstantAggregateZero>(CV)) { @@ -769,7 +769,7 @@ namespace { CA->getType()->getElementType() == Type::getInt8Ty(CA->getContext())) { Out << "Constant* " << constName << - " = ConstantArray::get(getGlobalContext(), \""; + " = ConstantArray::get(mod->getContext(), \""; std::string tmp = CA->getAsString(); bool nullTerminate = false; if (tmp[tmp.length()-1] == 0) { @@ -995,7 +995,7 @@ namespace { void CppWriter::printVariableHead(const GlobalVariable *GV) { nl(Out) << "GlobalVariable* " << getCppName(GV); if (is_inline) { - Out << " = mod->getGlobalVariable(getGlobalContext(), "; + Out << " = mod->getGlobalVariable(mod->getContext(), "; printEscapedString(GV->getName()); Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)"; nl(Out) << "if (!" << getCppName(GV) << ") {"; @@ -1094,7 +1094,7 @@ namespace { case Instruction::Ret: { const ReturnInst* ret = cast<ReturnInst>(I); - Out << "ReturnInst::Create(getGlobalContext(), " + Out << "ReturnInst::Create(mod->getContext(), " << (ret->getReturnValue() ? 
opNames[0] + ", " : "") << bbname << ");"; break; } @@ -1171,7 +1171,7 @@ namespace { } case Instruction::Unreachable: { Out << "new UnreachableInst(" - << "getGlobalContext(), " + << "mod->getContext(), " << bbname << ");"; break; } @@ -1673,7 +1673,7 @@ namespace { BI != BE; ++BI) { std::string bbname(getCppName(BI)); Out << "BasicBlock* " << bbname << - " = BasicBlock::Create(getGlobalContext(), \""; + " = BasicBlock::Create(mod->getContext(), \""; if (BI->hasName()) printEscapedString(BI->getName()); Out << "\"," << getCppName(BI->getParent()) << ",0);"; @@ -2009,7 +2009,8 @@ char CppWriter::ID = 0; bool CPPTargetMachine::addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel) { + CodeGenOpt::Level OptLevel, + bool DisableVerify) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; PM.add(new CppWriter(o)); return false; diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 1f74f76..b7aae91 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -30,7 +30,8 @@ struct CPPTargetMachine : public TargetMachine { virtual bool addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel); + CodeGenOpt::Level OptLevel, + bool DisableVerify); virtual const TargetData *getTargetData() const { return 0; } }; diff --git a/lib/Target/MBlaze/AsmPrinter/CMakeLists.txt b/lib/Target/MBlaze/AsmPrinter/CMakeLists.txt new file mode 100644 index 0000000..cfb2fc8 --- /dev/null +++ b/lib/Target/MBlaze/AsmPrinter/CMakeLists.txt @@ -0,0 +1,9 @@ +include_directories(
+ ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/..
+ )
+
+add_llvm_library(LLVMMBlazeAsmPrinter
+ MBlazeAsmPrinter.cpp + )
+add_dependencies(LLVMMBlazeAsmPrinter MBlazeCodeGenTable_gen)
\ No newline at end of file diff --git a/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp new file mode 100644 index 0000000..6fe1026 --- /dev/null +++ b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp @@ -0,0 +1,302 @@ +//===-- MBlazeAsmPrinter.cpp - MBlaze LLVM assembly writer ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format MBlaze assembly language. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mblaze-asm-printer" + +#include "MBlaze.h" +#include "MBlazeSubtarget.h" +#include "MBlazeInstrInfo.h" +#include "MBlazeTargetMachine.h" +#include "MBlazeMachineFunction.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DwarfWriter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/MathExtras.h" +#include <cctype> + +using namespace llvm; 
+ +namespace { + class MBlazeAsmPrinter : public AsmPrinter { + const MBlazeSubtarget *Subtarget; + public: + explicit MBlazeAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM, + MCContext &Ctx, MCStreamer &Streamer, + const MCAsmInfo *T ) + : AsmPrinter(O, TM, Ctx, Streamer, T) { + Subtarget = &TM.getSubtarget<MBlazeSubtarget>(); + } + + virtual const char *getPassName() const { + return "MBlaze Assembly Printer"; + } + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + void printOperand(const MachineInstr *MI, int opNum); + void printUnsignedImm(const MachineInstr *MI, int opNum); + void printFSLImm(const MachineInstr *MI, int opNum); + void printMemOperand(const MachineInstr *MI, int opNum, + const char *Modifier = 0); + void printFCCOperand(const MachineInstr *MI, int opNum, + const char *Modifier = 0); + void printSavedRegsBitmask(); + void printHex32(unsigned int Value); + + const char *emitCurrentABIString(); + void emitFrameDirective(); + + void printInstruction(const MachineInstr *MI); // autogenerated. + void EmitInstruction(const MachineInstr *MI) { + printInstruction(MI); + O << '\n'; + } + virtual void EmitFunctionBodyStart(); + virtual void EmitFunctionBodyEnd(); + static const char *getRegisterName(unsigned RegNo); + + virtual void EmitFunctionEntryLabel(); + void EmitStartOfAsmFile(Module &M); + }; +} // end of anonymous namespace + +#include "MBlazeGenAsmWriter.inc" + +//===----------------------------------------------------------------------===// +// +// MBlaze Asm Directives +// +// -- Frame directive "frame Stackpointer, Stacksize, RARegister" +// Describe the stack frame. +// +// -- Mask directives "mask bitmask, offset" +// Tells the assembler which registers are saved and where. +// bitmask - contain a little endian bitset indicating which registers are +// saved on function prologue (e.g. with a 0x80000000 mask, the +// assembler knows the register 31 (RA) is saved at prologue. 
+// offset - the position before stack pointer subtraction indicating where +// the first saved register on prologue is located. (e.g. with a +// +// Consider the following function prologue: +// +// .frame R19,48,R15 +// .mask 0xc0000000,-8 +// addiu R1, R1, -48 +// sw R15, 40(R1) +// sw R19, 36(R1) +// +// With a 0xc0000000 mask, the assembler knows the register 15 (R15) and +// 19 (R19) are saved at prologue. As the save order on prologue is from +// left to right, R15 is saved first. A -8 offset means that after the +// stack pointer subtration, the first register in the mask (R15) will be +// saved at address 48-8=40. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Mask directives +//===----------------------------------------------------------------------===// + +// Create a bitmask with all callee saved registers for CPU or Floating Point +// registers. For CPU registers consider RA, GP and FP for saving if necessary. +void MBlazeAsmPrinter::printSavedRegsBitmask() { + const TargetRegisterInfo &RI = *TM.getRegisterInfo(); + const MBlazeFunctionInfo *MBlazeFI = MF->getInfo<MBlazeFunctionInfo>(); + + // CPU Saved Registers Bitmasks + unsigned int CPUBitmask = 0; + + // Set the CPU Bitmasks + const MachineFrameInfo *MFI = MF->getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned RegNum = MBlazeRegisterInfo::getRegisterNumbering(CSI[i].getReg()); + if (CSI[i].getRegClass() == MBlaze::CPURegsRegisterClass) + CPUBitmask |= (1 << RegNum); + } + + // Return Address and Frame registers must also be set in CPUBitmask. 
+ if (RI.hasFP(*MF)) + CPUBitmask |= (1 << MBlazeRegisterInfo:: + getRegisterNumbering(RI.getFrameRegister(*MF))); + + if (MFI->hasCalls()) + CPUBitmask |= (1 << MBlazeRegisterInfo:: + getRegisterNumbering(RI.getRARegister())); + + // Print CPUBitmask + O << "\t.mask \t"; printHex32(CPUBitmask); O << ',' + << MBlazeFI->getCPUTopSavedRegOff() << '\n'; +} + +// Print a 32 bit hex number with all numbers. +void MBlazeAsmPrinter::printHex32(unsigned int Value) { + O << "0x"; + for (int i = 7; i >= 0; i--) + O << utohexstr( (Value & (0xF << (i*4))) >> (i*4) ); +} + +//===----------------------------------------------------------------------===// +// Frame and Set directives +//===----------------------------------------------------------------------===// + +/// Frame Directive +void MBlazeAsmPrinter::emitFrameDirective() { + const TargetRegisterInfo &RI = *TM.getRegisterInfo(); + + unsigned stackReg = RI.getFrameRegister(*MF); + unsigned returnReg = RI.getRARegister(); + unsigned stackSize = MF->getFrameInfo()->getStackSize(); + + + O << "\t.frame\t" << getRegisterName(stackReg) + << ',' << stackSize << ',' + << getRegisterName(returnReg) + << '\n'; +} + +void MBlazeAsmPrinter::EmitFunctionEntryLabel() { + O << "\t.ent\t" << *CurrentFnSym << '\n'; + OutStreamer.EmitLabel(CurrentFnSym); +} + +/// EmitFunctionBodyStart - Targets can override this to emit stuff before +/// the first basic block in the function. +void MBlazeAsmPrinter::EmitFunctionBodyStart() { + emitFrameDirective(); + printSavedRegsBitmask(); +} + +/// EmitFunctionBodyEnd - Targets can override this to emit stuff after +/// the last basic block in the function. +void MBlazeAsmPrinter::EmitFunctionBodyEnd() { + O << "\t.end\t" << *CurrentFnSym << '\n'; +} + +// Print out an operand for an inline asm expression. +bool MBlazeAsmPrinter:: +PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant,const char *ExtraCode){ + // Does this asm operand have a single letter operand modifier? 
+ if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + + printOperand(MI, OpNo); + return false; +} + +void MBlazeAsmPrinter::printOperand(const MachineInstr *MI, int opNum) { + const MachineOperand &MO = MI->getOperand(opNum); + + switch (MO.getType()) { + case MachineOperand::MO_Register: + O << getRegisterName(MO.getReg()); + break; + + case MachineOperand::MO_Immediate: + O << (int)MO.getImm(); + break; + + case MachineOperand::MO_FPImmediate: { + const ConstantFP* fp = MO.getFPImm(); + printHex32(fp->getValueAPF().bitcastToAPInt().getZExtValue()); + O << ";\t# immediate = " << *fp; + break; + } + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(OutContext); + return; + + case MachineOperand::MO_GlobalAddress: + O << *GetGlobalValueSymbol(MO.getGlobal()); + break; + + case MachineOperand::MO_ExternalSymbol: + O << *GetExternalSymbolSymbol(MO.getSymbolName()); + break; + + case MachineOperand::MO_JumpTableIndex: + O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getIndex(); + break; + + case MachineOperand::MO_ConstantPoolIndex: + O << MAI->getPrivateGlobalPrefix() << "CPI" + << getFunctionNumber() << "_" << MO.getIndex(); + if (MO.getOffset()) + O << "+" << MO.getOffset(); + break; + + default: + llvm_unreachable("<unknown operand type>"); + } +} + +void MBlazeAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.getType() == MachineOperand::MO_Immediate) + O << (unsigned int)MO.getImm(); + else + printOperand(MI, opNum); +} + +void MBlazeAsmPrinter::printFSLImm(const MachineInstr *MI, int opNum) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.getType() == MachineOperand::MO_Immediate) + O << "rfsl" << (unsigned int)MO.getImm(); + else + printOperand(MI, opNum); +} + +void MBlazeAsmPrinter:: +printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier) { + printOperand(MI, opNum+1); + O 
<< ", "; + printOperand(MI, opNum); +} + +void MBlazeAsmPrinter:: +printFCCOperand(const MachineInstr *MI, int opNum, const char *Modifier) { + const MachineOperand& MO = MI->getOperand(opNum); + O << MBlaze::MBlazeFCCToString((MBlaze::CondCode)MO.getImm()); +} + +void MBlazeAsmPrinter::EmitStartOfAsmFile(Module &M) { +} + +// Force static initialization. +extern "C" void LLVMInitializeMBlazeAsmPrinter() { + RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget); +} diff --git a/lib/Target/MBlaze/AsmPrinter/Makefile b/lib/Target/MBlaze/AsmPrinter/Makefile new file mode 100644 index 0000000..c8e4d8f --- /dev/null +++ b/lib/Target/MBlaze/AsmPrinter/Makefile @@ -0,0 +1,17 @@ +##===- lib/Target/MBlaze/AsmPrinter/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMMBlazeAsmPrinter + +# Hack: we need to include 'main' MBlaze target directory to grab +# private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt new file mode 100644 index 0000000..c93e3df --- /dev/null +++ b/lib/Target/MBlaze/CMakeLists.txt @@ -0,0 +1,27 @@ +set(LLVM_TARGET_DEFINITIONS MBlaze.td) + +tablegen(MBlazeGenRegisterInfo.h.inc -gen-register-desc-header) +tablegen(MBlazeGenRegisterNames.inc -gen-register-enums) +tablegen(MBlazeGenRegisterInfo.inc -gen-register-desc) +tablegen(MBlazeGenInstrNames.inc -gen-instr-enums) +tablegen(MBlazeGenInstrInfo.inc -gen-instr-desc) +tablegen(MBlazeGenAsmWriter.inc -gen-asm-writer) +tablegen(MBlazeGenDAGISel.inc -gen-dag-isel) +tablegen(MBlazeGenCallingConv.inc -gen-callingconv) +tablegen(MBlazeGenSubtarget.inc -gen-subtarget) +tablegen(MBlazeGenIntrinsics.inc -gen-tgt-intrinsic) + +add_llvm_target(MBlazeCodeGen + MBlazeDelaySlotFiller.cpp + MBlazeInstrInfo.cpp + MBlazeISelDAGToDAG.cpp + MBlazeISelLowering.cpp + MBlazeMCAsmInfo.cpp + MBlazeRegisterInfo.cpp + MBlazeSubtarget.cpp + MBlazeTargetMachine.cpp + MBlazeTargetObjectFile.cpp + MBlazeIntrinsicInfo.cpp + ) + +target_link_libraries (LLVMMBlazeCodeGen LLVMSelectionDAG) diff --git a/lib/Target/MBlaze/MBlaze.h b/lib/Target/MBlaze/MBlaze.h new file mode 100644 index 0000000..f9d828b --- /dev/null +++ b/lib/Target/MBlaze/MBlaze.h @@ -0,0 +1,39 @@ +//===-- MBlaze.h - Top-level interface for MBlaze ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM MBlaze back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_MBLAZE_H +#define TARGET_MBLAZE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class MBlazeTargetMachine; + class FunctionPass; + class MachineCodeEmitter; + class formatted_raw_ostream; + + FunctionPass *createMBlazeISelDag(MBlazeTargetMachine &TM); + FunctionPass *createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &TM); + + extern Target TheMBlazeTarget; +} // end namespace llvm; + +// Defines symbolic names for MBlaze registers. This defines a mapping from +// register name to register number. +#include "MBlazeGenRegisterNames.inc" + +// Defines symbolic names for the MBlaze instructions. +#include "MBlazeGenInstrNames.inc" + +#endif diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td new file mode 100644 index 0000000..1679752 --- /dev/null +++ b/lib/Target/MBlaze/MBlaze.td @@ -0,0 +1,85 @@ +//===- MBlaze.td - Describe the MBlaze Target Machine -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This is the top level entry point for the MBlaze target. 
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Register File, Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "MBlazeRegisterInfo.td" +include "MBlazeSchedule.td" +include "MBlazeIntrinsics.td" +include "MBlazeInstrInfo.td" +include "MBlazeCallingConv.td" + +def MBlazeInstrInfo : InstrInfo { + let TSFlagsFields = []; + let TSFlagsShifts = []; +} + + +//===----------------------------------------------------------------------===// +// Microblaze Subtarget features // +//===----------------------------------------------------------------------===// + +def FeaturePipe3 : SubtargetFeature<"pipe3", "HasPipe3", "true", + "Implements 3-stage pipeline.">; +def FeatureBarrel : SubtargetFeature<"barrel", "HasBarrel", "true", + "Implements barrel shifter.">; +def FeatureDiv : SubtargetFeature<"div", "HasDiv", "true", + "Implements hardware divider.">; +def FeatureMul : SubtargetFeature<"mul", "HasMul", "true", + "Implements hardware multiplier.">; +def FeatureFSL : SubtargetFeature<"fsl", "HasFSL", "true", + "Implements FSL instructions.">; +def FeatureEFSL : SubtargetFeature<"efsl", "HasEFSL", "true", + "Implements extended FSL instructions.">; +def FeatureMSRSet : SubtargetFeature<"msrset", "HasMSRSet", "true", + "Implements MSR register set and clear.">; +def FeatureException : SubtargetFeature<"exception", "HasException", "true", + "Implements hardware exception support.">; +def FeaturePatCmp : SubtargetFeature<"patcmp", "HasPatCmp", "true", + "Implements pattern compare instruction.">; +def FeatureFPU : SubtargetFeature<"fpu", "HasFPU", 
"true", + "Implements floating point unit.">; +def FeatureESR : SubtargetFeature<"esr", "HasESR", "true", + "Implements ESR and EAR registers">; +def FeaturePVR : SubtargetFeature<"pvr", "HasPVR", "true", + "Implements processor version register.">; +def FeatureMul64 : SubtargetFeature<"mul64", "HasMul64", "true", + "Implements multiplier with 64-bit result">; +def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", + "Implements sqrt and floating point convert.">; +def FeatureMMU : SubtargetFeature<"mmu", "HasMMU", "true", + "Implements memory management unit.">; + +//===----------------------------------------------------------------------===// +// MBlaze processors supported. +//===----------------------------------------------------------------------===// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, MBlazeGenericItineraries, Features>; + + +def : Proc<"v400", []>; +def : Proc<"v500", []>; +def : Proc<"v600", []>; +def : Proc<"v700", []>; +def : Proc<"v710", []>; + +def MBlaze : Target { + let InstructionSet = MBlazeInstrInfo; +} diff --git a/lib/Target/MBlaze/MBlazeCallingConv.td b/lib/Target/MBlaze/MBlazeCallingConv.td new file mode 100644 index 0000000..dfc87f5 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeCallingConv.td @@ -0,0 +1,41 @@ +//===- MBlazeCallingConv.td - Calling Conventions for MBlaze ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for MBlaze architecture. +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. 
+class CCIfSubtarget<string F, CCAction A>: + CCIf<!strconcat("State.getTarget().getSubtarget<MBlazeSubtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// MBlaze ABI Calling Convention +//===----------------------------------------------------------------------===// + +def CC_MBlaze : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[R5, R6, R7, R8, R9, R10]>>, + + // Single fp arguments are passed in floating point registers + CCIfType<[f32], CCAssignToReg<[F5, F6, F7, F8, F9, F10]>>, + + // 32-bit values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +def RetCC_MBlaze : CallingConv<[ + // i32 are returned in registers R3, R4 + CCIfType<[i32], CCAssignToReg<[R3, R4]>>, + + // f32 are returned in registers F3, F4 + CCIfType<[f32], CCAssignToReg<[F3, F4]>> +]>; diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp new file mode 100644 index 0000000..42fea25 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -0,0 +1,75 @@ +//===-- DelaySlotFiller.cpp - MBlaze delay slot filler --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simple pass to fills delay slots with NOPs. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "delay-slot-filler" + +#include "MBlaze.h" +#include "MBlazeTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +STATISTIC(FilledSlots, "Number of delay slots filled"); + +namespace { + struct Filler : public MachineFunctionPass { + + TargetMachine &TM; + const TargetInstrInfo *TII; + + static char ID; + Filler(TargetMachine &tm) + : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { } + + virtual const char *getPassName() const { + return "MBlaze Delay Slot Filler"; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + bool runOnMachineFunction(MachineFunction &F) { + bool Changed = false; + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) + Changed |= runOnMachineBasicBlock(*FI); + return Changed; + } + + }; + char Filler::ID = 0; +} // end of anonymous namespace + +/// runOnMachineBasicBlock - Fill in delay slots for the given basic block. +/// Currently, we fill delay slots with NOPs. We assume there is only one +/// delay slot per delayed instruction. 
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + if (I->getDesc().hasDelaySlot()) { + MachineBasicBlock::iterator J = I; + ++J; + BuildMI(MBB, J, I->getDebugLoc(), TII->get(MBlaze::NOP)); + ++FilledSlots; + Changed = true; + } + return Changed; +} + +/// createMBlazeDelaySlotFillerPass - Returns a pass that fills in delay +/// slots in MBlaze MachineFunctions +FunctionPass *llvm::createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &tm) { + return new Filler(tm); +} + diff --git a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp new file mode 100644 index 0000000..a0ebea0 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp @@ -0,0 +1,339 @@ +//===-- MBlazeISelDAGToDAG.cpp - A dag to dag inst selector for MBlaze ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the MBlaze target. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mblaze-isel" +#include "MBlaze.h" +#include "MBlazeISelLowering.h" +#include "MBlazeMachineFunction.h" +#include "MBlazeRegisterInfo.h" +#include "MBlazeSubtarget.h" +#include "MBlazeTargetMachine.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/CFG.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlazeDAGToDAGISel - MBlaze specific code to select MBlaze machine +// instructions for SelectionDAG operations. +//===----------------------------------------------------------------------===// +namespace { + +class MBlazeDAGToDAGISel : public SelectionDAGISel { + + /// TM - Keep a reference to MBlazeTargetMachine. + MBlazeTargetMachine &TM; + + /// Subtarget - Keep a pointer to the MBlazeSubtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const MBlazeSubtarget &Subtarget; + +public: + explicit MBlazeDAGToDAGISel(MBlazeTargetMachine &tm) : + SelectionDAGISel(tm), + TM(tm), Subtarget(tm.getSubtarget<MBlazeSubtarget>()) {} + + // Pass Name + virtual const char *getPassName() const { + return "MBlaze DAG->DAG Pattern Instruction Selection"; + } +private: + // Include the pieces autogenerated from the target description. + #include "MBlazeGenDAGISel.inc" + + /// getTargetMachine - Return a reference to the TargetMachine, casted + /// to the target-specific type. + const MBlazeTargetMachine &getTargetMachine() { + return static_cast<const MBlazeTargetMachine &>(TM); + } + + /// getInstrInfo - Return a reference to the TargetInstrInfo, casted + /// to the target-specific type. + const MBlazeInstrInfo *getInstrInfo() { + return getTargetMachine().getInstrInfo(); + } + + SDNode *getGlobalBaseReg(); + SDNode *Select(SDNode *N); + + // Complex Pattern. + bool SelectAddr(SDNode *Op, SDValue N, + SDValue &Base, SDValue &Offset); + + // Address Selection + bool SelectAddrRegReg(SDNode *Op, SDValue N, SDValue &Base, SDValue &Index); + bool SelectAddrRegImm(SDNode *Op, SDValue N, SDValue &Disp, SDValue &Base); + + // getI32Imm - Return a target constant with the specified value, of type i32. + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } +}; + +} + +/// isIntS32Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 32-bit value. If so, this returns true and the +/// immediate. 
+static bool isIntS32Immediate(SDNode *N, int32_t &Imm) { + unsigned Opc = N->getOpcode(); + if (Opc != ISD::Constant) + return false; + + Imm = (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +} + +static bool isIntS32Immediate(SDValue Op, int32_t &Imm) { + return isIntS32Immediate(Op.getNode(), Imm); +} + + +/// SelectAddressRegReg - Given the specified addressed, check to see if it +/// can be represented as an indexed [r+r] operation. Returns false if it +/// can be more efficiently represented with [r+imm]. +bool MBlazeDAGToDAGISel:: +SelectAddrRegReg(SDNode *Op, SDValue N, SDValue &Base, SDValue &Index) { + if (N.getOpcode() == ISD::FrameIndex) return false; + if (N.getOpcode() == ISD::TargetExternalSymbol || + N.getOpcode() == ISD::TargetGlobalAddress) + return false; // direct calls. + + if (N.getOperand(0).getOpcode() == ISD::TargetJumpTable || + N.getOperand(1).getOpcode() == ISD::TargetJumpTable) + return false; // jump tables. + + int32_t imm = 0; + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { + if (isIntS32Immediate(N.getOperand(1), imm)) + return false; // r+i + + Base = N.getOperand(1); + Index = N.getOperand(0); + return true; + } + + return false; +} + +/// Returns true if the address N can be represented by a base register plus +/// a signed 32-bit displacement [r+imm], and if it is not better +/// represented as reg+reg. +bool MBlazeDAGToDAGISel:: +SelectAddrRegImm(SDNode *Op, SDValue N, SDValue &Disp, SDValue &Base) { + // If this can be more profitably realized as r+r, fail. 
+ if (SelectAddrRegReg(Op, N, Disp, Base)) + return false; + + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { + int32_t imm = 0; + if (isIntS32Immediate(N.getOperand(1), imm)) { + Disp = CurDAG->getTargetConstant(imm, MVT::i32); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } + DEBUG( errs() << "WESLEY: Using Operand Immediate\n" ); + return true; // [r+i] + } + } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + // Loading from a constant address. + uint32_t Imm = CN->getZExtValue(); + Disp = CurDAG->getTargetConstant(Imm, CN->getValueType(0)); + Base = CurDAG->getRegister(MBlaze::R0, CN->getValueType(0)); + DEBUG( errs() << "WESLEY: Using Constant Node\n" ); + return true; + } + + Disp = CurDAG->getTargetConstant(0, TM.getTargetLowering()->getPointerTy()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) + Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N; + return true; // [r+0] +} + +/// getGlobalBaseReg - Output the instructions required to put the +/// GOT address into a register. +SDNode *MBlazeDAGToDAGISel::getGlobalBaseReg() { + unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); + return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); +} + +/// ComplexPattern used on MBlazeInstrInfo +/// Used on MBlaze Load/Store instructions +bool MBlazeDAGToDAGISel:: +SelectAddr(SDNode *Op, SDValue Addr, SDValue &Offset, SDValue &Base) { + // if Address is FI, get the TargetFrameIndex. 
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // on PIC code Load GA + if (TM.getRelocationModel() == Reloc::PIC_) { + if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || + (Addr.getOpcode() == ISD::TargetConstantPool) || + (Addr.getOpcode() == ISD::TargetJumpTable)){ + Base = CurDAG->getRegister(MBlaze::R15, MVT::i32); + Offset = Addr; + return true; + } + } else { + if ((Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress)) + return false; + } + + // Operand is a result from an ADD. + if (Addr.getOpcode() == ISD::ADD) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + if (Predicate_immSExt16(CN)) { + + // If the first operand is a FI, get the TargetFI Node + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> + (Addr.getOperand(0))) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + } else { + Base = Addr.getOperand(0); + } + + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); + return true; + } + } + } + + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +/// Select instructions not customized! Used for +/// expanded, promoted and normal instructions +SDNode* MBlazeDAGToDAGISel::Select(SDNode *Node) { + unsigned Opcode = Node->getOpcode(); + DebugLoc dl = Node->getDebugLoc(); + + // Dump information about the Node being selected + DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n"); + + // If we have a custom node, we already have selected! + if (Node->isMachineOpcode()) { + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); + return NULL; + } + + /// + // Instruction Selection not handled by the auto-generated + // tablegen selection should be handled here. + /// + switch(Opcode) { + default: break; + + // Get target GOT address. 
+ case ISD::GLOBAL_OFFSET_TABLE: + return getGlobalBaseReg(); + + case ISD::FrameIndex: { + SDValue imm = CurDAG->getTargetConstant(0, MVT::i32); + int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex(); + EVT VT = Node->getValueType(0); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); + unsigned Opc = MBlaze::ADDI; + if (Node->hasOneUse()) + return CurDAG->SelectNodeTo(Node, Opc, VT, TFI, imm); + return CurDAG->getMachineNode(Opc, dl, VT, TFI, imm); + } + + + /// Handle direct and indirect calls when using PIC. On PIC, when + /// GOT is smaller than about 64k (small code) the GA target is + /// loaded with only one instruction. Otherwise GA's target must + /// be loaded with 3 instructions. + case MBlazeISD::JmpLink: { + if (TM.getRelocationModel() == Reloc::PIC_) { + SDValue Chain = Node->getOperand(0); + SDValue Callee = Node->getOperand(1); + SDValue R20Reg = CurDAG->getRegister(MBlaze::R20, MVT::i32); + SDValue InFlag(0, 0); + + if ( (isa<GlobalAddressSDNode>(Callee)) || + (isa<ExternalSymbolSDNode>(Callee)) ) + { + /// Direct call for global addresses and external symbols + SDValue GPReg = CurDAG->getRegister(MBlaze::R15, MVT::i32); + + // Use load to get GOT target + SDValue Ops[] = { Callee, GPReg, Chain }; + SDValue Load = SDValue(CurDAG->getMachineNode(MBlaze::LW, dl, + MVT::i32, MVT::Other, Ops, 3), 0); + Chain = Load.getValue(1); + + // Call target must be on T9 + Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Load, InFlag); + } else + /// Indirect call + Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Callee, InFlag); + + // Emit Jump and Link Register + SDNode *ResNode = CurDAG->getMachineNode(MBlaze::BRLID, dl, MVT::Other, + MVT::Flag, R20Reg, Chain); + Chain = SDValue(ResNode, 0); + InFlag = SDValue(ResNode, 1); + ReplaceUses(SDValue(Node, 0), Chain); + ReplaceUses(SDValue(Node, 1), InFlag); + return ResNode; + } + } + } + + // Select the default instruction + SDNode *ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == NULL 
|| ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + return ResNode; +} + +/// createMBlazeISelDag - This pass converts a legalized DAG into a +/// MBlaze-specific DAG, ready for instruction scheduling. +FunctionPass *llvm::createMBlazeISelDag(MBlazeTargetMachine &TM) { + return new MBlazeDAGToDAGISel(TM); +} diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp new file mode 100644 index 0000000..7790248 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -0,0 +1,881 @@ +//===-- MBlazeISelLowering.cpp - MBlaze DAG Lowering Implementation -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that MBlaze uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mblaze-lower" +#include "MBlazeISelLowering.h" +#include "MBlazeMachineFunction.h" +#include "MBlazeTargetMachine.h" +#include "MBlazeTargetObjectFile.h" +#include "MBlazeSubtarget.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Intrinsics.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +const char *MBlazeTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + case MBlazeISD::JmpLink : return "MBlazeISD::JmpLink"; + case MBlazeISD::GPRel : return "MBlazeISD::GPRel"; + case MBlazeISD::Wrap : return "MBlazeISD::Wrap"; + case MBlazeISD::ICmp : return "MBlazeISD::ICmp"; + case MBlazeISD::Ret : return "MBlazeISD::Ret"; + case MBlazeISD::Select_CC : return "MBlazeISD::Select_CC"; + default : return NULL; + } +} + +MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) + : TargetLowering(TM, new MBlazeTargetObjectFile()) { + Subtarget = &TM.getSubtarget<MBlazeSubtarget>(); + + // MBlaze does not have i1 type, so use i32 for + // setcc operations results (slt, sgt, ...). 
+ setBooleanContents(ZeroOrOneBooleanContent); + + // Set up the register classes + addRegisterClass(MVT::i32, MBlaze::CPURegsRegisterClass); + if (Subtarget->hasFPU()) { + addRegisterClass(MVT::f32, MBlaze::FGR32RegisterClass); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + } + + // Floating point operations which are not supported + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::f32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FPOWI, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FLOG, MVT::f32, Expand); + setOperationAction(ISD::FLOG2, MVT::f32, Expand); + setOperationAction(ISD::FLOG10, MVT::f32, Expand); + setOperationAction(ISD::FEXP, MVT::f32, Expand); + + // Load extended operations for i1 types must be promoted + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // MBlaze has no REM or DIVREM operations. 
+ setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + + // If the processor doesn't support multiply then expand it + if (!Subtarget->hasMul()) { + setOperationAction(ISD::MUL, MVT::i32, Expand); + } + + // If the processor doesn't support 64-bit multiply then expand + if (!Subtarget->hasMul() || !Subtarget->hasMul64()) { + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + } + + // If the processor doesn't support division then expand + if (!Subtarget->hasDiv()) { + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + } + + // Expand unsupported conversions + setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand); + setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand); + + // Expand SELECT_CC + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + // MBlaze doesn't have MUL_LOHI + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + + // Used by legalize types to correctly generate the setcc result. + // Without this, every float setcc comes with a AND/OR with the result, + // we don't want this, since the fpcmp result goes to a flag register, + // which is used implicitly by brcond and select operations. 
+ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); + AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32); + AddPromotedToType(ISD::SELECT_CC, MVT::i1, MVT::i32); + + // MBlaze Custom Operations + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + + // Operations not directly supported by MBlaze. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + + // We don't have line number support yet. + setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + + // Use the default for now + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + + // MBlaze doesn't have extending float->double load/store + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + setStackPointerRegisterToSaveRestore(MBlaze::R1); + computeRegisterProperties(); +} + +MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i32; +} + +/// getFunctionAlignment - Return the Log2 alignment of this function. 
+unsigned MBlazeTargetLowering::getFunctionAlignment(const Function *) const { + return 2; +} + +SDValue MBlazeTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { + switch (Op.getOpcode()) + { + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + } + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Lower helper functions +//===----------------------------------------------------------------------===// +MachineBasicBlock* MBlazeTargetLowering:: +EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB, + DenseMap<MachineBasicBlock*, + MachineBasicBlock*> *EM) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + + switch (MI->getOpcode()) { + default: assert(false && "Unexpected instr type to insert"); + case MBlaze::ShiftRL: + case MBlaze::ShiftRA: + case MBlaze::ShiftL: { + // To "insert" a shift left instruction, we actually have to insert a + // simple loop. The incoming instruction knows the destination vreg to + // set, the source vreg to operate over and the shift amount. 
+ const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // start: + // andi samt, samt, 31 + // beqid samt, finish + // add dst, src, r0 + // loop: + // addik samt, samt, -1 + // sra dst, dst + // bneid samt, loop + // nop + // finish: + MachineFunction *F = BB->getParent(); + MachineRegisterInfo &R = F->getRegInfo(); + MachineBasicBlock *loop = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *finish = F->CreateMachineBasicBlock(LLVM_BB); + + unsigned IAMT = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + BuildMI(BB, dl, TII->get(MBlaze::ANDI), IAMT) + .addReg(MI->getOperand(2).getReg()) + .addImm(31); + + unsigned IVAL = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + BuildMI(BB, dl, TII->get(MBlaze::ADDI), IVAL) + .addReg(MI->getOperand(1).getReg()) + .addImm(0); + + BuildMI(BB, dl, TII->get(MBlaze::BEQID)) + .addReg(IAMT) + .addMBB(finish); + + F->insert(It, loop); + F->insert(It, finish); + + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. + // Also inform sdisel of the edge changes. + for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), + e = BB->succ_end(); i != e; ++i) { + EM->insert(std::make_pair(*i, finish)); + finish->addSuccessor(*i); + } + + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. 
+ while(!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(loop); + BB->addSuccessor(finish); + + // Next, add the finish block as a successor of the loop block + loop->addSuccessor(finish); + loop->addSuccessor(loop); + + unsigned DST = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + unsigned NDST = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + BuildMI(loop, dl, TII->get(MBlaze::PHI), DST) + .addReg(IVAL).addMBB(BB) + .addReg(NDST).addMBB(loop); + + unsigned SAMT = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + unsigned NAMT = R.createVirtualRegister(MBlaze::CPURegsRegisterClass); + BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT) + .addReg(IAMT).addMBB(BB) + .addReg(NAMT).addMBB(loop); + + if (MI->getOpcode() == MBlaze::ShiftL) + BuildMI(loop, dl, TII->get(MBlaze::ADD), NDST).addReg(DST).addReg(DST); + else if (MI->getOpcode() == MBlaze::ShiftRA) + BuildMI(loop, dl, TII->get(MBlaze::SRA), NDST).addReg(DST); + else if (MI->getOpcode() == MBlaze::ShiftRL) + BuildMI(loop, dl, TII->get(MBlaze::SRL), NDST).addReg(DST); + else + llvm_unreachable( "Cannot lower unknown shift instruction" ); + + BuildMI(loop, dl, TII->get(MBlaze::ADDI), NAMT) + .addReg(SAMT) + .addImm(-1); + + BuildMI(loop, dl, TII->get(MBlaze::BNEID)) + .addReg(NAMT) + .addMBB(loop); + + BuildMI(finish, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg()) + .addReg(IVAL).addMBB(BB) + .addReg(NDST).addMBB(loop); + + // The pseudo instruction is no longer needed so remove it + F->DeleteMachineInstr(MI); + return finish; + } + + case MBlaze::Select_FCC: + case MBlaze::Select_CC: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. 
+ const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineFunction *F = BB->getParent(); + MachineBasicBlock *flsBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *dneBB = F->CreateMachineBasicBlock(LLVM_BB); + + unsigned Opc; + switch (MI->getOperand(4).getImm()) { + default: llvm_unreachable( "Unknown branch condition" ); + case MBlazeCC::EQ: Opc = MBlaze::BNEID; break; + case MBlazeCC::NE: Opc = MBlaze::BEQID; break; + case MBlazeCC::GT: Opc = MBlaze::BLEID; break; + case MBlazeCC::LT: Opc = MBlaze::BGEID; break; + case MBlazeCC::GE: Opc = MBlaze::BLTID; break; + case MBlazeCC::LE: Opc = MBlaze::BGTID; break; + } + + BuildMI(BB, dl, TII->get(Opc)) + .addReg(MI->getOperand(3).getReg()) + .addMBB(dneBB); + + F->insert(It, flsBB); + F->insert(It, dneBB); + + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. + // Also inform sdisel of the edge changes. + for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), + e = BB->succ_end(); i != e; ++i) { + EM->insert(std::make_pair(*i, dneBB)); + dneBB->addSuccessor(*i); + } + + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. + while(!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(flsBB); + BB->addSuccessor(dneBB); + flsBB->addSuccessor(dneBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... 
+ //BuildMI(dneBB, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg()) + // .addReg(MI->getOperand(1).getReg()).addMBB(flsBB) + // .addReg(MI->getOperand(2).getReg()).addMBB(BB); + + BuildMI(dneBB, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(2).getReg()).addMBB(flsBB) + .addReg(MI->getOperand(1).getReg()).addMBB(BB); + + F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + return dneBB; + } + } +} + +//===----------------------------------------------------------------------===// +// Misc Lower Operation implementation +//===----------------------------------------------------------------------===// +// + +SDValue MBlazeTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + DebugLoc dl = Op.getDebugLoc(); + unsigned Opc; + + SDValue CompareFlag; + if (LHS.getValueType() == MVT::i32) { + Opc = MBlazeISD::Select_CC; + CompareFlag = DAG.getNode(MBlazeISD::ICmp, dl, MVT::i32, LHS, RHS) + .getValue(1); + } else { + llvm_unreachable( "Cannot lower select_cc with unknown type" ); + } + + return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal, + CompareFlag); +} + +SDValue MBlazeTargetLowering:: +LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { + // FIXME there isn't actually debug info here + DebugLoc dl = Op.getDebugLoc(); + GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + + return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, GA); +} + +SDValue MBlazeTargetLowering:: +LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { + llvm_unreachable("TLS not implemented for MicroBlaze."); + return SDValue(); // Not reached +} + +SDValue MBlazeTargetLowering:: +LowerJumpTable(SDValue Op, SelectionDAG &DAG) { + SDValue ResNode; + SDValue HiPart; + // FIXME there isn't actually debug info 
here + DebugLoc dl = Op.getDebugLoc(); + bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; + unsigned char OpFlag = IsPIC ? MBlazeII::MO_GOT : MBlazeII::MO_ABS_HILO; + + EVT PtrVT = Op.getValueType(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); + return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, JTI); + //return JTI; +} + +SDValue MBlazeTargetLowering:: +LowerConstantPool(SDValue Op, SelectionDAG &DAG) { + SDValue ResNode; + EVT PtrVT = Op.getValueType(); + ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op); + Constant *C = N->getConstVal(); + SDValue Zero = DAG.getConstant(0, PtrVT); + // FIXME there isn't actually debug info here + DebugLoc dl = Op.getDebugLoc(); + + SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MBlazeII::MO_ABS_HILO); + return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, CP); +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "MBlazeGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// Call Calling Convention Implementation +//===----------------------------------------------------------------------===// + +/// LowerCall - functions arguments are copied from virtual regs to +/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. +/// TODO: isVarArg, isTailCall. 
+SDValue MBlazeTargetLowering:: +LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, + bool isVarArg, bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + // First/LastArgStackLoc contains the first/last + // "at stack" argument location. + int LastArgStackLoc = 0; + unsigned FirstStackArgLoc = 4; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = Outs[i].Val; + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); + break; + } + + // Arguments that can be passed on register must be kept at + // RegsToPass vector + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + // Register can't get to this point... + assert(VA.isMemLoc()); + + // Create the frame index object for this incoming parameter + LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset()); + int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, + LastArgStackLoc, true, false); + + SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy()); + + // emit ISD::STORE which stores the + // parameter value to a stack Location + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, + false, false, 0)); + } + } + + // Transform all store nodes into one single node because all store + // nodes are independent of each other. + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token + // chain and flag operands which copy the outgoing args into registers. + // The InFlag is necessary since all emitted instructions must be + // stuck together. 
+ SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + unsigned char OpFlag = MBlazeII::MO_NO_FLAG; + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), + getPointerTy(), 0, OpFlag); + else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), + getPointerTy(), OpFlag); + + // MBlazeJmpLink = #chain, #target_address, #opt_in_flags... + // = Chain, Callee, Reg#1, Reg#2, ... + // + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + } + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + Chain = DAG.getNode(MBlazeISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. 
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, + Ins, dl, DAG, InVals); +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue MBlazeTargetLowering:: +LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, + bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(), + RVLocs[i].getValVT(), InFlag).getValue(1); + InFlag = Chain.getValue(2); + InVals.push_back(Chain.getValue(0)); + } + + return Chain; +} + +//===----------------------------------------------------------------------===// +// Formal Arguments Calling Convention Implementation +//===----------------------------------------------------------------------===// + +/// LowerFormalArguments - transform physical registers into +/// virtual registers and generate load operations for +/// arguments places on the stack. +/// TODO: isVarArg +SDValue MBlazeTargetLowering:: +LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); + + unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF); + + // Assign locations to all of the incoming arguments. 
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze); + SDValue StackPtr; + + unsigned FirstStackArgLoc = 4; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored on registers + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + TargetRegisterClass *RC = 0; + + if (RegVT == MVT::i32) + RC = MBlaze::CPURegsRegisterClass; + else if (RegVT == MVT::f32) + RC = MBlaze::FGR32RegisterClass; + else + llvm_unreachable("RegVT not supported by LowerFormalArguments"); + + // Transform the arguments stored on + // physical registers into virtual ones + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this is an 8 or 16-bit value, it has been passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + if (VA.getLocInfo() != CCValAssign::Full) { + unsigned Opcode = 0; + if (VA.getLocInfo() == CCValAssign::SExt) + Opcode = ISD::AssertSext; + else if (VA.getLocInfo() == CCValAssign::ZExt) + Opcode = ISD::AssertZext; + if (Opcode) + ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + + InVals.push_back(ArgValue); + + // To meet ABI, when VARARGS are passed on registers, the registers + // must have their values written to the caller stack frame. + if (isVarArg) { + if (StackPtr.getNode() == 0) + StackPtr = DAG.getRegister(StackReg, getPointerTy()); + + // The stack pointer offset is relative to the caller stack frame. + // Since the real stack size is unknown here, a negative SPOffset + // is used so there's a way to adjust these offsets when the stack + // size get known (on EliminateFrameIndex). 
A dummy SPOffset is + used instead of a direct negative address (which is recorded to + be used on emitPrologue) to avoid mis-calc of the first stack + offset on PEI::calculateFrameObjectOffsets. + // Arguments are always 32-bit. + int FI = MFI->CreateFixedObject(4, 0, true, false); + MBlazeFI->recordStoreVarArgsFI(FI, -(FirstStackArgLoc+(i*4))); + SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + + // emit ISD::STORE which stores the + // parameter value to a stack Location + InVals.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, NULL, 0, + false, false, 0)); + } + + } else { // VA.isRegLoc() + + // sanity check + assert(VA.isMemLoc()); + + // The stack pointer offset is relative to the caller stack frame. + // Since the real stack size is unknown here, a negative SPOffset + // is used so there's a way to adjust these offsets when the stack + // size get known (on EliminateFrameIndex). A dummy SPOffset is + // used instead of a direct negative address (which is recorded to + // be used on emitPrologue) to avoid mis-calc of the first stack + // offset on PEI::calculateFrameObjectOffsets. + // Arguments are always 32-bit. 
+ unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; + int FI = MFI->CreateFixedObject(ArgSize, 0, true, false); + MBlazeFI->recordLoadArgsFI(FI, -(ArgSize+ + (FirstStackArgLoc + VA.getLocMemOffset()))); + + // Create load nodes to retrieve arguments from the stack + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0, + false, false, 0)); + } + } + + return Chain; +} + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention Implementation +//===----------------------------------------------------------------------===// + +SDValue MBlazeTargetLowering:: +LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG) { + // CCValAssign - represent the assignment of + // the return value to a location + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slot. + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + + // Analyze return values. + CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze); + + // If this is the first return lowered for this function, add + // the regs to the liveout set for the function. + if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc()) + DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); + } + + SDValue Flag; + + // Copy the result values into the output registers. 
+ for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + Outs[i].Val, Flag); + + // guarantee that all emitted copies are + // stuck together, avoiding something bad + Flag = Chain.getValue(1); + } + + // Return on MBlaze is always a "rtsd R15, 8" + if (Flag.getNode()) + return DAG.getNode(MBlazeISD::Ret, dl, MVT::Other, + Chain, DAG.getRegister(MBlaze::R15, MVT::i32), Flag); + else // Return Void + return DAG.getNode(MBlazeISD::Ret, dl, MVT::Other, + Chain, DAG.getRegister(MBlaze::R15, MVT::i32)); +} + +//===----------------------------------------------------------------------===// +// MBlaze Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +MBlazeTargetLowering::ConstraintType MBlazeTargetLowering:: +getConstraintType(const std::string &Constraint) const +{ + // MBlaze specific constraints + // + // 'd' : An address register. Equivalent to r. + // 'y' : Equivalent to r; retained for + // backwards compatibility. + // 'f' : Floating Point registers. + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default : break; + case 'd': + case 'y': + case 'f': + return C_RegisterClass; + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// getRegClassForInlineAsmConstraint - Given a constraint letter (e.g. "r"), +/// return a list of registers that can be used to satisfy the constraint. +/// This should only be used for C_RegisterClass constraints. 
+std::pair<unsigned, const TargetRegisterClass*> MBlazeTargetLowering:: +getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + return std::make_pair(0U, MBlaze::CPURegsRegisterClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, MBlaze::FGR32RegisterClass); + } + } + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + +/// Given a register class constraint, like 'r', if this corresponds directly +/// to an LLVM register class, return a register of 0 and the register class +/// pointer. +std::vector<unsigned> MBlazeTargetLowering:: +getRegClassForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { + if (Constraint.size() != 1) + return std::vector<unsigned>(); + + switch (Constraint[0]) { + default : break; + case 'r': + // GCC MBlaze Constraint Letters + case 'd': + case 'y': + return make_vector<unsigned>( + MBlaze::R3, MBlaze::R4, MBlaze::R5, MBlaze::R6, + MBlaze::R7, MBlaze::R9, MBlaze::R10, MBlaze::R11, + MBlaze::R12, MBlaze::R19, MBlaze::R20, MBlaze::R21, + MBlaze::R22, MBlaze::R23, MBlaze::R24, MBlaze::R25, + MBlaze::R26, MBlaze::R27, MBlaze::R28, MBlaze::R29, + MBlaze::R30, MBlaze::R31, 0); + + case 'f': + return make_vector<unsigned>( + MBlaze::F3, MBlaze::F4, MBlaze::F5, MBlaze::F6, + MBlaze::F7, MBlaze::F9, MBlaze::F10, MBlaze::F11, + MBlaze::F12, MBlaze::F19, MBlaze::F20, MBlaze::F21, + MBlaze::F22, MBlaze::F23, MBlaze::F24, MBlaze::F25, + MBlaze::F26, MBlaze::F27, MBlaze::F28, MBlaze::F29, + MBlaze::F30, MBlaze::F31, 0); + } + return std::vector<unsigned>(); +} + +bool MBlazeTargetLowering:: +isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The MBlaze target isn't yet aware of offsets. 
+ return false; +} + +bool MBlazeTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + return VT != MVT::f32; +} diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h new file mode 100644 index 0000000..75d2552 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -0,0 +1,146 @@ +//===-- MBlazeISelLowering.h - MBlaze DAG Lowering Interface ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that MBlaze uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef MBlazeISELLOWERING_H +#define MBlazeISELLOWERING_H + +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include "MBlaze.h" +#include "MBlazeSubtarget.h" + +namespace llvm { + namespace MBlazeCC { + enum CC { + FIRST = 0, + EQ, + NE, + GT, + LT, + GE, + LE + }; + } + + namespace MBlazeISD { + enum NodeType { + // Start the numbering from where ISD NodeType finishes. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + // Jump and link (call) + JmpLink, + + // Handle gp_rel (small data/bss sections) relocation. + GPRel, + + // Select CC Pseudo Instruction + Select_CC, + + // Wrap up multiple types of instructions + Wrap, + + // Integer Compare + ICmp, + + // Return + Ret + }; + } + + //===--------------------------------------------------------------------===// + // TargetLowering Implementation + //===--------------------------------------------------------------------===// + + class MBlazeTargetLowering : public TargetLowering { + public: + + explicit MBlazeTargetLowering(MBlazeTargetMachine &TM); + + /// LowerOperation - Provide custom lowering hooks for some operations. 
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG); + + /// getTargetNodeName - This method returns the name of a target specific + // DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// getSetCCResultType - get the ISD::SETCC result ValueType + MVT::SimpleValueType getSetCCResultType(EVT VT) const; + + virtual unsigned getFunctionAlignment(const Function *F) const; + private: + // Subtarget Info + const MBlazeSubtarget *Subtarget; + + + // Lower Operand helpers + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + // Lower Operand specifics + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG); + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG); + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG); + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG); + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG); + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals); + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + DebugLoc dl, SelectionDAG &DAG); + + virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB, + DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const; + + // Inline asm support + ConstraintType getConstraintType(const std::string &Constraint) 
const; + + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + std::vector<unsigned> + getRegClassForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + + /// isFPImmLegal - Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + }; +} + +#endif // MBlazeISELLOWERING_H diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td new file mode 100644 index 0000000..a48a8c9 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrFPU.td @@ -0,0 +1,223 @@ +//===- MBlazeInstrFPU.td - MBlaze FPU Instruction defs ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze profiles and nodes +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze Operand, Complex Patterns and Transformations Definitions. 
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Memory Access Instructions +//===----------------------------------------------------------------------===// +class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> : + TA<op, 0x000, (outs FGR32:$dst), (ins memrr:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(set FGR32:$dst, (OpNode xaddr:$addr))], IILoad>; + +class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> : + TAI<op, (outs FGR32:$dst), (ins memri:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(set FGR32:$dst, (OpNode iaddr:$addr))], IILoad>; + +class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> : + TA<op, 0x000, (outs), (ins FGR32:$dst, memrr:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(OpNode FGR32:$dst, xaddr:$addr)], IIStore>; + +class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> : + TAI<op, (outs), (ins FGR32:$dst, memrr:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(OpNode FGR32:$dst, iaddr:$addr)], IIStore>; + +class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, + InstrItinClass itin> : + TA<op, flags, (outs FGR32:$dst), (ins FGR32:$b, FGR32:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set FGR32:$dst, (OpNode FGR32:$b, FGR32:$c))], itin>; + +class CmpFN<bits<6> op, bits<11> flags, string instr_asm, + InstrItinClass itin> : + TA<op, flags, (outs CPURegs:$dst), (ins FGR32:$b, FGR32:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [], itin>; + +class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, + InstrItinClass itin> : + TA<op, flags, (outs FGR32:$dst), (ins FGR32:$b, FGR32:$c), + !strconcat(instr_asm, " $dst, $c, $b"), + [(set FGR32:$dst, (OpNode FGR32:$b, FGR32:$c))], itin>; + +class ArithF2<bits<6> op, bits<11> flags, string instr_asm, + InstrItinClass itin> : + TF<op, flags, (outs FGR32:$dst), (ins FGR32:$b), + 
!strconcat(instr_asm, " $dst, $b"), + [], itin>; + +class ArithIF<bits<6> op, bits<11> flags, string instr_asm, + InstrItinClass itin> : + TF<op, flags, (outs FGR32:$dst), (ins CPURegs:$b), + !strconcat(instr_asm, " $dst, $b"), + [], itin>; + +class ArithFI<bits<6> op, bits<11> flags, string instr_asm, + InstrItinClass itin> : + TF<op, flags, (outs CPURegs:$dst), (ins FGR32:$b), + !strconcat(instr_asm, " $dst, $b"), + [], itin>; + +class LogicF<bits<6> op, string instr_asm> : + TAI<op, (outs FGR32:$dst), (ins FGR32:$b, FGR32:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [], + IIAlu>; + +class LogicFI<bits<6> op, string instr_asm> : + TAI<op, (outs FGR32:$dst), (ins FGR32:$b, fimm:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [], + IIAlu>; + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPU Arithmetic Instructions +//===----------------------------------------------------------------------===// +let Predicates=[HasFPU] in { + def FOR : LogicF<0x28, "or ">; + def FORI : LogicFI<0x28, "ori ">; + def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIAlu>; + def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIAlu>; + def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIAlu>; + def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIAlu>; + + def LWF : LoadFM<0x32, "lw ", load>; + def LWFI : LoadFMI<0x32, "lwi ", load>; + + def SWF : StoreFM<0x32, "sw ", store>; + def SWFI : StoreFMI<0x32, "swi ", store>; +} + +let Predicates=[HasFPU,HasSqrt] in { + def FLT : ArithIF<0x16, 0x280, "flt ", IIAlu>; + def FINT : ArithFI<0x16, 0x300, "fint ", IIAlu>; + def FSQRT : ArithF2<0x16, 0x300, "fsqrt ", IIAlu>; +} + +let isAsCheapAsAMove = 1 in { + def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIAlu>; + def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIAlu>; + def FCMP_EQ : 
CmpFN<0x16, 0x220, "fcmp.eq", IIAlu>; + def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIAlu>; + def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIAlu>; + def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIAlu>; + def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIAlu>; +} + + +let usesCustomInserter = 1 in { + def Select_FCC : MBlazePseudo<(outs FGR32:$dst), + (ins FGR32:$T, FGR32:$F, CPURegs:$CMP, i32imm:$CC), + "; SELECT_FCC PSEUDO!", + []>; +} + +// Floating point conversions +let Predicates=[HasFPU] in { + def : Pat<(sint_to_fp CPURegs:$V), (FLT CPURegs:$V)>; + def : Pat<(fp_to_sint FGR32:$V), (FINT FGR32:$V)>; + def : Pat<(fsqrt FGR32:$V), (FSQRT FGR32:$V)>; +} + +// SET_CC operations +let Predicates=[HasFPU] in { + def : Pat<(setcc FGR32:$L, FGR32:$R, SETEQ), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_EQ FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETNE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_EQ FGR32:$L, FGR32:$R), 1)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETOEQ), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_EQ FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETONE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (XOR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_EQ FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETONE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_EQ FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETGT), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_GT FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETLT), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_LT FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETGE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_GE FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETLE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_LE FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETOGT), + (Select_CC (ADDI R0, 1), (ADDI 
R0, 0), + (FCMP_GT FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETOLT), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_LT FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETOGE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_GE FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETOLE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_LE FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETUEQ), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_EQ FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETUNE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_NE FGR32:$L, FGR32:$R), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETUGT), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_GT FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETULT), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_LT FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETUGE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_GE FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETULE), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (OR (FCMP_UN FGR32:$L, FGR32:$R), + (FCMP_LE FGR32:$L, FGR32:$R)), 2)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETO), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_UN FGR32:$L, FGR32:$R), 1)>; + def : Pat<(setcc FGR32:$L, FGR32:$R, SETUO), + (Select_CC (ADDI R0, 1), (ADDI R0, 0), + (FCMP_UN FGR32:$L, FGR32:$R), 2)>; +} + +// SELECT operations +def : Pat<(select CPURegs:$C, FGR32:$T, FGR32:$F), + (Select_FCC FGR32:$T, FGR32:$F, CPURegs:$C, 2)>; + +//===----------------------------------------------------------------------===// +// Patterns for Floating Point Instructions +//===----------------------------------------------------------------------===// +def : Pat<(f32 fpimm:$imm), (FORI F0, 
fpimm:$imm)>; diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td new file mode 100644 index 0000000..b59999e --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrFSL.td @@ -0,0 +1,153 @@ +//===- MBlazeInstrFSL.td - MBlaze FSL Instruction defs ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FSL Instruction Formats +//===----------------------------------------------------------------------===// +class FSLGetD<bits<6> op, bits<11> flags, string instr_asm, Intrinsic OpNode> : + TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$b), + !strconcat(instr_asm, " $dst, $b"), + [(set CPURegs:$dst, (OpNode CPURegs:$b))], IIAlu>; + +class FSLGet<bits<6> op, string instr_asm, Intrinsic OpNode> : + TAI<op, (outs CPURegs:$dst), (ins fslimm:$b), + !strconcat(instr_asm, " $dst, $b"), + [(set CPURegs:$dst, (OpNode immZExt4:$b))], IIAlu>; + +class FSLPutD<bits<6> op, bits<11> flags, string instr_asm, Intrinsic OpNode> : + TA<op, flags, (outs), (ins CPURegs:$v, CPURegs:$b), + !strconcat(instr_asm, " $v, $b"), + [(OpNode CPURegs:$v, CPURegs:$b)], IIAlu>; + +class FSLPut<bits<6> op, string instr_asm, Intrinsic OpNode> : + TAI<op, (outs), (ins CPURegs:$v, fslimm:$b), + !strconcat(instr_asm, " $v, $b"), + [(OpNode CPURegs:$v, immZExt4:$b)], IIAlu>; + +class FSLPutTD<bits<6> op, bits<11> flags, string instr_asm, Intrinsic OpNode> : + TA<op, flags, (outs), (ins CPURegs:$b), + !strconcat(instr_asm, " $b"), + [(OpNode CPURegs:$b)], IIAlu>; + +class FSLPutT<bits<6> op, string instr_asm, Intrinsic OpNode> : + TAI<op, (outs), (ins fslimm:$b), + !strconcat(instr_asm, " $b"), + [(OpNode immZExt4:$b)], IIAlu>; + 
+//===----------------------------------------------------------------------===// +// FSL Get Instructions +//===----------------------------------------------------------------------===// +def GET : FSLGet<0x1B, "get ", int_mblaze_fsl_get>; +def AGET : FSLGet<0x1B, "aget ", int_mblaze_fsl_aget>; +def CGET : FSLGet<0x1B, "cget ", int_mblaze_fsl_cget>; +def CAGET : FSLGet<0x1B, "caget ", int_mblaze_fsl_caget>; +def EGET : FSLGet<0x1B, "eget ", int_mblaze_fsl_eget>; +def EAGET : FSLGet<0x1B, "eaget ", int_mblaze_fsl_eaget>; +def ECGET : FSLGet<0x1B, "ecget ", int_mblaze_fsl_ecget>; +def ECAGET : FSLGet<0x1B, "ecaget ", int_mblaze_fsl_ecaget>; +def NGET : FSLGet<0x1B, "nget ", int_mblaze_fsl_nget>; +def NAGET : FSLGet<0x1B, "naget ", int_mblaze_fsl_naget>; +def NCGET : FSLGet<0x1B, "ncget ", int_mblaze_fsl_ncget>; +def NCAGET : FSLGet<0x1B, "ncaget ", int_mblaze_fsl_ncaget>; +def NEGET : FSLGet<0x1B, "neget ", int_mblaze_fsl_neget>; +def NEAGET : FSLGet<0x1B, "neaget ", int_mblaze_fsl_neaget>; +def NECGET : FSLGet<0x1B, "necget ", int_mblaze_fsl_necget>; +def NECAGET : FSLGet<0x1B, "necaget ", int_mblaze_fsl_necaget>; +def TGET : FSLGet<0x1B, "tget ", int_mblaze_fsl_tget>; +def TAGET : FSLGet<0x1B, "taget ", int_mblaze_fsl_taget>; +def TCGET : FSLGet<0x1B, "tcget ", int_mblaze_fsl_tcget>; +def TCAGET : FSLGet<0x1B, "tcaget ", int_mblaze_fsl_tcaget>; +def TEGET : FSLGet<0x1B, "teget ", int_mblaze_fsl_teget>; +def TEAGET : FSLGet<0x1B, "teaget ", int_mblaze_fsl_teaget>; +def TECGET : FSLGet<0x1B, "tecget ", int_mblaze_fsl_tecget>; +def TECAGET : FSLGet<0x1B, "tecaget ", int_mblaze_fsl_tecaget>; +def TNGET : FSLGet<0x1B, "tnget ", int_mblaze_fsl_tnget>; +def TNAGET : FSLGet<0x1B, "tnaget ", int_mblaze_fsl_tnaget>; +def TNCGET : FSLGet<0x1B, "tncget ", int_mblaze_fsl_tncget>; +def TNCAGET : FSLGet<0x1B, "tncaget ", int_mblaze_fsl_tncaget>; +def TNEGET : FSLGet<0x1B, "tneget ", int_mblaze_fsl_tneget>; +def TNEAGET : FSLGet<0x1B, "tneaget ", int_mblaze_fsl_tneaget>; +def 
TNECGET : FSLGet<0x1B, "tnecget ", int_mblaze_fsl_tnecget>; +def TNECAGET : FSLGet<0x1B, "tnecaget ", int_mblaze_fsl_tnecaget>; + +//===----------------------------------------------------------------------===// +// FSL Dynamic Get Instructions +//===----------------------------------------------------------------------===// +def GETD : FSLGetD<0x1B, 0x00, "getd ", int_mblaze_fsl_get>; +def AGETD : FSLGetD<0x1B, 0x00, "agetd ", int_mblaze_fsl_aget>; +def CGETD : FSLGetD<0x1B, 0x00, "cgetd ", int_mblaze_fsl_cget>; +def CAGETD : FSLGetD<0x1B, 0x00, "cagetd ", int_mblaze_fsl_caget>; +def EGETD : FSLGetD<0x1B, 0x00, "egetd ", int_mblaze_fsl_eget>; +def EAGETD : FSLGetD<0x1B, 0x00, "eagetd ", int_mblaze_fsl_eaget>; +def ECGETD : FSLGetD<0x1B, 0x00, "ecgetd ", int_mblaze_fsl_ecget>; +def ECAGETD : FSLGetD<0x1B, 0x00, "ecagetd ", int_mblaze_fsl_ecaget>; +def NGETD : FSLGetD<0x1B, 0x00, "ngetd ", int_mblaze_fsl_nget>; +def NAGETD : FSLGetD<0x1B, 0x00, "nagetd ", int_mblaze_fsl_naget>; +def NCGETD : FSLGetD<0x1B, 0x00, "ncgetd ", int_mblaze_fsl_ncget>; +def NCAGETD : FSLGetD<0x1B, 0x00, "ncagetd ", int_mblaze_fsl_ncaget>; +def NEGETD : FSLGetD<0x1B, 0x00, "negetd ", int_mblaze_fsl_neget>; +def NEAGETD : FSLGetD<0x1B, 0x00, "neagetd ", int_mblaze_fsl_neaget>; +def NECGETD : FSLGetD<0x1B, 0x00, "necgetd ", int_mblaze_fsl_necget>; +def NECAGETD : FSLGetD<0x1B, 0x00, "necagetd ", int_mblaze_fsl_necaget>; +def TGETD : FSLGetD<0x1B, 0x00, "tgetd ", int_mblaze_fsl_tget>; +def TAGETD : FSLGetD<0x1B, 0x00, "tagetd ", int_mblaze_fsl_taget>; +def TCGETD : FSLGetD<0x1B, 0x00, "tcgetd ", int_mblaze_fsl_tcget>; +def TCAGETD : FSLGetD<0x1B, 0x00, "tcagetd ", int_mblaze_fsl_tcaget>; +def TEGETD : FSLGetD<0x1B, 0x00, "tegetd ", int_mblaze_fsl_teget>; +def TEAGETD : FSLGetD<0x1B, 0x00, "teagetd ", int_mblaze_fsl_teaget>; +def TECGETD : FSLGetD<0x1B, 0x00, "tecgetd ", int_mblaze_fsl_tecget>; +def TECAGETD : FSLGetD<0x1B, 0x00, "tecagetd ", int_mblaze_fsl_tecaget>; +def TNGETD : FSLGetD<0x1B, 
0x00, "tngetd ", int_mblaze_fsl_tnget>; +def TNAGETD : FSLGetD<0x1B, 0x00, "tnagetd ", int_mblaze_fsl_tnaget>; +def TNCGETD : FSLGetD<0x1B, 0x00, "tncgetd ", int_mblaze_fsl_tncget>; +def TNCAGETD : FSLGetD<0x1B, 0x00, "tncagetd ", int_mblaze_fsl_tncaget>; +def TNEGETD : FSLGetD<0x1B, 0x00, "tnegetd ", int_mblaze_fsl_tneget>; +def TNEAGETD : FSLGetD<0x1B, 0x00, "tneagetd ", int_mblaze_fsl_tneaget>; +def TNECGETD : FSLGetD<0x1B, 0x00, "tnecgetd ", int_mblaze_fsl_tnecget>; +def TNECAGETD : FSLGetD<0x1B, 0x00, "tnecagetd", int_mblaze_fsl_tnecaget>; + +//===----------------------------------------------------------------------===// +// FSL Put Instructions +//===----------------------------------------------------------------------===// +def PUT : FSLPut<0x1B, "put ", int_mblaze_fsl_put>; +def APUT : FSLPut<0x1B, "aput ", int_mblaze_fsl_aput>; +def CPUT : FSLPut<0x1B, "cput ", int_mblaze_fsl_cput>; +def CAPUT : FSLPut<0x1B, "caput ", int_mblaze_fsl_caput>; +def NPUT : FSLPut<0x1B, "nput ", int_mblaze_fsl_nput>; +def NAPUT : FSLPut<0x1B, "naput ", int_mblaze_fsl_naput>; +def NCPUT : FSLPut<0x1B, "ncput ", int_mblaze_fsl_ncput>; +def NCAPUT : FSLPut<0x1B, "ncaput ", int_mblaze_fsl_ncaput>; +def TPUT : FSLPutT<0x1B, "tput ", int_mblaze_fsl_tput>; +def TAPUT : FSLPutT<0x1B, "taput ", int_mblaze_fsl_taput>; +def TCPUT : FSLPutT<0x1B, "tcput ", int_mblaze_fsl_tcput>; +def TCAPUT : FSLPutT<0x1B, "tcaput ", int_mblaze_fsl_tcaput>; +def TNPUT : FSLPutT<0x1B, "tnput ", int_mblaze_fsl_tnput>; +def TNAPUT : FSLPutT<0x1B, "tnaput ", int_mblaze_fsl_tnaput>; +def TNCPUT : FSLPutT<0x1B, "tncput ", int_mblaze_fsl_tncput>; +def TNCAPUT : FSLPutT<0x1B, "tncaput ", int_mblaze_fsl_tncaput>; + +//===----------------------------------------------------------------------===// +// FSL Dynamic Put Instructions +//===----------------------------------------------------------------------===// +def PUTD : FSLPutD<0x1B, 0x00, "putd ", int_mblaze_fsl_put>; +def APUTD : FSLPutD<0x1B, 0x00, "aputd ", 
int_mblaze_fsl_aput>; +def CPUTD : FSLPutD<0x1B, 0x00, "cputd ", int_mblaze_fsl_cput>; +def CAPUTD : FSLPutD<0x1B, 0x00, "caputd ", int_mblaze_fsl_caput>; +def NPUTD : FSLPutD<0x1B, 0x00, "nputd ", int_mblaze_fsl_nput>; +def NAPUTD : FSLPutD<0x1B, 0x00, "naputd ", int_mblaze_fsl_naput>; +def NCPUTD : FSLPutD<0x1B, 0x00, "ncputd ", int_mblaze_fsl_ncput>; +def NCAPUTD : FSLPutD<0x1B, 0x00, "ncaputd ", int_mblaze_fsl_ncaput>; +def TPUTD : FSLPutTD<0x1B, 0x00, "tputd ", int_mblaze_fsl_tput>; +def TAPUTD : FSLPutTD<0x1B, 0x00, "taputd ", int_mblaze_fsl_taput>; +def TCPUTD : FSLPutTD<0x1B, 0x00, "tcputd ", int_mblaze_fsl_tcput>; +def TCAPUTD : FSLPutTD<0x1B, 0x00, "tcaputd ", int_mblaze_fsl_tcaput>; +def TNPUTD : FSLPutTD<0x1B, 0x00, "tnputd ", int_mblaze_fsl_tnput>; +def TNAPUTD : FSLPutTD<0x1B, 0x00, "tnaputd ", int_mblaze_fsl_tnaput>; +def TNCPUTD : FSLPutTD<0x1B, 0x00, "tncputd ", int_mblaze_fsl_tncput>; +def TNCAPUTD : FSLPutTD<0x1B, 0x00, "tncaputd ", int_mblaze_fsl_tncaput>; diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td new file mode 100644 index 0000000..7d65543 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -0,0 +1,246 @@ +//===- MBlazeInstrFormats.td - MB Instruction defs --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe MBlaze instructions format +// +// CPU INSTRUCTION FORMATS +// +// opcode - operation code. +// rd - dst reg. +// ra - first src. reg. +// rb - second src. reg. +// imm16 - 16-bit immediate value. 
+// +//===----------------------------------------------------------------------===// + +// Generic MBlaze Format +class MBlazeInst<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin> : Instruction +{ + field bits<32> Inst; + + let Namespace = "MBlaze"; + + bits<6> opcode; + + // Top 6 bits are the 'opcode' field + let Inst{0-5} = opcode; + + dag OutOperandList = outs; + dag InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + let Itinerary = itin; +} + +//===----------------------------------------------------------------------===// +// Pseudo instruction class +//===----------------------------------------------------------------------===// +class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: + MBlazeInst<outs, ins, asmstr, pattern, IIPseudo>; + +//===----------------------------------------------------------------------===// +// Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|> +//===----------------------------------------------------------------------===// + +class TA<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> rd; + bits<5> ra; + bits<5> rb; + + let opcode = op; + + let Inst{6-10} = rd; + let Inst{11-15} = ra; + let Inst{16-20} = rb; + let Inst{21-31} = flags; +} + +class TAI<bits<6> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> rd; + bits<5> ra; + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = rd; + let Inst{11-15} = ra; + let Inst{16-31} = imm16; +} + +class TIMM<bits<6> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> ra; + bits<16> imm16; + + let opcode = op; + + let Inst{6-15} = 0; + let Inst{16-31} = imm16; +} + +class TADDR<bits<6> op, dag outs, dag ins, 
string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<26> addr; + + let opcode = op; + + let Inst{6-31} = addr; +} + +//===----------------------------------------------------------------------===// +// Type B instruction class in MBlaze : <|opcode|rd|ra|immediate|> +//===----------------------------------------------------------------------===// + +class TB<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> rd; + bits<5> ra; + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = rd; + let Inst{11-15} = ra; + let Inst{16-31} = imm16; +} + +//===----------------------------------------------------------------------===// +// Float instruction class in MBlaze : <|opcode|rd|ra|flags|> +//===----------------------------------------------------------------------===// + +class TF<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> rd; + bits<5> ra; + + let opcode = op; + + let Inst{6-10} = rd; + let Inst{11-15} = ra; + let Inst{16-20} = 0; + let Inst{21-31} = flags; +} + +//===----------------------------------------------------------------------===// +// Branch instruction class in MBlaze : <|opcode|rd|br|ra|flags|> +//===----------------------------------------------------------------------===// + +class TBR<bits<6> op, bits<5> br, bits<11> flags, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> ra; + + let opcode = op; + + let Inst{6-10} = 0; + let Inst{11-15} = br; + let Inst{16-20} = ra; + let Inst{21-31} = flags; +} + +class TBRC<bits<6> op, bits<5> br, bits<11> flags, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> 
ra; + bits<5> rb; + + let opcode = op; + + let Inst{6-10} = br; + let Inst{11-15} = ra; + let Inst{16-20} = rb; + let Inst{21-31} = flags; +} + +class TBRL<bits<6> op, bits<5> br, bits<11> flags, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> ra; + + let opcode = op; + + let Inst{6-10} = 0xF; + let Inst{11-15} = br; + let Inst{16-20} = ra; + let Inst{21-31} = flags; +} + +class TBRI<bits<6> op, bits<5> br, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = 0; + let Inst{11-15} = br; + let Inst{16-31} = imm16; +} + +class TBRLI<bits<6> op, bits<5> br, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = 0xF; + let Inst{11-15} = br; + let Inst{16-31} = imm16; +} + +class TBRCI<bits<6> op, bits<5> br, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> ra; + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = br; + let Inst{11-15} = ra; + let Inst{16-31} = imm16; +} + +class TRET<bits<6> op, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin> : + MBlazeInst<outs, ins, asmstr, pattern, itin> +{ + bits<5> ra; + bits<16> imm16; + + let opcode = op; + + let Inst{6-10} = 0x10; + let Inst{11-15} = ra; + let Inst{16-31} = imm16; +} diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp new file mode 100644 index 0000000..a7e8eb7 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -0,0 +1,222 @@ +//===- MBlazeInstrInfo.cpp - MBlaze Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "MBlazeInstrInfo.h" +#include "MBlazeTargetMachine.h" +#include "MBlazeMachineFunction.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "MBlazeGenInstrInfo.inc" + +using namespace llvm; + +MBlazeInstrInfo::MBlazeInstrInfo(MBlazeTargetMachine &tm) + : TargetInstrInfoImpl(MBlazeInsts, array_lengthof(MBlazeInsts)), + TM(tm), RI(*TM.getSubtargetImpl(), *this) {} + +static bool isZeroImm(const MachineOperand &op) { + return op.isImm() && op.getImm() == 0; +} + +/// Return true if the instruction is a register to register move and +/// leave the source and dest operands in the passed parameters. +bool MBlazeInstrInfo:: +isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, + unsigned &SrcSubIdx, unsigned &DstSubIdx) const { + SrcSubIdx = DstSubIdx = 0; // No sub-registers. 
+ + // add $dst, $src, $zero || addu $dst, $zero, $src + // or $dst, $src, $zero || or $dst, $zero, $src + if ((MI.getOpcode() == MBlaze::ADD) || (MI.getOpcode() == MBlaze::OR)) { + if (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == MBlaze::R0) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(2).getReg(); + return true; + } else if (MI.getOperand(2).isReg() && + MI.getOperand(2).getReg() == MBlaze::R0) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } + } + + // addi $dst, $src, 0 + // ori $dst, $src, 0 + if ((MI.getOpcode() == MBlaze::ADDI) || (MI.getOpcode() == MBlaze::ORI)) { + if ((MI.getOperand(1).isReg()) && (isZeroImm(MI.getOperand(2)))) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } + } + + return false; +} + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. +unsigned MBlazeInstrInfo:: +isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { + if (MI->getOpcode() == MBlaze::LWI) { + if ((MI->getOperand(2).isFI()) && // is a stack slot + (MI->getOperand(1).isImm()) && // the imm is zero + (isZeroImm(MI->getOperand(1)))) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + } + + return 0; +} + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. 
+unsigned MBlazeInstrInfo:: +isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { + if (MI->getOpcode() == MBlaze::SWI) { + if ((MI->getOperand(2).isFI()) && // is a stack slot + (MI->getOperand(1).isImm()) && // the imm is zero + (isZeroImm(MI->getOperand(1)))) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + } + return 0; +} + +/// insertNoop - If data hazard condition is found insert the target nop +/// instruction. +void MBlazeInstrInfo:: +insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { + DebugLoc DL = DebugLoc::getUnknownLoc(); + if (MI != MBB.end()) DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, get(MBlaze::NOP)); +} + +bool MBlazeInstrInfo:: +copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC) const { + DebugLoc dl = DebugLoc::getUnknownLoc(); + llvm::BuildMI(MBB, I, dl, get(MBlaze::ADD), DestReg) + .addReg(SrcReg).addReg(MBlaze::R0); + return true; +} + +void MBlazeInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC) const { + DebugLoc dl = DebugLoc::getUnknownLoc(); + BuildMI(MBB, I, dl, get(MBlaze::SWI)).addReg(SrcReg,getKillRegState(isKill)) + .addImm(0).addFrameIndex(FI); +} + +void MBlazeInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC) const { + DebugLoc dl = DebugLoc::getUnknownLoc(); + BuildMI(MBB, I, dl, get(MBlaze::LWI), DestReg) + .addImm(0).addFrameIndex(FI); +} + +MachineInstr *MBlazeInstrInfo:: +foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + const SmallVectorImpl<unsigned> &Ops, int FI) const { + if (Ops.size() != 1) return NULL; + + MachineInstr *NewMI = NULL; + + switch (MI->getOpcode()) { + case MBlaze::OR: + case 
MBlaze::ADD: + if ((MI->getOperand(0).isReg()) && + (MI->getOperand(2).isReg()) && + (MI->getOperand(2).getReg() == MBlaze::R0) && + (MI->getOperand(1).isReg())) { + if (Ops[0] == 0) { // COPY -> STORE + unsigned SrcReg = MI->getOperand(1).getReg(); + bool isKill = MI->getOperand(1).isKill(); + bool isUndef = MI->getOperand(1).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(MBlaze::SW)) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0).addFrameIndex(FI); + } else { // COPY -> LOAD + unsigned DstReg = MI->getOperand(0).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isUndef = MI->getOperand(0).isUndef(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(MBlaze::LW)) + .addReg(DstReg, RegState::Define | getDeadRegState(isDead) | + getUndefRegState(isUndef)) + .addImm(0).addFrameIndex(FI); + } + } + break; + } + + return NewMI; +} + +//===----------------------------------------------------------------------===// +// Branch Analysis +//===----------------------------------------------------------------------===// +unsigned MBlazeInstrInfo:: +InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond) const { + DebugLoc dl = DebugLoc::getUnknownLoc(); + + // Can only insert uncond branches so far. + assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!"); + BuildMI(&MBB, dl, get(MBlaze::BRI)).addMBB(TBB); + return 1; +} + +/// getGlobalBaseReg - Return a virtual register initialized with the +/// the global base register value. Output instructions required to +/// initialize the register in the function entry block, if necessary. 
+/// +unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { + MBlazeFunctionInfo *MBlazeFI = MF->getInfo<MBlazeFunctionInfo>(); + unsigned GlobalBaseReg = MBlazeFI->getGlobalBaseReg(); + if (GlobalBaseReg != 0) + return GlobalBaseReg; + + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = MF->front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + + GlobalBaseReg = RegInfo.createVirtualRegister(MBlaze::CPURegsRegisterClass); + bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, MBlaze::R20, + MBlaze::CPURegsRegisterClass, + MBlaze::CPURegsRegisterClass); + assert(Ok && "Couldn't assign to global base register!"); + Ok = Ok; // Silence warning when assertions are turned off. + RegInfo.addLiveIn(MBlaze::R20); + + MBlazeFI->setGlobalBaseReg(GlobalBaseReg); + return GlobalBaseReg; +} diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h new file mode 100644 index 0000000..4f79f1c --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -0,0 +1,242 @@ +//===- MBlazeInstrInfo.h - MBlaze Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of the TargetInstrInfo class. 
//
//===----------------------------------------------------------------------===//

#ifndef MBLAZEINSTRUCTIONINFO_H
#define MBLAZEINSTRUCTIONINFO_H

#include "MBlaze.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "MBlazeRegisterInfo.h"

namespace llvm {

namespace MBlaze {

  // MBlaze Branch Codes
  enum FPBranchCode {
    BRANCH_F,
    BRANCH_T,
    BRANCH_FL,
    BRANCH_TL,
    BRANCH_INVALID
  };

  // MBlaze Condition Codes
  enum CondCode {
    // To be used with float branch True
    FCOND_F,
    FCOND_UN,
    FCOND_EQ,
    FCOND_UEQ,
    FCOND_OLT,
    FCOND_ULT,
    FCOND_OLE,
    FCOND_ULE,
    FCOND_SF,
    FCOND_NGLE,
    FCOND_SEQ,
    FCOND_NGL,
    FCOND_LT,
    FCOND_NGE,
    FCOND_LE,
    FCOND_NGT,

    // To be used with float branch False
    // These conditions have the same mnemonic as the
    // above ones, but are used with a branch False;
    FCOND_T,
    FCOND_OR,
    FCOND_NEQ,
    FCOND_OGL,
    FCOND_UGE,
    FCOND_OGE,
    FCOND_UGT,
    FCOND_OGT,
    FCOND_ST,
    FCOND_GLE,
    FCOND_SNE,
    FCOND_GL,
    FCOND_NLT,
    FCOND_GE,
    FCOND_NLE,
    FCOND_GT,

    // Only integer conditions
    COND_E,
    COND_GZ,
    COND_GEZ,
    COND_LZ,
    COND_LEZ,
    COND_NE,
    COND_INVALID
  };

  // Turn condition code into conditional branch opcode.
  unsigned GetCondBranchFromCond(CondCode CC);

  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
  /// e.g. turning COND_E to COND_NE.
  CondCode GetOppositeBranchCondition(MBlaze::CondCode CC);

  /// MBlazeFCCToString - Map each FP condition code to its string, pairing
  /// each "branch true" code with its "branch false" counterpart.
  // NOTE(review): FCOND_NGE/FCOND_GE map to "ge" and FCOND_LE/FCOND_NLE to
  // "nle", breaking the true-code-names-the-string pattern every other pair
  // follows ("nge"/"le" would be expected) — confirm against the intended
  // FP branch mnemonics before relying on these strings.
  inline static const char *MBlazeFCCToString(MBlaze::CondCode CC)
  {
    switch (CC) {
    default: llvm_unreachable("Unknown condition code");
    case FCOND_F:
    case FCOND_T:    return "f";
    case FCOND_UN:
    case FCOND_OR:   return "un";
    case FCOND_EQ:
    case FCOND_NEQ:  return "eq";
    case FCOND_UEQ:
    case FCOND_OGL:  return "ueq";
    case FCOND_OLT:
    case FCOND_UGE:  return "olt";
    case FCOND_ULT:
    case FCOND_OGE:  return "ult";
    case FCOND_OLE:
    case FCOND_UGT:  return "ole";
    case FCOND_ULE:
    case FCOND_OGT:  return "ule";
    case FCOND_SF:
    case FCOND_ST:   return "sf";
    case FCOND_NGLE:
    case FCOND_GLE:  return "ngle";
    case FCOND_SEQ:
    case FCOND_SNE:  return "seq";
    case FCOND_NGL:
    case FCOND_GL:   return "ngl";
    case FCOND_LT:
    case FCOND_NLT:  return "lt";
    case FCOND_NGE:
    case FCOND_GE:   return "ge";
    case FCOND_LE:
    case FCOND_NLE:  return "nle";
    case FCOND_NGT:
    case FCOND_GT:   return "gt";
    }
  }
}

/// MBlazeII - This namespace holds all of the target specific flags that
/// instruction info tracks.
///
namespace MBlazeII {
  /// Target Operand Flag enum.
  enum TOF {
    //===------------------------------------------------------------------===//
    // MBlaze Specific MachineOperand flags.
    MO_NO_FLAG,

    /// MO_GOT - Represents the offset into the global offset table at which
    /// the address the relocation entry symbol resides during execution.
    MO_GOT,

    /// MO_GOT_CALL - Represents the offset into the global offset table at
    /// which the address of a call site relocation entry symbol resides
    /// during execution. This is different from the above since this flag
    /// can only be present in call instructions.
    MO_GOT_CALL,

    /// MO_GPREL - Represents the offset from the current gp value to be used
    /// for the relocatable object file being produced.
    MO_GPREL,

    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
    /// address.
    MO_ABS_HILO

  };
}

/// MBlazeInstrInfo - MBlaze implementation of TargetInstrInfo; owns the
/// target's register info so clients can reach it through getRegisterInfo().
class MBlazeInstrInfo : public TargetInstrInfoImpl {
  MBlazeTargetMachine &TM;
  const MBlazeRegisterInfo RI;
public:
  explicit MBlazeInstrInfo(MBlazeTargetMachine &TM);

  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
  /// such, whenever a client has an instance of instruction info, it should
  /// always be able to get register info as well (through this method).
  ///
  virtual const MBlazeRegisterInfo &getRegisterInfo() const { return RI; }

  /// Return true if the instruction is a register to register move and return
  /// the source and dest operands and their sub-register indices by reference.
  virtual bool isMoveInstr(const MachineInstr &MI,
                           unsigned &SrcReg, unsigned &DstReg,
                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;

  /// isLoadFromStackSlot - If the specified machine instruction is a direct
  /// load from a stack slot, return the virtual or physical register number of
  /// the destination along with the FrameIndex of the loaded stack slot. If
  /// not, return 0. This predicate must return 0 if the instruction has
  /// any side effects other than loading from the stack slot.
  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
                                       int &FrameIndex) const;

  /// isStoreToStackSlot - If the specified machine instruction is a direct
  /// store to a stack slot, return the virtual or physical register number of
  /// the source reg along with the FrameIndex of the loaded stack slot. If
  /// not, return 0. This predicate must return 0 if the instruction has
  /// any side effects other than storing to the stack slot.
  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
                                      int &FrameIndex) const;

  /// Branch Analysis
  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                                const SmallVectorImpl<MachineOperand> &Cond) const;
  virtual bool copyRegToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I,
                            unsigned DestReg, unsigned SrcReg,
                            const TargetRegisterClass *DestRC,
                            const TargetRegisterClass *SrcRC) const;
  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   unsigned SrcReg, bool isKill, int FrameIndex,
                                   const TargetRegisterClass *RC) const;

  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    unsigned DestReg, int FrameIndex,
                                    const TargetRegisterClass *RC) const;

  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                              const SmallVectorImpl<unsigned> &Ops,
                                              int FrameIndex) const;

  // Folding against another load instruction is not supported; return 0
  // so the target-independent code keeps both instructions.
  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                              const SmallVectorImpl<unsigned> &Ops,
                                              MachineInstr* LoadMI) const {
    return 0;
  }

  /// Insert nop instruction when hazard condition is found
  virtual void insertNoop(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MI) const;

  /// getGlobalBaseReg - Return a virtual register initialized with the
  /// global base register value. Output instructions required to
  /// initialize the register in the function entry block, if necessary.
  ///
  unsigned getGlobalBaseReg(MachineFunction *MF) const;
};

}

#endif
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td new file mode 100644 index 0000000..3c406dd --- /dev/null +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -0,0 +1,672 @@
//===- MBlazeInstrInfo.td - MBlaze Instruction defs -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Instruction format superclass
//===----------------------------------------------------------------------===//
include "MBlazeInstrFormats.td"

//===----------------------------------------------------------------------===//
// MBlaze profiles and nodes
//===----------------------------------------------------------------------===//
def SDT_MBlazeRet     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_MBlazeJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;

// Call
def MBlazeJmpLink : SDNode<"MBlazeISD::JmpLink",SDT_MBlazeJmpLink,
                           [SDNPHasChain,SDNPOptInFlag,SDNPOutFlag]>;

// Return
def MBlazeRet : SDNode<"MBlazeISD::Ret", SDT_MBlazeRet,
                       [SDNPHasChain, SDNPOptInFlag]>;

// Hi and Lo nodes are used to handle global addresses. Used on
// MBlazeISelLowering to lower stuff like GlobalAddress, ExternalSymbol
// static model.
def MBWrapper   : SDNode<"MBlazeISD::Wrap", SDTIntUnaryOp>;
def MBlazeGPRel : SDNode<"MBlazeISD::GPRel", SDTIntUnaryOp>;

def SDT_MBCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
def SDT_MBCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MBCallSeqStart,
                           [SDNPHasChain, SDNPOutFlag]>;
def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd,
                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;

def SDTMBlazeSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>]>;

//===----------------------------------------------------------------------===//
// MBlaze Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
def HasPipe3     : Predicate<"Subtarget.hasPipe3()">;
def HasBarrel    : Predicate<"Subtarget.hasBarrel()">;
def NoBarrel     : Predicate<"!Subtarget.hasBarrel()">;
def HasDiv       : Predicate<"Subtarget.hasDiv()">;
def HasMul       : Predicate<"Subtarget.hasMul()">;
def HasFSL       : Predicate<"Subtarget.hasFSL()">;
def HasEFSL      : Predicate<"Subtarget.hasEFSL()">;
def HasMSRSet    : Predicate<"Subtarget.hasMSRSet()">;
def HasException : Predicate<"Subtarget.hasException()">;
def HasPatCmp    : Predicate<"Subtarget.hasPatCmp()">;
def HasFPU       : Predicate<"Subtarget.hasFPU()">;
def HasESR       : Predicate<"Subtarget.hasESR()">;
def HasPVR       : Predicate<"Subtarget.hasPVR()">;
def HasMul64     : Predicate<"Subtarget.hasMul64()">;
def HasSqrt      : Predicate<"Subtarget.hasSqrt()">;
def HasMMU       : Predicate<"Subtarget.hasMMU()">;

//===----------------------------------------------------------------------===//
// MBlaze Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//

// Instruction operand types
def brtarget   : Operand<OtherVT>;
def calltarget : Operand<i32>;
def simm16     : Operand<i32>;
def uimm5      : Operand<i32>;
def fimm       : Operand<f32>;

// Unsigned Operand
def uimm16 : Operand<i32> {
  let PrintMethod = "printUnsignedImm";
}

// FSL Operand
def fslimm : Operand<i32> {
  let PrintMethod = "printFSLImm";
}

// Address operand
def memri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops simm16, CPURegs);
}

def memrr : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops CPURegs, CPURegs);
}

// Transformation Function - get the lower 16 bits.
def LO16 : SDNodeXForm<imm, [{
  return getI32Imm((unsigned)N->getZExtValue() & 0xFFFF);
}]>;

// Transformation Function - get the higher 16 bits.
def HI16 : SDNodeXForm<imm, [{
  return getI32Imm((unsigned)N->getZExtValue() >> 16);
}]>;

// Node immediate fits as 16-bit sign extended on target immediate.
// e.g. addi, andi
// FIX: this predicate previously tested (ZExtValue >> 16) == 0 — the very
// same zero-extension check as immZExt16 — so every negative immediate
// (e.g. -1 for "addi rd, ra, -1" or the (ADDI R0, imm) materialization
// patterns) was wrongly rejected.  Test the signed 16-bit range instead.
def immSExt16  : PatLeaf<(imm), [{
  return (N->getSExtValue() >= -32768) && (N->getSExtValue() <= 32767);
}]>;

// Node immediate fits as 16-bit zero extended on target immediate.
// The LO16 param means that only the lower 16 bits of the node
// immediate are caught.
// e.g. addiu, sltiu
def immZExt16  : PatLeaf<(imm), [{
  return (N->getZExtValue() >> 16) == 0;
}], LO16>;

// FSL immediate field must fit in 4 bits.
def immZExt4 : PatLeaf<(imm), [{
  return N->getZExtValue() == ((N->getZExtValue()) & 0xf) ;
}]>;

// shamt field must fit in 5 bits.
def immZExt5 : PatLeaf<(imm), [{
  return N->getZExtValue() == ((N->getZExtValue()) & 0x1f) ;
}]>;

// MBlaze Address Mode! SDNode frameindex could possibly be a match
// since load and store instructions from stack used it.
def iaddr : ComplexPattern<i32, 2, "SelectAddrRegImm", [frameindex], []>;
def xaddr : ComplexPattern<i32, 2, "SelectAddrRegReg", [], []>;

//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//

// As stack alignment is always done with addi, we need a 16-bit immediate
let Defs = [R1], Uses = [R1] in {
def ADJCALLSTACKDOWN : MBlazePseudo<(outs), (ins simm16:$amt),
                                    "${:comment} ADJCALLSTACKDOWN $amt",
                                    [(callseq_start timm:$amt)]>;
def ADJCALLSTACKUP   : MBlazePseudo<(outs),
                                    (ins uimm16:$amt1, simm16:$amt2),
                                    "${:comment} ADJCALLSTACKUP $amt1",
                                    [(callseq_end timm:$amt1, timm:$amt2)]>;
}

// Some assembly macros need to avoid pseudoinstructions and assembler
// automatic reordering; we should reorder ourselves.
def MACRO     : MBlazePseudo<(outs), (ins), ".set macro",     []>;
def REORDER   : MBlazePseudo<(outs), (ins), ".set reorder",   []>;
def NOMACRO   : MBlazePseudo<(outs), (ins), ".set nomacro",   []>;
def NOREORDER : MBlazePseudo<(outs), (ins), ".set noreorder", []>;

// When handling PIC code the assembler needs .cpload and .cprestore
// directives. If the real instructions corresponding to these directives
// are used, we have the same behavior, but get also a bunch of warnings
// from the assembler.
def CPLOAD : MBlazePseudo<(outs), (ins CPURegs:$reg), ".cpload $reg", []>;
def CPRESTORE : MBlazePseudo<(outs), (ins uimm16:$l), ".cprestore $l\n", []>;

//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Arithmetic Instructions
//===----------------------------------------------------------------------===//
// Register-register form, printed "op $dst, $b, $c".
class Arith<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
            InstrItinClass itin> :
  TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
     !strconcat(instr_asm, " $dst, $b, $c"),
     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;

// Register-immediate form.
class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
             Operand Od, PatLeaf imm_type> :
  TAI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
      !strconcat(instr_asm, " $dst, $b, $c"),
      [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))], IIAlu>;

// Reversed-operand register form (rsub-style: pattern operands swapped
// relative to the printed order).
class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
             InstrItinClass itin> :
  TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b),
     !strconcat(instr_asm, " $dst, $c, $b"),
     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;

class ArithRI<bits<6> op, string instr_asm, SDNode OpNode,
              Operand Od, PatLeaf imm_type> :
  TAI<op, (outs CPURegs:$dst), (ins Od:$b, CPURegs:$c),
      !strconcat(instr_asm, " $dst, $c, $b"),
      [(set CPURegs:$dst, (OpNode imm_type:$b, CPURegs:$c))], IIAlu>;

// "N" variants carry no selection pattern (not matched by ISel directly).
class ArithN<bits<6> op, bits<11> flags, string instr_asm,
             InstrItinClass itin> :
  TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
     !strconcat(instr_asm, " $dst, $b, $c"),
     [], itin>;

class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
  TAI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
      !strconcat(instr_asm, " $dst, $b, $c"),
      [], IIAlu>;

class ArithRN<bits<6> op, bits<11> flags, string instr_asm,
              InstrItinClass itin> :
  TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b),
     !strconcat(instr_asm, " $dst, $b, $c"),
     [], itin>;

class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
  TAI<op, (outs CPURegs:$dst), (ins Od:$c, CPURegs:$b),
      !strconcat(instr_asm, " $dst, $b, $c"),
      [], IIAlu>;

//===----------------------------------------------------------------------===//
// Misc Arithmetic Instructions
//===----------------------------------------------------------------------===//

class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> :
  TA<op, flags, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
     !strconcat(instr_asm, " $dst, $b, $c"),
     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;

class LogicI<bits<6> op, string instr_asm, SDNode OpNode> :
  TAI<op, (outs CPURegs:$dst), (ins CPURegs:$b, uimm16:$c),
      !strconcat(instr_asm, " $dst, $b, $c"),
      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))],
      IIAlu>;

// Materialize a stack-slot address with "addi" (opcode 0x08).
class EffectiveAddress<string instr_asm> :
  TAI<0x08, (outs CPURegs:$dst), (ins memri:$addr),
      instr_asm, [(set CPURegs:$dst, iaddr:$addr)], IIAlu>;

//===----------------------------------------------------------------------===//
// Memory Access Instructions
//===----------------------------------------------------------------------===//
// Register+register addressing (xaddr) load.
class LoadM<bits<6> op, string instr_asm, PatFrag OpNode> :
  TA<op, 0x000, (outs CPURegs:$dst), (ins memrr:$addr),
     !strconcat(instr_asm, " $dst, $addr"),
     [(set CPURegs:$dst, (OpNode xaddr:$addr))], IILoad>;

// Register+immediate addressing (iaddr) load.
class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> :
  TAI<op, (outs CPURegs:$dst), (ins memri:$addr),
      !strconcat(instr_asm, " $dst, $addr"),
      [(set CPURegs:$dst, (OpNode iaddr:$addr))], IILoad>;

// Register+register addressing store.
class StoreM<bits<6> op, string instr_asm, PatFrag OpNode> :
  TA<op, 0x000, (outs), (ins CPURegs:$dst, memrr:$addr),
     !strconcat(instr_asm, " $dst, $addr"),
     [(OpNode CPURegs:$dst, xaddr:$addr)], IIStore>;

// Register+immediate addressing store.
class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
  TAI<op, (outs), (ins CPURegs:$dst, memri:$addr),
      !strconcat(instr_asm, " $dst, $addr"),
      [(OpNode CPURegs:$dst, iaddr:$addr)], IIStore>;

//===----------------------------------------------------------------------===//
// Branch Instructions
//===----------------------------------------------------------------------===//
// Unconditional indirect branch through a register.
class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
  TBR<op, br, flags, (outs), (ins CPURegs:$target),
      !strconcat(instr_asm, " $target"),
      [(brind CPURegs:$target)], IIBranch>;

// Unconditional direct branch to an immediate target.
class BranchI<bits<6> op, bits<5> brf, string instr_asm> :
  TBRI<op, brf, (outs), (ins brtarget:$target),
       !strconcat(instr_asm, " $target"),
       [(br bb:$target)], IIBranch>;

//===----------------------------------------------------------------------===//
// Branch and Link Instructions
//===----------------------------------------------------------------------===//
// Link register is fixed to r15 in the printed form.
class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
  TBRL<op, br, flags, (outs), (ins CPURegs:$target),
       !strconcat(instr_asm, " r15, $target"),
       [], IIBranch>;

class BranchLI<bits<6> op, bits<5> br, string instr_asm> :
  TBRLI<op, br, (outs), (ins calltarget:$target),
        !strconcat(instr_asm, " r15, $target"),
        [], IIBranch>;

//===----------------------------------------------------------------------===//
// Conditional Branch Instructions
//===----------------------------------------------------------------------===//
// Conditional branches carry no selection pattern here; brcond is matched
// by the explicit Pat<> rules at the end of this file.
class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm,
              PatFrag cond_op> :
  TBRC<op, br, flags, (outs),
       (ins CPURegs:$a, CPURegs:$b, brtarget:$offset),
       !strconcat(instr_asm, " $a, $b, $offset"),
       [], IIBranch>;
       //(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)],
       //IIBranch>;

class BranchCI<bits<6> op, bits<5> br, string instr_asm,
               PatFrag cond_op> :
  TBRCI<op, br, (outs), (ins CPURegs:$a, brtarget:$offset),
        !strconcat(instr_asm, " $a, $offset"),
        [], IIBranch>;

//===----------------------------------------------------------------------===//
// MBlaze arithmetic instructions
//===----------------------------------------------------------------------===//

let isCommutable = 1, isAsCheapAsAMove = 1 in {
  def ADD    : Arith<0x00, 0x000, "add ", add, IIAlu>;
  def ADDC   : Arith<0x02, 0x000, "addc ", adde, IIAlu>;
  def ADDK   : Arith<0x04, 0x000, "addk ", addc, IIAlu>;
  def ADDKC  : ArithN<0x06, 0x000, "addkc ", IIAlu>;
  def AND    : Logic<0x21, 0x000, "and ", and>;
  def OR     : Logic<0x20, 0x000, "or ", or>;
  def XOR    : Logic<0x22, 0x000, "xor ", xor>;
}

let isAsCheapAsAMove = 1 in {
  def ANDN   : ArithN<0x23, 0x000, "andn ", IIAlu>;
  def CMP    : ArithN<0x05, 0x001, "cmp ", IIAlu>;
  def CMPU   : ArithN<0x05, 0x003, "cmpu ", IIAlu>;
  def RSUB   : ArithR<0x01, 0x000, "rsub ", sub, IIAlu>;
  def RSUBC  : ArithR<0x03, 0x000, "rsubc ", sube, IIAlu>;
  def RSUBK  : ArithR<0x05, 0x000, "rsubk ", subc, IIAlu>;
  def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIAlu>;
}

let isCommutable = 1, Predicates=[HasMul] in {
  def MUL : Arith<0x10, 0x000, "mul ", mul, IIAlu>;
}

let isCommutable = 1, Predicates=[HasMul,HasMul64] in {
  def MULH  : Arith<0x10, 0x001, "mulh ", mulhs, IIAlu>;
  def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIAlu>;
}

let Predicates=[HasMul,HasMul64] in {
  def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIAlu>;
}

let Predicates=[HasBarrel] in {
  def BSRL  : Arith<0x11, 0x000, "bsrl ", srl, IIAlu>;
  def BSRA  : Arith<0x11, 0x200, "bsra ", sra, IIAlu>;
  def BSLL  : Arith<0x11, 0x400, "bsll ", shl, IIAlu>;
  def BSRLI : ArithI<0x11, "bsrli ", srl, uimm5, immZExt5>;
  def BSRAI : ArithI<0x11, "bsrai ", sra, uimm5, immZExt5>;
  def BSLLI : ArithI<0x11, "bslli ", shl, uimm5, immZExt5>;
}

let Predicates=[HasDiv] in {
  def IDIV  : Arith<0x12, 0x000, "idiv ", sdiv, IIAlu>;
  def
IDIVU : Arith<0x12, 0x002, "idivu ", udiv, IIAlu>;
}

//===----------------------------------------------------------------------===//
// MBlaze immediate mode arithmetic instructions
//===----------------------------------------------------------------------===//

let isAsCheapAsAMove = 1 in {
  def ADDI    : ArithI<0x08, "addi ", add, simm16, immSExt16>;
  def ADDIC   : ArithNI<0x0A, "addic ", simm16, immSExt16>;
  def ADDIK   : ArithNI<0x0C, "addik ", simm16, immSExt16>;
  def ADDIKC  : ArithI<0x0E, "addikc ", addc, simm16, immSExt16>;
  def RSUBI   : ArithRI<0x09, "rsubi ", sub, simm16, immSExt16>;
  // FIX: RSUBIC previously printed "rsubi" and RSUBIK was defined with
  // opcode 0x0E (colliding with ADDIKC) and mnemonic "rsubic".  Following
  // the add/rsub opcode pairing used throughout this file (0x08/0x09,
  // 0x0A/0x0B, 0x0C/0x0D, 0x0E/0x0F) and the MicroBlaze ISA, rsubic is
  // 0x0B and rsubik is 0x0D, each printing its own mnemonic.
  def RSUBIC  : ArithRNI<0x0B, "rsubic ", simm16, immSExt16>;
  def RSUBIK  : ArithRNI<0x0D, "rsubik ", simm16, immSExt16>;
  def RSUBIKC : ArithRI<0x0F, "rsubikc", subc, simm16, immSExt16>;
  def ANDNI   : ArithNI<0x2B, "andni ", uimm16, immZExt16>;
  def ANDI    : LogicI<0x29, "andi ", and>;
  def ORI     : LogicI<0x28, "ori ", or>;
  def XORI    : LogicI<0x2A, "xori ", xor>;
}

let Predicates=[HasMul] in {
  def MULI : ArithI<0x18, "muli ", mul, simm16, immSExt16>;
}

//===----------------------------------------------------------------------===//
// MBlaze memory access instructions
//===----------------------------------------------------------------------===//

// NOTE(review): the *I (immediate) loads/stores reuse the register-form
// opcode values; whether TAI adds the immediate-form opcode bit is decided
// in MBlazeInstrFormats.td — confirm there before emitting binaries.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def LBU : LoadM<0x30, "lbu ", zextloadi8>;
  def LHU : LoadM<0x31, "lhu ", zextloadi16>;
  def LW  : LoadM<0x32, "lw ", load>;

  def LBUI : LoadMI<0x30, "lbui ", zextloadi8>;
  def LHUI : LoadMI<0x31, "lhui ", zextloadi16>;
  def LWI  : LoadMI<0x32, "lwi ", load>;
}

  def SB : StoreM<0x34, "sb ", truncstorei8>;
  def SH : StoreM<0x35, "sh ", truncstorei16>;
  def SW : StoreM<0x36, "sw ", store>;

  def SBI : StoreMI<0x34, "sbi ", truncstorei8>;
  def SHI : StoreMI<0x35, "shi ", truncstorei16>;
  def SWI : StoreMI<0x36, "swi ", store>;

//===----------------------------------------------------------------------===//
// MBlaze branch instructions
//===----------------------------------------------------------------------===//

// Direct (immediate-target) branches, no delay slot.
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
  def BRI   : BranchI<0x2E, 0x00, "bri ">;
  def BRAI  : BranchI<0x2E, 0x08, "brai ">;
  def BEQI  : BranchCI<0x2F, 0x00, "beqi ", seteq>;
  def BNEI  : BranchCI<0x2F, 0x01, "bnei ", setne>;
  def BLTI  : BranchCI<0x2F, 0x02, "blti ", setlt>;
  def BLEI  : BranchCI<0x2F, 0x03, "blei ", setle>;
  def BGTI  : BranchCI<0x2F, 0x04, "bgti ", setgt>;
  def BGEI  : BranchCI<0x2F, 0x05, "bgei ", setge>;
}

// Register-target branches, no delay slot.
let isBranch = 1, isIndirectBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
  def BR   : Branch<0x26, 0x00, 0x000, "br ">;
  def BRA  : Branch<0x26, 0x08, 0x000, "bra ">;
  def BEQ  : BranchC<0x27, 0x00, 0x000, "beq ", seteq>;
  def BNE  : BranchC<0x27, 0x01, 0x000, "bne ", setne>;
  def BLT  : BranchC<0x27, 0x02, 0x000, "blt ", setlt>;
  def BLE  : BranchC<0x27, 0x03, 0x000, "ble ", setle>;
  def BGT  : BranchC<0x27, 0x04, 0x000, "bgt ", setgt>;
  def BGE  : BranchC<0x27, 0x05, 0x000, "bge ", setge>;
}

// "D" variants execute the delay-slot instruction.
let isBranch = 1, isTerminator = 1, hasDelaySlot = 1, hasCtrlDep = 1 in {
  def BRID  : BranchI<0x2E, 0x10, "brid ">;
  def BRAID : BranchI<0x2E, 0x18, "braid ">;
  def BEQID : BranchCI<0x2F, 0x10, "beqid ", seteq>;
  def BNEID : BranchCI<0x2F, 0x11, "bneid ", setne>;
  def BLTID : BranchCI<0x2F, 0x12, "bltid ", setlt>;
  def BLEID : BranchCI<0x2F, 0x13, "bleid ", setle>;
  def BGTID : BranchCI<0x2F, 0x14, "bgtid ", setgt>;
  def BGEID : BranchCI<0x2F, 0x15, "bgeid ", setge>;
}

let isBranch = 1, isIndirectBranch = 1, isTerminator = 1,
    hasDelaySlot = 1, hasCtrlDep = 1 in {
  def BRD  : Branch<0x26, 0x10, 0x000, "brd ">;
  def BRAD : Branch<0x26, 0x18, 0x000, "brad ">;
  def BEQD : BranchC<0x27, 0x10, 0x000, "beqd ", seteq>;
  def BNED : BranchC<0x27, 0x11, 0x000, "bned ", setne>;
  def BLTD : BranchC<0x27, 0x12, 0x000, "bltd ", setlt>;
  def BLED : BranchC<0x27, 0x13, 0x000, "bled ", setle>;
  def BGTD : BranchC<0x27, 0x14, 0x000, "bgtd ", setgt>;
  def BGED : BranchC<0x27, 0x15, 0x000, "bged ", setge>;
}

// Calls clobber the caller-saved registers and use the argument registers.
let isCall = 1, hasCtrlDep = 1, isIndirectBranch = 1,
    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12],
    Uses = [R1,R5,R6,R7,R8,R9,R10] in {
  def BRL  : BranchL<0x26, 0x04, 0x000, "brl ">;
  def BRAL : BranchL<0x26, 0x0C, 0x000, "bral ">;
}

let isCall = 1, hasDelaySlot = 1, hasCtrlDep = 1,
    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12],
    Uses = [R1,R5,R6,R7,R8,R9,R10] in {
  def BRLID  : BranchLI<0x2E, 0x14, "brlid ">;
  def BRALID : BranchLI<0x2E, 0x1C, "bralid ">;
}

let isCall = 1, hasDelaySlot = 1, hasCtrlDep = 1, isIndirectBranch = 1,
    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12],
    Uses = [R1,R5,R6,R7,R8,R9,R10] in {
  def BRLD  : BranchL<0x26, 0x14, 0x000, "brld ">;
  def BRALD : BranchL<0x26, 0x1C, 0x000, "brald ">;
}

// Return: "rtsd $target, 8" skips the caller's delay-slot pair.
let isReturn=1, isTerminator=1, hasDelaySlot=1,
    isBarrier=1, hasCtrlDep=1, imm16=0x8 in {
  def RTSD : TRET<0x2D, (outs), (ins CPURegs:$target),
                  "rtsd $target, 8",
                  [(MBlazeRet CPURegs:$target)],
                  IIBranch>;
}

//===----------------------------------------------------------------------===//
// MBlaze misc instructions
//===----------------------------------------------------------------------===//

let addr = 0 in {
  def NOP : TADDR<0x00, (outs), (ins), "nop ", [], IIAlu>;
}

// Pseudos expanded by custom inserters in MBlazeISelLowering.
let usesCustomInserter = 1 in {
  //class PseudoSelCC<RegisterClass RC, string asmstr>:
  //  MBlazePseudo<(outs RC:$D), (ins RC:$T, RC:$F, CPURegs:$CMP), asmstr,
  //  [(set RC:$D, (MBlazeSelectCC RC:$T, RC:$F, CPURegs:$CMP))]>;
  //def Select_CC : PseudoSelCC<CPURegs, "# MBlazeSelect_CC">;

  def Select_CC : MBlazePseudo<(outs CPURegs:$dst),
    (ins CPURegs:$T, CPURegs:$F, CPURegs:$CMP, i32imm:$CC),
    "; SELECT_CC PSEUDO!",
    []>;

  def ShiftL : MBlazePseudo<(outs CPURegs:$dst),
    (ins CPURegs:$L, CPURegs:$R),
    "; ShiftL PSEUDO!",
    []>;

  def ShiftRA : MBlazePseudo<(outs CPURegs:$dst),
    (ins CPURegs:$L, CPURegs:$R),
    "; ShiftRA PSEUDO!",
    []>;

  def ShiftRL :
MBlazePseudo<(outs CPURegs:$dst),
    (ins CPURegs:$L, CPURegs:$R),
    "; ShiftRL PSEUDO!",
    []>;
}


// Single-operand ALU forms (rb field fixed to zero).
let rb = 0 in {
  def SEXT16 : TA<0x24, 0x061, (outs CPURegs:$dst), (ins CPURegs:$src),
                  "sext16 $dst, $src", [], IIAlu>;
  def SEXT8  : TA<0x24, 0x060, (outs CPURegs:$dst), (ins CPURegs:$src),
                  "sext8 $dst, $src", [], IIAlu>;
  def SRL    : TA<0x24, 0x041, (outs CPURegs:$dst), (ins CPURegs:$src),
                  "srl $dst, $src", [], IIAlu>;
  def SRA    : TA<0x24, 0x001, (outs CPURegs:$dst), (ins CPURegs:$src),
                  "sra $dst, $src", [], IIAlu>;
  def SRC    : TA<0x24, 0x021, (outs CPURegs:$dst), (ins CPURegs:$src),
                  "src $dst, $src", [], IIAlu>;
}

def LEA_ADDI : EffectiveAddress<"addi $dst, ${addr:stackloc}">;

//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//

// Small immediates
def : Pat<(i32 0), (ADD R0, R0)>;
def : Pat<(i32 immSExt16:$imm), (ADDI R0, imm:$imm)>;
def : Pat<(i32 immZExt16:$imm), (ORI R0, imm:$imm)>;

// Arbitrary immediates
// NOTE(review): this fallback feeds any 32-bit immediate to ADDI, whose
// field only holds 16 bits — confirm wide constants are materialized in
// two instructions elsewhere before relying on this pattern.
def : Pat<(i32 imm:$imm), (ADDI R0, imm:$imm)>;

// In register sign extension
def : Pat<(sext_inreg CPURegs:$src, i16), (SEXT16 CPURegs:$src)>;
def : Pat<(sext_inreg CPURegs:$src, i8), (SEXT8 CPURegs:$src)>;

// Call
def : Pat<(MBlazeJmpLink (i32 tglobaladdr:$dst)), (BRLID tglobaladdr:$dst)>;
def : Pat<(MBlazeJmpLink (i32 texternalsym:$dst)),(BRLID texternalsym:$dst)>;
def : Pat<(MBlazeJmpLink CPURegs:$dst), (BRLD CPURegs:$dst)>;

// Shift Instructions
def : Pat<(shl CPURegs:$L, CPURegs:$R), (ShiftL CPURegs:$L, CPURegs:$R)>;
def : Pat<(sra CPURegs:$L, CPURegs:$R), (ShiftRA CPURegs:$L, CPURegs:$R)>;
def : Pat<(srl CPURegs:$L, CPURegs:$R), (ShiftRL CPURegs:$L, CPURegs:$R)>;

// SET_CC operations: materialize 1/0 through Select_CC on a cmp result;
// the trailing integer is the condition code consumed by the inserter.
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETEQ),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 1)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETNE),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 2)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETGT),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 3)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETLT),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 4)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETGE),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 5)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETLE),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMP CPURegs:$L, CPURegs:$R), 6)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETUGT),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMPU CPURegs:$L, CPURegs:$R), 3)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETULT),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMPU CPURegs:$L, CPURegs:$R), 4)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETUGE),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMPU CPURegs:$L, CPURegs:$R), 5)>;
def : Pat<(setcc CPURegs:$L, CPURegs:$R, SETULE),
          (Select_CC (ADDI R0, 1), (ADDI R0, 0),
                     (CMPU CPURegs:$L, CPURegs:$R), 6)>;

// SELECT operations
def : Pat<(select CPURegs:$C, CPURegs:$T, CPURegs:$F),
          (Select_CC CPURegs:$T, CPURegs:$F, CPURegs:$C, 2)>;

// SELECT_CC
def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETEQ),
          (Select_CC CPURegs:$T, CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 1)>;
def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETNE),
          (Select_CC CPURegs:$T, CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 2)>;
def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETGT),
          (Select_CC CPURegs:$T, CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 3)>;
def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETLT),
          (Select_CC CPURegs:$T, CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 4)>;
def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETGE),
          (Select_CC CPURegs:$T,
CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 5)>; +def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETLE), + (Select_CC CPURegs:$T, CPURegs:$F, (CMP CPURegs:$L, CPURegs:$R), 6)>; +def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETUGT), + (Select_CC CPURegs:$T, CPURegs:$F, (CMPU CPURegs:$L, CPURegs:$R), 3)>; +def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETULT), + (Select_CC CPURegs:$T, CPURegs:$F, (CMPU CPURegs:$L, CPURegs:$R), 4)>; +def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETUGE), + (Select_CC CPURegs:$T, CPURegs:$F, (CMPU CPURegs:$L, CPURegs:$R), 5)>; +def : Pat<(selectcc CPURegs:$L, CPURegs:$R, CPURegs:$T, CPURegs:$F, SETULE), + (Select_CC CPURegs:$T, CPURegs:$F, (CMPU CPURegs:$L, CPURegs:$R), 6)>; + +// BRCOND instructions +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETEQ), bb:$T), + (BEQID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETNE), bb:$T), + (BNEID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETGT), bb:$T), + (BGTID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETLT), bb:$T), + (BLTID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETGE), bb:$T), + (BGEID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETLE), bb:$T), + (BLEID (CMP CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETUGT), bb:$T), + (BGTID (CMPU CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETULT), bb:$T), + (BLTID (CMPU CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETUGE), bb:$T), + (BGEID (CMPU CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond (setcc CPURegs:$L, CPURegs:$R, SETULE), bb:$T), + (BLEID (CMPU CPURegs:$R, CPURegs:$L), bb:$T)>; +def : Pat<(brcond 
CPURegs:$C, bb:$T), + (BNEID CPURegs:$C, bb:$T)>; + +// Jump tables, global addresses, and constant pools +def : Pat<(MBWrapper tglobaladdr:$in), (ORI R0, tglobaladdr:$in)>; +def : Pat<(MBWrapper tjumptable:$in), (ORI R0, tjumptable:$in)>; +def : Pat<(MBWrapper tconstpool:$in), (ORI R0, tconstpool:$in)>; + +// Misc instructions +def : Pat<(and CPURegs:$lh, (not CPURegs:$rh)),(ANDN CPURegs:$lh, CPURegs:$rh)>; + +// Arithmetic with immediates +def : Pat<(add CPURegs:$in, imm:$imm),(ADDI CPURegs:$in, imm:$imm)>; +def : Pat<(or CPURegs:$in, imm:$imm),(ORI CPURegs:$in, imm:$imm)>; +def : Pat<(xor CPURegs:$in, imm:$imm),(XORI CPURegs:$in, imm:$imm)>; + +// extended load and stores +def : Pat<(extloadi1 iaddr:$src), (LBUI iaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), (LBUI iaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), (LHUI iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), (LBU xaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), (LBU xaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), (LHU xaddr:$src)>; + +def : Pat<(sextloadi1 iaddr:$src), (SEXT8 (LBUI iaddr:$src))>; +def : Pat<(sextloadi8 iaddr:$src), (SEXT8 (LBUI iaddr:$src))>; +def : Pat<(sextloadi16 iaddr:$src), (SEXT16 (LHUI iaddr:$src))>; +def : Pat<(sextloadi1 xaddr:$src), (SEXT8 (LBU xaddr:$src))>; +def : Pat<(sextloadi8 xaddr:$src), (SEXT8 (LBU xaddr:$src))>; +def : Pat<(sextloadi16 xaddr:$src), (SEXT16 (LHU xaddr:$src))>; + +// peepholes +def : Pat<(store (i32 0), iaddr:$dst), (SWI R0, iaddr:$dst)>; + +//===----------------------------------------------------------------------===// +// Floating Point Support +//===----------------------------------------------------------------------===// +include "MBlazeInstrFSL.td" +include "MBlazeInstrFPU.td" diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp new file mode 100644 index 0000000..c8faffc --- /dev/null +++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp @@ -0,0 +1,109 @@ +//===- MBlazeIntrinsicInfo.cpp - Intrinsic 
Information -00-------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of TargetIntrinsicInfo. +// +//===----------------------------------------------------------------------===// + +#include "MBlazeIntrinsicInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Support/raw_ostream.h" +#include <cstring> + +using namespace llvm; + +namespace mblazeIntrinsic { + + enum ID { + last_non_mblaze_intrinsic = Intrinsic::num_intrinsics-1, +#define GET_INTRINSIC_ENUM_VALUES +#include "MBlazeGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_mblaze_intrinsics + }; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "MBlazeGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +} + +std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, const Type **Tys, + unsigned numTys) const { + static const char *const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "MBlazeGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded"); + if (IntrID < Intrinsic::num_intrinsics) + return 0; + assert(IntrID < mblazeIntrinsic::num_mblaze_intrinsics && + "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned MBlazeIntrinsicInfo:: +lookupName(const char *Name, unsigned Len) const { +#define GET_FUNCTION_RECOGNIZER +#include "MBlazeGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + return 0; +} + +unsigned MBlazeIntrinsicInfo:: +lookupGCCName(const char *Name) const { + return mblazeIntrinsic::getIntrinsicForGCCBuiltin("mblaze",Name); +} + +bool 
MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const { + // Overload Table + const bool OTable[] = { +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "MBlazeGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE + }; + if (IntrID == 0) + return false; + else + return OTable[IntrID - Intrinsic::num_intrinsics]; +} + +/// This defines the "getAttributes(ID id)" method. +#define GET_INTRINSIC_ATTRIBUTES +#include "MBlazeGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES + +static const FunctionType *getType(LLVMContext &Context, unsigned id) { + const Type *ResultTy = NULL; + std::vector<const Type*> ArgTys; + bool IsVarArg = false; + +#define GET_INTRINSIC_GENERATOR +#include "MBlazeGenIntrinsics.inc" +#undef GET_INTRINSIC_GENERATOR + + return FunctionType::get(ResultTy, ArgTys, IsVarArg); +} + +Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + const Type **Tys, + unsigned numTy) const { + assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded"); + AttrListPtr AList = getAttributes((mblazeIntrinsic::ID) IntrID); + return cast<Function>(M->getOrInsertFunction(getName(IntrID), + getType(M->getContext(), IntrID), + AList)); +} diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h new file mode 100644 index 0000000..9804c77 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h @@ -0,0 +1,33 @@ +//===- MBlazeIntrinsicInfo.h - MBlaze Intrinsic Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of TargetIntrinsicInfo. 
+// +//===----------------------------------------------------------------------===// +#ifndef MBLAZEINTRINSICS_H +#define MBLAZEINTRINSICS_H + +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { + + class MBlazeIntrinsicInfo : public TargetIntrinsicInfo { + public: + std::string getName(unsigned IntrID, const Type **Tys = 0, + unsigned numTys = 0) const; + unsigned lookupName(const char *Name, unsigned Len) const; + unsigned lookupGCCName(const char *Name) const; + bool isOverloaded(unsigned IID) const; + Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0, + unsigned numTys = 0) const; + }; + +} + +#endif diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td new file mode 100644 index 0000000..76eb563 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeIntrinsics.td @@ -0,0 +1,137 @@ +//===- IntrinsicsMBlaze.td - Defines MBlaze intrinsics -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the MicroBlaze-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Definitions for all MBlaze intrinsics. +// + +// MBlaze intrinsic classes. +let TargetPrefix = "mblaze", isTarget = 1 in { + class MBFSL_Get_Intrinsic : Intrinsic<[llvm_i32_ty], + [llvm_i32_ty], + [IntrWriteMem]>; + + class MBFSL_Put_Intrinsic : Intrinsic<[llvm_void_ty], + [llvm_i32_ty, llvm_i32_ty], + [IntrWriteMem]>; + + class MBFSL_PutT_Intrinsic : Intrinsic<[llvm_void_ty], + [llvm_i32_ty], + [IntrWriteMem]>; +} + +//===----------------------------------------------------------------------===// +// MicroBlaze FSL Get Intrinsic Definitions. 
+// + +def int_mblaze_fsl_get : GCCBuiltin<"__builtin_mblaze_fsl_get">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_aget : GCCBuiltin<"__builtin_mblaze_fsl_aget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_cget : GCCBuiltin<"__builtin_mblaze_fsl_cget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_caget : GCCBuiltin<"__builtin_mblaze_fsl_caget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_eget : GCCBuiltin<"__builtin_mblaze_fsl_eget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_eaget : GCCBuiltin<"__builtin_mblaze_fsl_eaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_ecget : GCCBuiltin<"__builtin_mblaze_fsl_ecget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_ecaget : GCCBuiltin<"__builtin_mblaze_fsl_ecaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_nget : GCCBuiltin<"__builtin_mblaze_fsl_nget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_naget : GCCBuiltin<"__builtin_mblaze_fsl_naget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_ncget : GCCBuiltin<"__builtin_mblaze_fsl_ncget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_ncaget : GCCBuiltin<"__builtin_mblaze_fsl_ncaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_neget : GCCBuiltin<"__builtin_mblaze_fsl_neget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_neaget : GCCBuiltin<"__builtin_mblaze_fsl_neaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_necget : GCCBuiltin<"__builtin_mblaze_fsl_necget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_necaget : GCCBuiltin<"__builtin_mblaze_fsl_necaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tget : GCCBuiltin<"__builtin_mblaze_fsl_tget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_taget : GCCBuiltin<"__builtin_mblaze_fsl_taget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tcget : GCCBuiltin<"__builtin_mblaze_fsl_tcget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tcaget : GCCBuiltin<"__builtin_mblaze_fsl_tcaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_teget : GCCBuiltin<"__builtin_mblaze_fsl_teget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_teaget : 
GCCBuiltin<"__builtin_mblaze_fsl_teaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tecget : GCCBuiltin<"__builtin_mblaze_fsl_tecget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tecaget : GCCBuiltin<"__builtin_mblaze_fsl_tecaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tnget : GCCBuiltin<"__builtin_mblaze_fsl_tnget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tnaget : GCCBuiltin<"__builtin_mblaze_fsl_tnaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tncget : GCCBuiltin<"__builtin_mblaze_fsl_tncget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tncaget : GCCBuiltin<"__builtin_mblaze_fsl_tncaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tneget : GCCBuiltin<"__builtin_mblaze_fsl_tneget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tneaget : GCCBuiltin<"__builtin_mblaze_fsl_tneaget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tnecget : GCCBuiltin<"__builtin_mblaze_fsl_tnecget">, + MBFSL_Get_Intrinsic; +def int_mblaze_fsl_tnecaget : GCCBuiltin<"__builtin_mblaze_fsl_tnecaget">, + MBFSL_Get_Intrinsic; + +//===----------------------------------------------------------------------===// +// MicroBlaze FSL Put Intrinsic Definitions. 
+// + +def int_mblaze_fsl_put : GCCBuiltin<"__builtin_mblaze_fsl_put">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_aput : GCCBuiltin<"__builtin_mblaze_fsl_aput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_cput : GCCBuiltin<"__builtin_mblaze_fsl_cput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_caput : GCCBuiltin<"__builtin_mblaze_fsl_caput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_nput : GCCBuiltin<"__builtin_mblaze_fsl_nput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_naput : GCCBuiltin<"__builtin_mblaze_fsl_naput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_ncput : GCCBuiltin<"__builtin_mblaze_fsl_ncput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_ncaput : GCCBuiltin<"__builtin_mblaze_fsl_ncaput">, + MBFSL_Put_Intrinsic; +def int_mblaze_fsl_tput : GCCBuiltin<"__builtin_mblaze_fsl_tput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_taput : GCCBuiltin<"__builtin_mblaze_fsl_taput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tcput : GCCBuiltin<"__builtin_mblaze_fsl_tcput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tcaput : GCCBuiltin<"__builtin_mblaze_fsl_tcaput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tnput : GCCBuiltin<"__builtin_mblaze_fsl_tnput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tnaput : GCCBuiltin<"__builtin_mblaze_fsl_tnaput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tncput : GCCBuiltin<"__builtin_mblaze_fsl_tncput">, + MBFSL_PutT_Intrinsic; +def int_mblaze_fsl_tncaput : GCCBuiltin<"__builtin_mblaze_fsl_tncaput">, + MBFSL_PutT_Intrinsic; diff --git a/lib/Target/MBlaze/MBlazeMCAsmInfo.cpp b/lib/Target/MBlaze/MBlazeMCAsmInfo.cpp new file mode 100644 index 0000000..7ae465d --- /dev/null +++ b/lib/Target/MBlaze/MBlazeMCAsmInfo.cpp @@ -0,0 +1,27 @@ +//===-- MBlazeMCAsmInfo.cpp - MBlaze asm properties -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the MBlazeMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "MBlazeMCAsmInfo.h" +using namespace llvm; + +MBlazeMCAsmInfo::MBlazeMCAsmInfo(const Target &T, const StringRef &TT) { + AlignmentIsInBytes = false; + Data16bitsDirective = "\t.half\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = 0; + PrivateGlobalPrefix = "$"; + CommentString = "#"; + ZeroDirective = "\t.space\t"; + GPRel32Directive = "\t.gpword\t"; + HasSetDirective = false; +} diff --git a/lib/Target/MBlaze/MBlazeMCAsmInfo.h b/lib/Target/MBlaze/MBlazeMCAsmInfo.h new file mode 100644 index 0000000..bccb418 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeMCAsmInfo.h @@ -0,0 +1,30 @@ +//=====-- MBlazeMCAsmInfo.h - MBlaze asm properties -----------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MBlazeMCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MBLAZETARGETASMINFO_H +#define MBLAZETARGETASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" + +namespace llvm { + class Target; + class StringRef; + + class MBlazeMCAsmInfo : public MCAsmInfo { + public: + explicit MBlazeMCAsmInfo(const Target &T, const StringRef &TT); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/MBlaze/MBlazeMachineFunction.h b/lib/Target/MBlaze/MBlazeMachineFunction.h new file mode 100644 index 0000000..08d4dca --- /dev/null +++ b/lib/Target/MBlaze/MBlazeMachineFunction.h @@ -0,0 +1,136 @@ +//===-- MBlazeMachineFunctionInfo.h - Private data ----------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the MBlaze specific subclass of MachineFunctionInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef MBLAZE_MACHINE_FUNCTION_INFO_H +#define MBLAZE_MACHINE_FUNCTION_INFO_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/VectorExtras.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" + +namespace llvm { + +/// MBlazeFunctionInfo - This class is derived from MachineFunction private +/// MBlaze target-specific information for each MachineFunction. +class MBlazeFunctionInfo : public MachineFunctionInfo { + +private: + /// Holds for each function where on the stack the Frame Pointer must be + /// saved. This is used on Prologue and Epilogue to emit FP save/restore + int FPStackOffset; + + /// Holds for each function where on the stack the Return Address must be + /// saved. 
This is used on Prologue and Epilogue to emit RA save/restore + int RAStackOffset; + + /// At each function entry a special bitmask directive must be emitted + /// to help in debugging CPU callee saved registers. It needs a negative + /// offset from the final stack size and its higher register location on + /// the stack. + int CPUTopSavedRegOff; + + /// MBlazeFIHolder - Holds a FrameIndex and it's Stack Pointer Offset + struct MBlazeFIHolder { + + int FI; + int SPOffset; + + MBlazeFIHolder(int FrameIndex, int StackPointerOffset) + : FI(FrameIndex), SPOffset(StackPointerOffset) {} + }; + + /// When PIC is used the GP must be saved on the stack on the function + /// prologue and must be reloaded from this stack location after every + /// call. A reference to its stack location and frame index must be kept + /// to be used on emitPrologue and processFunctionBeforeFrameFinalized. + MBlazeFIHolder GPHolder; + + /// On LowerFormalArguments the stack size is unknown, so the Stack + /// Pointer Offset calculation of "not in register arguments" must be + /// postponed to emitPrologue. + SmallVector<MBlazeFIHolder, 16> FnLoadArgs; + bool HasLoadArgs; + + // When VarArgs, we must write registers back to caller stack, preserving + // on register arguments. Since the stack size is unknown on + // LowerFormalArguments, the Stack Pointer Offset calculation must be + // postponed to emitPrologue. + SmallVector<MBlazeFIHolder, 4> FnStoreVarArgs; + bool HasStoreVarArgs; + + /// SRetReturnReg - Some subtargets require that sret lowering includes + /// returning the value of the returned struct in a register. This field + /// holds the virtual register into which the sret argument is passed. + unsigned SRetReturnReg; + + /// GlobalBaseReg - keeps track of the virtual register initialized for + /// use as the global base register. This is used for PIC in some PIC + /// relocation models. 
+ unsigned GlobalBaseReg; + +public: + MBlazeFunctionInfo(MachineFunction& MF) + : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0), + GPHolder(-1,-1), HasLoadArgs(false), HasStoreVarArgs(false), + SRetReturnReg(0), GlobalBaseReg(0) + {} + + int getFPStackOffset() const { return FPStackOffset; } + void setFPStackOffset(int Off) { FPStackOffset = Off; } + + int getRAStackOffset() const { return RAStackOffset; } + void setRAStackOffset(int Off) { RAStackOffset = Off; } + + int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; } + void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; } + + int getGPStackOffset() const { return GPHolder.SPOffset; } + int getGPFI() const { return GPHolder.FI; } + void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; } + void setGPFI(int FI) { GPHolder.FI = FI; } + bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; } + + bool hasLoadArgs() const { return HasLoadArgs; } + bool hasStoreVarArgs() const { return HasStoreVarArgs; } + + void recordLoadArgsFI(int FI, int SPOffset) { + if (!HasLoadArgs) HasLoadArgs=true; + FnLoadArgs.push_back(MBlazeFIHolder(FI, SPOffset)); + } + void recordStoreVarArgsFI(int FI, int SPOffset) { + if (!HasStoreVarArgs) HasStoreVarArgs=true; + FnStoreVarArgs.push_back(MBlazeFIHolder(FI, SPOffset)); + } + + void adjustLoadArgsFI(MachineFrameInfo *MFI) const { + if (!hasLoadArgs()) return; + for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i) + MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset ); + } + void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const { + if (!hasStoreVarArgs()) return; + for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i) + MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset ); + } + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + unsigned getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(unsigned Reg) 
{ GlobalBaseReg = Reg; } +}; + +} // end of namespace llvm + +#endif // MBLAZE_MACHINE_FUNCTION_INFO_H diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp new file mode 100644 index 0000000..9067f8b --- /dev/null +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -0,0 +1,378 @@ +//===- MBlazeRegisterInfo.cpp - MBlaze Register Information -== -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mblaze-reg-info" + +#include "MBlaze.h" +#include "MBlazeSubtarget.h" +#include "MBlazeRegisterInfo.h" +#include "MBlazeMachineFunction.h" +#include "llvm/Constants.h" +#include "llvm/Type.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" + +using namespace llvm; + +MBlazeRegisterInfo:: +MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii) + : MBlazeGenRegisterInfo(MBlaze::ADJCALLSTACKDOWN, MBlaze::ADJCALLSTACKUP), + Subtarget(ST), TII(tii) {} + +/// getRegisterNumbering - Given the enum value for some register, e.g. 
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0). +unsigned MBlazeRegisterInfo::getRegisterNumbering(unsigned RegEnum) { + switch (RegEnum) { + case MBlaze::R0 : case MBlaze::F0 : return 0; + case MBlaze::R1 : case MBlaze::F1 : return 1; + case MBlaze::R2 : case MBlaze::F2 : return 2; + case MBlaze::R3 : case MBlaze::F3 : return 3; + case MBlaze::R4 : case MBlaze::F4 : return 4; + case MBlaze::R5 : case MBlaze::F5 : return 5; + case MBlaze::R6 : case MBlaze::F6 : return 6; + case MBlaze::R7 : case MBlaze::F7 : return 7; + case MBlaze::R8 : case MBlaze::F8 : return 8; + case MBlaze::R9 : case MBlaze::F9 : return 9; + case MBlaze::R10 : case MBlaze::F10 : return 10; + case MBlaze::R11 : case MBlaze::F11 : return 11; + case MBlaze::R12 : case MBlaze::F12 : return 12; + case MBlaze::R13 : case MBlaze::F13 : return 13; + case MBlaze::R14 : case MBlaze::F14 : return 14; + case MBlaze::R15 : case MBlaze::F15 : return 15; + case MBlaze::R16 : case MBlaze::F16 : return 16; + case MBlaze::R17 : case MBlaze::F17 : return 17; + case MBlaze::R18 : case MBlaze::F18 : return 18; + case MBlaze::R19 : case MBlaze::F19 : return 19; + case MBlaze::R20 : case MBlaze::F20 : return 20; + case MBlaze::R21 : case MBlaze::F21 : return 21; + case MBlaze::R22 : case MBlaze::F22 : return 22; + case MBlaze::R23 : case MBlaze::F23 : return 23; + case MBlaze::R24 : case MBlaze::F24 : return 24; + case MBlaze::R25 : case MBlaze::F25 : return 25; + case MBlaze::R26 : case MBlaze::F26 : return 26; + case MBlaze::R27 : case MBlaze::F27 : return 27; + case MBlaze::R28 : case MBlaze::F28 : return 28; + case MBlaze::R29 : case MBlaze::F29 : return 29; + case MBlaze::R30 : case MBlaze::F30 : return 30; + case MBlaze::R31 : case MBlaze::F31 : return 31; + default: llvm_unreachable("Unknown register number!"); + } + return 0; // Not reached +} + +unsigned MBlazeRegisterInfo::getPICCallReg() { + return MBlaze::R20; +} + 
+//===----------------------------------------------------------------------===// +// Callee Saved Registers methods +//===----------------------------------------------------------------------===// + +/// MBlaze Callee Saved Registers +const unsigned* MBlazeRegisterInfo:: +getCalleeSavedRegs(const MachineFunction *MF) const { + // MBlaze callee-save register range is R20 - R31 + static const unsigned CalleeSavedRegs[] = { + MBlaze::R20, MBlaze::R21, MBlaze::R22, MBlaze::R23, + MBlaze::R24, MBlaze::R25, MBlaze::R26, MBlaze::R27, + MBlaze::R28, MBlaze::R29, MBlaze::R30, MBlaze::R31, + 0 + }; + + return CalleeSavedRegs; +} + +/// MBlaze Callee Saved Register Classes +const TargetRegisterClass* const* MBlazeRegisterInfo:: +getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRC[] = { + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, + 0 + }; + + return CalleeSavedRC; +} + +BitVector MBlazeRegisterInfo:: +getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(MBlaze::R0); + Reserved.set(MBlaze::R1); + Reserved.set(MBlaze::R2); + Reserved.set(MBlaze::R13); + Reserved.set(MBlaze::R14); + Reserved.set(MBlaze::R15); + Reserved.set(MBlaze::R16); + Reserved.set(MBlaze::R17); + Reserved.set(MBlaze::R18); + Reserved.set(MBlaze::R19); + return Reserved; +} + +//===----------------------------------------------------------------------===// +// +// Stack Frame Processing methods +// +----------------------------+ +// +// The stack is allocated decrementing the stack pointer on +// the first instruction of a function prologue. 
Once decremented, +// all stack references are are done through a positive offset +// from the stack/frame pointer, so the stack is considered +// to grow up. +// +//===----------------------------------------------------------------------===// + +void MBlazeRegisterInfo::adjustMBlazeStackFrame(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); + + // See the description at MicroBlazeMachineFunction.h + int TopCPUSavedRegOff = -1; + + // Adjust CPU Callee Saved Registers Area. Registers RA and FP must + // be saved in this CPU Area there is the need. This whole Area must + // be aligned to the default Stack Alignment requirements. + unsigned StackOffset = MFI->getStackSize(); + unsigned RegSize = 4; + + // Replace the dummy '0' SPOffset by the negative offsets, as explained on + // LowerFORMAL_ARGUMENTS. Leaving '0' for while is necessary to avoid + // the approach done by calculateFrameObjectOffsets to the stack frame. + MBlazeFI->adjustLoadArgsFI(MFI); + MBlazeFI->adjustStoreVarArgsFI(MFI); + + if (hasFP(MF)) { + MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true), + StackOffset); + MBlazeFI->setFPStackOffset(StackOffset); + TopCPUSavedRegOff = StackOffset; + StackOffset += RegSize; + } + + if (MFI->hasCalls()) { + MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true), + StackOffset); + MBlazeFI->setRAStackOffset(StackOffset); + TopCPUSavedRegOff = StackOffset; + StackOffset += RegSize; + } + + // Update frame info + MFI->setStackSize(StackOffset); + + // Recalculate the final tops offset. The final values must be '0' + // if there isn't a callee saved register for CPU or FPU, otherwise + // a negative offset is needed. + if (TopCPUSavedRegOff >= 0) + MBlazeFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset); +} + +// hasFP - Return true if the specified function should have a dedicated frame +// pointer register. 
This is true if the function has variable sized allocas or +// if frame pointer elimination is disabled. +bool MBlazeRegisterInfo::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return NoFramePointerElim || MFI->hasVarSizedObjects(); +} + +// This function eliminate ADJCALLSTACKDOWN, +// ADJCALLSTACKUP pseudo instructions +void MBlazeRegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + +// FrameIndex represent objects inside a abstract stack. +// We must replace FrameIndex with an stack/frame pointer +// direct reference. +unsigned MBlazeRegisterInfo:: +eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + int *Value, RegScavenger *RS) const { + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + + unsigned i = 0; + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + unsigned oi = i == 2 ? 1 : 2; + + DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n"; + errs() << "<--------->\n" << MI); + + int FrameIndex = MI.getOperand(i).getIndex(); + int stackSize = MF.getFrameInfo()->getStackSize(); + int spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex); + + DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n" + << "spOffset : " << spOffset << "\n" + << "stackSize : " << stackSize << "\n"); + + // as explained on LowerFormalArguments, detect negative offsets + // and adjust SPOffsets considering the final stack size. + int Offset = ((spOffset < 0) ? 
(stackSize + (-(spOffset+4))) : (spOffset)); + Offset += MI.getOperand(oi).getImm(); + + DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n"); + + MI.getOperand(oi).ChangeToImmediate(Offset); + MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); + return 0; +} + +void MBlazeRegisterInfo:: +emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc dl = (MBBI != MBB.end() ? + MBBI->getDebugLoc() : DebugLoc::getUnknownLoc()); + + // Get the right frame order for MBlaze. + adjustMBlazeStackFrame(MF); + + // Get the number of bytes to allocate from the FrameInfo. + unsigned StackSize = MFI->getStackSize(); + + // No need to allocate space on the stack. + if (StackSize == 0 && !MFI->hasCalls()) return; + + int FPOffset = MBlazeFI->getFPStackOffset(); + int RAOffset = MBlazeFI->getRAStackOffset(); + + // Adjust stack : addi R1, R1, -imm + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADDI), MBlaze::R1) + .addReg(MBlaze::R1).addImm(-StackSize); + + // Save the return address only if the function isn't a leaf one. 
+ // swi R15, R1, stack_loc + if (MFI->hasCalls()) { + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::SWI)) + .addReg(MBlaze::R15).addImm(RAOffset).addReg(MBlaze::R1); + } + + // if framepointer enabled, save it and set it + // to point to the stack pointer + if (hasFP(MF)) { + // swi R19, R1, stack_loc + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::SWI)) + .addReg(MBlaze::R19).addImm(FPOffset).addReg(MBlaze::R1); + + // add R19, R1, R0 + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADD), MBlaze::R19) + .addReg(MBlaze::R1).addReg(MBlaze::R0); + } +} + +void MBlazeRegisterInfo:: +emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); + DebugLoc dl = MBBI->getDebugLoc(); + + // Get the number of bytes from FrameInfo + int NumBytes = (int) MFI->getStackSize(); + + // Get the FI's where RA and FP are saved. + int FPOffset = MBlazeFI->getFPStackOffset(); + int RAOffset = MBlazeFI->getRAStackOffset(); + + // if framepointer enabled, restore it and restore the + // stack pointer + if (hasFP(MF)) { + // add R1, R19, R0 + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADD), MBlaze::R1) + .addReg(MBlaze::R19).addReg(MBlaze::R0); + + // lwi R19, R1, stack_loc + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R19) + .addImm(FPOffset).addReg(MBlaze::R1); + } + + // Restore the return address only if the function isn't a leaf one. + // lwi R15, R1, stack_loc + if (MFI->hasCalls()) { + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R15) + .addImm(RAOffset).addReg(MBlaze::R1); + } + + // adjust stack. + // addi R1, R1, imm + if (NumBytes) { + BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADDI), MBlaze::R1) + .addReg(MBlaze::R1).addImm(NumBytes); + } +} + + +void MBlazeRegisterInfo:: +processFunctionBeforeFrameFinalized(MachineFunction &MF) const { + // Set the stack offset where GP must be saved/loaded from. 
+ MachineFrameInfo *MFI = MF.getFrameInfo(); + MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); + if (MBlazeFI->needGPSaveRestore()) + MFI->setObjectOffset(MBlazeFI->getGPFI(), MBlazeFI->getGPStackOffset()); +} + +unsigned MBlazeRegisterInfo::getRARegister() const { + return MBlaze::R15; +} + +unsigned MBlazeRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return hasFP(MF) ? MBlaze::R19 : MBlaze::R1; +} + +unsigned MBlazeRegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); + return 0; +} + +unsigned MBlazeRegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); + return 0; +} + +int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { + llvm_unreachable("What is the dwarf register number"); + return -1; +} + +#include "MBlazeGenRegisterInfo.inc" + diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h new file mode 100644 index 0000000..4847f1e --- /dev/null +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -0,0 +1,90 @@ +//===- MBlazeRegisterInfo.h - MBlaze Register Information Impl --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MBlaze implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef MBLAZEREGISTERINFO_H +#define MBLAZEREGISTERINFO_H + +#include "MBlaze.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "MBlazeGenRegisterInfo.h.inc" + +namespace llvm { +class MBlazeSubtarget; +class TargetInstrInfo; +class Type; + +namespace MBlaze { + /// SubregIndex - The index of various sized subregister classes. 
Note that + /// these indices must be kept in sync with the class indices in the + /// MBlazeRegisterInfo.td file. + enum SubregIndex { + SUBREG_FPEVEN = 1, SUBREG_FPODD = 2 + }; +} + +struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { + const MBlazeSubtarget &Subtarget; + const TargetInstrInfo &TII; + + MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget, + const TargetInstrInfo &tii); + + /// getRegisterNumbering - Given the enum value for some register, e.g. + /// MBlaze::RA, return the number that it corresponds to (e.g. 31). + static unsigned getRegisterNumbering(unsigned RegEnum); + + /// Get PIC indirect call register + static unsigned getPICCallReg(); + + /// Adjust the MBlaze stack frame. + void adjustMBlazeStackFrame(MachineFunction &MF) const; + + /// Code Generation virtual methods... + const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + + const TargetRegisterClass* const* + getCalleeSavedRegClasses(const MachineFunction* MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + /// Stack Frame Processing Methods + unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, int *Value = NULL, + RegScavenger *RS = NULL) const; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + /// Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(const MachineFunction &MF) const; + + /// Exception handling queries. 
+ unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; + + int getDwarfRegNum(unsigned RegNum, bool isEH) const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td new file mode 100644 index 0000000..96a5c98 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td @@ -0,0 +1,186 @@ +//===- MBlazeRegisterInfo.td - MBlaze Register defs -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the MicroBlaze register file +//===----------------------------------------------------------------------===// + +// We have banks of 32 registers each. +class MBlazeReg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "MBlaze"; +} + +class MBlazeRegWithSubRegs<string n, list<Register> subregs> + : RegisterWithSubRegs<n, subregs> { + field bits<5> Num; + let Namespace = "MBlaze"; +} + +// MBlaze CPU Registers +class MBlazeGPRReg<bits<5> num, string n> : MBlazeReg<n> { + let Num = num; +} + +// MBlaze 32-bit (aliased) FPU Registers +class FPR<bits<5> num, string n, list<Register> subregs> + : MBlazeRegWithSubRegs<n, subregs> { + let Num = num; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +let Namespace = "MBlaze" in { + + // General Purpose Registers + def R0 : MBlazeGPRReg< 0, "r0">, DwarfRegNum<[0]>; + def R1 : MBlazeGPRReg< 1, "r1">, DwarfRegNum<[1]>; + def R2 : MBlazeGPRReg< 2, "r2">, DwarfRegNum<[2]>; + def R3 : MBlazeGPRReg< 3, "r3">, DwarfRegNum<[3]>; + def R4 : MBlazeGPRReg< 4, 
"r4">, DwarfRegNum<[4]>; + def R5 : MBlazeGPRReg< 5, "r5">, DwarfRegNum<[5]>; + def R6 : MBlazeGPRReg< 6, "r6">, DwarfRegNum<[6]>; + def R7 : MBlazeGPRReg< 7, "r7">, DwarfRegNum<[7]>; + def R8 : MBlazeGPRReg< 8, "r8">, DwarfRegNum<[8]>; + def R9 : MBlazeGPRReg< 9, "r9">, DwarfRegNum<[9]>; + def R10 : MBlazeGPRReg< 10, "r10">, DwarfRegNum<[10]>; + def R11 : MBlazeGPRReg< 11, "r11">, DwarfRegNum<[11]>; + def R12 : MBlazeGPRReg< 12, "r12">, DwarfRegNum<[12]>; + def R13 : MBlazeGPRReg< 13, "r13">, DwarfRegNum<[13]>; + def R14 : MBlazeGPRReg< 14, "r14">, DwarfRegNum<[14]>; + def R15 : MBlazeGPRReg< 15, "r15">, DwarfRegNum<[15]>; + def R16 : MBlazeGPRReg< 16, "r16">, DwarfRegNum<[16]>; + def R17 : MBlazeGPRReg< 17, "r17">, DwarfRegNum<[17]>; + def R18 : MBlazeGPRReg< 18, "r18">, DwarfRegNum<[18]>; + def R19 : MBlazeGPRReg< 19, "r19">, DwarfRegNum<[19]>; + def R20 : MBlazeGPRReg< 20, "r20">, DwarfRegNum<[20]>; + def R21 : MBlazeGPRReg< 21, "r21">, DwarfRegNum<[21]>; + def R22 : MBlazeGPRReg< 22, "r22">, DwarfRegNum<[22]>; + def R23 : MBlazeGPRReg< 23, "r23">, DwarfRegNum<[23]>; + def R24 : MBlazeGPRReg< 24, "r24">, DwarfRegNum<[24]>; + def R25 : MBlazeGPRReg< 25, "r25">, DwarfRegNum<[25]>; + def R26 : MBlazeGPRReg< 26, "r26">, DwarfRegNum<[26]>; + def R27 : MBlazeGPRReg< 27, "r27">, DwarfRegNum<[27]>; + def R28 : MBlazeGPRReg< 28, "r28">, DwarfRegNum<[28]>; + def R29 : MBlazeGPRReg< 29, "r29">, DwarfRegNum<[29]>; + def R30 : MBlazeGPRReg< 30, "r30">, DwarfRegNum<[30]>; + def R31 : MBlazeGPRReg< 31, "r31">, DwarfRegNum<[31]>; + + /// MBlaze Single point precision FPU Registers + def F0 : FPR< 0, "r0", [R0]>, DwarfRegNum<[32]>; + def F1 : FPR< 1, "r1", [R1]>, DwarfRegNum<[33]>; + def F2 : FPR< 2, "r2", [R2]>, DwarfRegNum<[34]>; + def F3 : FPR< 3, "r3", [R3]>, DwarfRegNum<[35]>; + def F4 : FPR< 4, "r4", [R4]>, DwarfRegNum<[36]>; + def F5 : FPR< 5, "r5", [R5]>, DwarfRegNum<[37]>; + def F6 : FPR< 6, "r6", [R6]>, DwarfRegNum<[38]>; + def F7 : FPR< 7, "r7", [R7]>, 
DwarfRegNum<[39]>; + def F8 : FPR< 8, "r8", [R8]>, DwarfRegNum<[40]>; + def F9 : FPR< 9, "r9", [R9]>, DwarfRegNum<[41]>; + def F10 : FPR<10, "r10", [R10]>, DwarfRegNum<[42]>; + def F11 : FPR<11, "r11", [R11]>, DwarfRegNum<[43]>; + def F12 : FPR<12, "r12", [R12]>, DwarfRegNum<[44]>; + def F13 : FPR<13, "r13", [R13]>, DwarfRegNum<[45]>; + def F14 : FPR<14, "r14", [R14]>, DwarfRegNum<[46]>; + def F15 : FPR<15, "r15", [R15]>, DwarfRegNum<[47]>; + def F16 : FPR<16, "r16", [R16]>, DwarfRegNum<[48]>; + def F17 : FPR<17, "r17", [R17]>, DwarfRegNum<[49]>; + def F18 : FPR<18, "r18", [R18]>, DwarfRegNum<[50]>; + def F19 : FPR<19, "r19", [R19]>, DwarfRegNum<[51]>; + def F20 : FPR<20, "r20", [R20]>, DwarfRegNum<[52]>; + def F21 : FPR<21, "r21", [R21]>, DwarfRegNum<[53]>; + def F22 : FPR<22, "r22", [R22]>, DwarfRegNum<[54]>; + def F23 : FPR<23, "r23", [R23]>, DwarfRegNum<[55]>; + def F24 : FPR<24, "r24", [R24]>, DwarfRegNum<[56]>; + def F25 : FPR<25, "r25", [R25]>, DwarfRegNum<[57]>; + def F26 : FPR<26, "r26", [R26]>, DwarfRegNum<[58]>; + def F27 : FPR<27, "r27", [R27]>, DwarfRegNum<[59]>; + def F28 : FPR<28, "r28", [R28]>, DwarfRegNum<[60]>; + def F29 : FPR<29, "r29", [R29]>, DwarfRegNum<[61]>; + def F30 : FPR<30, "r30", [R30]>, DwarfRegNum<[62]>; + def F31 : FPR<31, "r31", [R31]>, DwarfRegNum<[63]>; +} + +//===----------------------------------------------------------------------===// +// Register Classes +//===----------------------------------------------------------------------===// + +def CPURegs : RegisterClass<"MBlaze", [i32], 32, + [ + // Return Values and Arguments + R3, R4, R5, R6, R7, R8, R9, R10, + + // Not preserved across procedure calls + R11, R12, + + // Callee save + R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, + + // Reserved + R0, // Always zero + R1, // The stack pointer + R2, // Read-only small data area anchor + R13, // Read-write small data area anchor + R14, // Return address for interrupts + R15, // Return address for sub-routines + R16, 
// Return address for trap + R17, // Return address for exceptions + R18, // Reserved for assembler + R19 // The frame-pointer + ]> +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + CPURegsClass::iterator + CPURegsClass::allocation_order_end(const MachineFunction &MF) const { + // The last 10 registers on the list above are reserved + return end()-10; + } + }]; +} + +def FGR32 : RegisterClass<"MBlaze", [f32], 32, + [ + // Return Values and Arguments + F3, F4, F5, F6, F7, F8, F9, F10, + + // Not preserved across procedure calls + F11, F12, + + // Callee save + F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31, + + // Reserved + F0, // Always zero + F1, // The stack pointer + F2, // Read-only small data area anchor + F13, // Read-write small data area anchor + F14, // Return address for interrupts + F15, // Return address for sub-routines + F16, // Return address for trap + F17, // Return address for exceptions + F18, // Reserved for assembler + F19 // The frame pointer + ]> +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + FGR32Class::iterator + FGR32Class::allocation_order_end(const MachineFunction &MF) const { + // The last 10 registers on the list above are reserved + return end()-10; + } + }]; +} diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td new file mode 100644 index 0000000..6a94491 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -0,0 +1,63 @@ +//===- MBlazeSchedule.td - MBlaze Scheduling Definitions --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Functional units across MBlaze chips sets. Based on GCC/MBlaze backend files. +//===----------------------------------------------------------------------===// +def ALU : FuncUnit; +def IMULDIV : FuncUnit; + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for MBlaze +//===----------------------------------------------------------------------===// +def IIAlu : InstrItinClass; +def IILoad : InstrItinClass; +def IIStore : InstrItinClass; +def IIXfer : InstrItinClass; +def IIBranch : InstrItinClass; +def IIHiLo : InstrItinClass; +def IIImul : InstrItinClass; +def IIIdiv : InstrItinClass; +def IIFcvt : InstrItinClass; +def IIFmove : InstrItinClass; +def IIFcmp : InstrItinClass; +def IIFadd : InstrItinClass; +def IIFmulSingle : InstrItinClass; +def IIFmulDouble : InstrItinClass; +def IIFdivSingle : InstrItinClass; +def IIFdivDouble : InstrItinClass; +def IIFsqrtSingle : InstrItinClass; +def IIFsqrtDouble : InstrItinClass; +def IIFrecipFsqrtStep : InstrItinClass; +def IIPseudo : InstrItinClass; + +//===----------------------------------------------------------------------===// +// MBlaze Generic instruction itineraries. 
+//===----------------------------------------------------------------------===// +def MBlazeGenericItineraries : ProcessorItineraries<[ + InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, + InstrItinData<IILoad , [InstrStage<3, [ALU]>]>, + InstrItinData<IIStore , [InstrStage<1, [ALU]>]>, + InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>, + InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, + InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>, + InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>, + InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>, + InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>, + InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>, + InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>, + InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>, + InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>, + InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>, + InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>, + InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>, + InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>, + InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>, + InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]> +]>; diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp new file mode 100644 index 0000000..3440521 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -0,0 +1,31 @@ +//===- MBlazeSubtarget.cpp - MBlaze Subtarget Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the MBlaze specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "MBlazeSubtarget.h" +#include "MBlaze.h" +#include "MBlazeGenSubtarget.inc" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, const std::string &FS): + HasPipe3(false), HasBarrel(false), HasDiv(false), HasMul(false), + HasFSL(false), HasEFSL(false), HasMSRSet(false), HasException(false), + HasPatCmp(false), HasFPU(false), HasESR(false), HasPVR(false), + HasMul64(false), HasSqrt(false), HasMMU(false) +{ + std::string CPU = "v400"; + MBlazeArchVersion = V400; + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); +} diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h new file mode 100644 index 0000000..bebb3f7 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSubtarget.h @@ -0,0 +1,79 @@ +//=====-- MBlazeSubtarget.h - Define Subtarget for the MBlaze -*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the MBlaze specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MBLAZESUBTARGET_H +#define MBLAZESUBTARGET_H + +#include "llvm/Target/TargetSubtarget.h" +#include "llvm/Target/TargetMachine.h" + +#include <string> + +namespace llvm { + +class MBlazeSubtarget : public TargetSubtarget { + +protected: + + enum MBlazeArchEnum { + V400, V500, V600, V700, V710 + }; + + // MBlaze architecture version + MBlazeArchEnum MBlazeArchVersion; + + bool HasPipe3; + bool HasBarrel; + bool HasDiv; + bool HasMul; + bool HasFSL; + bool HasEFSL; + bool HasMSRSet; + bool HasException; + bool HasPatCmp; + bool HasFPU; + bool HasESR; + bool HasPVR; + bool HasMul64; + bool HasSqrt; + bool HasMMU; + + InstrItineraryData InstrItins; + +public: + + /// This constructor initializes the data members to match that + /// of the specified triple. + MBlazeSubtarget(const std::string &TT, const std::string &FS); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. 
+ std::string ParseSubtargetFeatures(const std::string &FS, + const std::string &CPU); + + bool hasFPU() const { return HasFPU; } + bool hasSqrt() const { return HasSqrt; } + bool hasMul() const { return HasMul; } + bool hasMul64() const { return HasMul64; } + bool hasDiv() const { return HasDiv; } + bool hasBarrel() const { return HasBarrel; } + + bool isV400() const { return MBlazeArchVersion == V400; } + bool isV500() const { return MBlazeArchVersion == V500; } + bool isV600() const { return MBlazeArchVersion == V600; } + bool isV700() const { return MBlazeArchVersion == V700; } + bool isV710() const { return MBlazeArchVersion == V710; } +}; +} // End llvm namespace + +#endif diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp new file mode 100644 index 0000000..9eba2b3 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -0,0 +1,66 @@ +//===-- MBlazeTargetMachine.cpp - Define TargetMachine for MBlaze ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the info about MBlaze target spec. +// +//===----------------------------------------------------------------------===// + +#include "MBlaze.h" +#include "MBlazeMCAsmInfo.h" +#include "MBlazeTargetMachine.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +extern "C" void LLVMInitializeMBlazeTarget() { + // Register the target. + RegisterTargetMachine<MBlazeTargetMachine> X(TheMBlazeTarget); + RegisterAsmInfo<MBlazeMCAsmInfo> A(TheMBlazeTarget); +} + +// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment +// The stack is always 8 byte aligned +// On function prologue, the stack is created by decrementing +// its pointer. 
Once decremented, all references are done with positive +// offset from the stack/frame pointer, using StackGrowsUp enables +// an easier handling. +MBlazeTargetMachine:: +MBlazeTargetMachine(const Target &T, const std::string &TT, + const std::string &FS): + LLVMTargetMachine(T, TT), + Subtarget(TT, FS), + DataLayout("E-p:32:32-i8:8:8-i16:16:16-i64:32:32-" + "f64:32:32-v64:32:32-v128:32:32-n32"), + InstrInfo(*this), + FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0), + TLInfo(*this) { + if (getRelocationModel() == Reloc::Default) { + setRelocationModel(Reloc::Static); + } + + if (getCodeModel() == CodeModel::Default) + setCodeModel(CodeModel::Small); +} + +// Install an instruction selector pass using +// the ISelDag to gen MBlaze code. +bool MBlazeTargetMachine:: +addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMBlazeISelDag(*this)); + return false; +} + +// Implemented by targets that want to run passes immediately before +// machine code is emitted. return true if -print-machineinstrs should +// print out the code after the passes. +bool MBlazeTargetMachine:: +addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMBlazeDelaySlotFillerPass(*this)); + return true; +} diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h new file mode 100644 index 0000000..85c975c --- /dev/null +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -0,0 +1,69 @@ +//===-- MBlazeTargetMachine.h - Define TargetMachine for MBlaze --- C++ ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the MBlaze specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MBLAZE_TARGETMACHINE_H +#define MBLAZE_TARGETMACHINE_H + +#include "MBlazeSubtarget.h" +#include "MBlazeInstrInfo.h" +#include "MBlazeISelLowering.h" +#include "MBlazeIntrinsicInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" + +namespace llvm { + class formatted_raw_ostream; + + class MBlazeTargetMachine : public LLVMTargetMachine { + MBlazeSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + MBlazeInstrInfo InstrInfo; + TargetFrameInfo FrameInfo; + MBlazeTargetLowering TLInfo; + MBlazeIntrinsicInfo IntrinsicInfo; + public: + MBlazeTargetMachine(const Target &T, const std::string &TT, + const std::string &FS); + + virtual const MBlazeInstrInfo *getInstrInfo() const + { return &InstrInfo; } + + virtual const TargetFrameInfo *getFrameInfo() const + { return &FrameInfo; } + + virtual const MBlazeSubtarget *getSubtargetImpl() const + { return &Subtarget; } + + virtual const TargetData *getTargetData() const + { return &DataLayout;} + + virtual const MBlazeRegisterInfo *getRegisterInfo() const + { return &InstrInfo.getRegisterInfo(); } + + virtual MBlazeTargetLowering *getTargetLowering() const + { return const_cast<MBlazeTargetLowering*>(&TLInfo); } + + const TargetIntrinsicInfo *getIntrinsicInfo() const + { return &IntrinsicInfo; } + + // Pass Pipeline Configuration + virtual bool addInstSelector(PassManagerBase &PM, + CodeGenOpt::Level OptLevel); + + virtual bool addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel); + }; +} // End llvm namespace + +#endif diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp new file mode 100644 index 0000000..79c9494 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp @@ -0,0 +1,88 @@ +//===-- MBlazeTargetObjectFile.cpp - MBlaze object files 
------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MBlazeTargetObjectFile.h" +#include "MBlazeSubtarget.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +void MBlazeTargetObjectFile:: +Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + SmallDataSection = + getELFSection(".sdata", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + + SmallBSSSection = + getELFSection(".sbss", MCSectionELF::SHT_NOBITS, + MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC, + SectionKind::getBSS()); + +} + +// A address must be loaded from a small section if its size is less than the +// small section size threshold. Data in this section must be addressed using +// gp_rel operator. +static bool IsInSmallSection(uint64_t Size) { + return Size > 0 && Size <= 8; +} + +bool MBlazeTargetObjectFile:: +IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM) const { + if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) + return false; + + return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM)); +} + +/// IsGlobalInSmallSection - Return true if this global address should be +/// placed into small data/bss section. +bool MBlazeTargetObjectFile:: +IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, + SectionKind Kind) const { + // Only global variables, not functions. 
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV); + if (!GVA) + return false; + + // We can only do this for datarel or BSS objects for now. + if (!Kind.isBSS() && !Kind.isDataRel()) + return false; + + // If this is a internal constant string, there is a special + // section for it, but not in small data/bss. + if (Kind.isMergeable1ByteCString()) + return false; + + const Type *Ty = GV->getType()->getElementType(); + return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty)); +} + +const MCSection *MBlazeTargetObjectFile:: +SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler *Mang, const TargetMachine &TM) const { + // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*" + // sections? + + // Handle Small Section classification here. + if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) + return SmallBSSSection; + if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind)) + return SmallDataSection; + + // Otherwise, we work the same as ELF. + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM); +} diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.h b/lib/Target/MBlaze/MBlazeTargetObjectFile.h new file mode 100644 index 0000000..20e7702 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeTargetObjectFile.h @@ -0,0 +1,41 @@ +//===-- llvm/Target/MBlazeTargetObjectFile.h - MBlaze Obj. Info -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H +#define LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { + + class MBlazeTargetObjectFile : public TargetLoweringObjectFileELF { + const MCSection *SmallDataSection; + const MCSection *SmallBSSSection; + public: + + void Initialize(MCContext &Ctx, const TargetMachine &TM); + + + /// IsGlobalInSmallSection - Return true if this global address should be + /// placed into small data/bss section. + bool IsGlobalInSmallSection(const GlobalValue *GV, + const TargetMachine &TM, + SectionKind Kind) const; + + bool IsGlobalInSmallSection(const GlobalValue *GV, + const TargetMachine &TM) const; + + const MCSection *SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler *Mang, + const TargetMachine &TM) const; + }; +} // end namespace llvm + +#endif diff --git a/lib/Target/MBlaze/Makefile b/lib/Target/MBlaze/Makefile new file mode 100644 index 0000000..19e508c --- /dev/null +++ b/lib/Target/MBlaze/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/MBlaze/Makefile --------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMMBlazeCodeGen +TARGET = MBlaze + +# Make sure that tblgen is run, first thing. 
+BUILT_SOURCES = MBlazeGenRegisterInfo.h.inc MBlazeGenRegisterNames.inc \ + MBlazeGenRegisterInfo.inc MBlazeGenInstrNames.inc \ + MBlazeGenInstrInfo.inc MBlazeGenAsmWriter.inc \ + MBlazeGenDAGISel.inc MBlazeGenCallingConv.inc \ + MBlazeGenSubtarget.inc MBlazeGenIntrinsics.inc + +DIRS = AsmPrinter TargetInfo + +include $(LEVEL)/Makefile.common + diff --git a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..5afb14d --- /dev/null +++ b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMMBlazeInfo + MBlazeTargetInfo.cpp + ) + +add_dependencies(LLVMMBlazeInfo MBlazeCodeGenTable_gen) diff --git a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp new file mode 100644 index 0000000..16e01db --- /dev/null +++ b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp @@ -0,0 +1,19 @@ +//===-- MBlazeTargetInfo.cpp - MBlaze Target Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MBlaze.h" +#include "llvm/Module.h" +#include "llvm/Target/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheMBlazeTarget; + +extern "C" void LLVMInitializeMBlazeTargetInfo() { + RegisterTarget<Triple::mblaze> X(TheMBlazeTarget, "mblaze", "MBlaze"); +} diff --git a/lib/Target/MBlaze/TargetInfo/Makefile b/lib/Target/MBlaze/TargetInfo/Makefile new file mode 100644 index 0000000..fb7ea11 --- /dev/null +++ b/lib/Target/MBlaze/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/MBlaze/TargetInfo/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMMBlazeInfo + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index 3330d09..ac41cc8 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -38,7 +38,8 @@ namespace llvm { virtual bool addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel); + CodeGenOpt::Level OptLevel, + bool DisableVerify); virtual const TargetData *getTargetData() const { return 0; } }; @@ -57,7 +58,7 @@ bool MSILModule::runOnModule(Module &M) { TypeSymbolTable& Table = M.getTypeSymbolTable(); std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes(); for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) { - if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second)) + if (!I->second->isStructTy() && !I->second->isOpaqueTy()) Table.remove(I++); else { std::set<const Type *>::iterator T = Types.find(I->second); @@ -1459,7 +1460,7 @@ void MSILWriter::printDeclarations(const TypeSymbolTable& ST) { for (std::set<const Type*>::const_iterator UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) { const Type* Ty = *UI; - if (isa<ArrayType>(Ty) || isa<VectorType>(Ty) || isa<StructType>(Ty)) + if (Ty->isArrayTy() || Ty->isVectorTy() || Ty->isStructTy()) Name = getTypeName(Ty, false, true); // Type with no need to declare. 
else continue; @@ -1688,7 +1689,8 @@ void MSILWriter::printExternals() { bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel) + CodeGenOpt::Level OptLevel, + bool DisableVerify) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; MSILWriter* Writer = new MSILWriter(o); diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index a8c5e0a..911cfcb 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -26,26 +26,12 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" - using namespace llvm; -#ifndef NDEBUG -static cl::opt<bool> -ViewRMWDAGs("view-msp430-rmw-dags", cl::Hidden, - cl::desc("Pop up a window to show isel dags after RMW preprocess")); -#else -static const bool ViewRMWDAGs = false; -#endif - -STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); - - namespace { struct MSP430ISelAddressMode { enum { @@ -123,8 +109,6 @@ namespace { Lowering(*TM.getTargetLowering()), Subtarget(*TM.getSubtargetImpl()) { } - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "MSP430 DAG->DAG Pattern Instruction Selection"; } @@ -133,8 +117,6 @@ namespace { bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM); bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM); - bool IsLegalToFold(SDValue N, SDNode *U, SDNode *Root) const; - virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps); @@ -143,18 +125,12 @@ namespace { #include "MSP430GenDAGISel.inc" private: - DenseMap<SDNode*, SDNode*> RMWStores; 
- void PreprocessForRMW(); SDNode *Select(SDNode *N); SDNode *SelectIndexedLoad(SDNode *Op); SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8, unsigned Opc16); bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Disp); - - #ifndef NDEBUG - unsigned Indent; - #endif }; } // end anonymous namespace @@ -216,10 +192,7 @@ bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) } bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) { - DEBUG({ - errs() << "MatchAddress: "; - AM.dump(); - }); + DEBUG(errs() << "MatchAddress: "; AM.dump()); switch (N.getOpcode()) { default: break; @@ -335,270 +308,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, return false; } -bool MSP430DAGToDAGISel::IsLegalToFold(SDValue N, SDNode *U, - SDNode *Root) const { - if (OptLevel == CodeGenOpt::None) return false; - - /// RMW preprocessing creates the following code: - /// [Load1] - /// ^ ^ - /// / | - /// / | - /// [Load2] | - /// ^ ^ | - /// | | | - /// | \-| - /// | | - /// | [Op] - /// | ^ - /// | | - /// \ / - /// \ / - /// [Store] - /// - /// The path Store => Load2 => Load1 is via chain. Note that in general it is - /// not allowed to fold Load1 into Op (and Store) since it will creates a - /// cycle. However, this is perfectly legal for the loads moved below the - /// TokenFactor by PreprocessForRMW. Query the map Store => Load1 (created - /// during preprocessing) to determine whether it's legal to introduce such - /// "cycle" for a moment. - DenseMap<SDNode*, SDNode*>::const_iterator I = RMWStores.find(Root); - if (I != RMWStores.end() && I->second == N.getNode()) - return true; - - // Proceed to 'generic' cycle finder code - return SelectionDAGISel::IsLegalToFold(N, U, Root); -} - - -/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand -/// and move load below the TokenFactor. Replace store's chain operand with -/// load's chain result. 
-static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load, - SDValue Store, SDValue TF) { - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i) - if (Load.getNode() == TF.getOperand(i).getNode()) - Ops.push_back(Load.getOperand(0)); - else - Ops.push_back(TF.getOperand(i)); - SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size()); - SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF, - Load.getOperand(1), - Load.getOperand(2)); - CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1), - Store.getOperand(2), Store.getOperand(3)); -} - -/// MoveBelowTokenFactor2 - Replace TokenFactor operand with load's chain operand -/// and move load below the TokenFactor. Replace store's chain operand with -/// load's chain result. This a version which sinks two loads below token factor. -/// Look into PreprocessForRMW comments for explanation of transform. -static void MoveBelowTokenFactor2(SelectionDAG *CurDAG, - SDValue Load1, SDValue Load2, - SDValue Store, SDValue TF) { - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i) { - SDNode* N = TF.getOperand(i).getNode(); - if (Load2.getNode() == N) - Ops.push_back(Load2.getOperand(0)); - else if (Load1.getNode() != N) - Ops.push_back(TF.getOperand(i)); - } - - SDValue NewTF = SDValue(CurDAG->MorphNodeTo(TF.getNode(), - TF.getOpcode(), - TF.getNode()->getVTList(), - &Ops[0], Ops.size()), TF.getResNo()); - SDValue NewLoad2 = CurDAG->UpdateNodeOperands(Load2, NewTF, - Load2.getOperand(1), - Load2.getOperand(2)); - - SDValue NewLoad1 = CurDAG->UpdateNodeOperands(Load1, NewLoad2.getValue(1), - Load1.getOperand(1), - Load1.getOperand(2)); - - CurDAG->UpdateNodeOperands(Store, - NewLoad1.getValue(1), - Store.getOperand(1), - Store.getOperand(2), Store.getOperand(3)); -} - -/// isAllowedToSink - return true if N a load which can be moved below token -/// factor. 
Basically, the load should be non-volatile and has single use. -static bool isLoadAllowedToSink(SDValue N, SDValue Chain) { - if (N.getOpcode() == ISD::BIT_CONVERT) - N = N.getOperand(0); - - LoadSDNode *LD = dyn_cast<LoadSDNode>(N); - if (!LD || LD->isVolatile()) - return false; - if (LD->getAddressingMode() != ISD::UNINDEXED) - return false; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD) - return false; - - return (N.hasOneUse() && - LD->hasNUsesOfValue(1, 1) && - LD->isOperandOf(Chain.getNode())); -} - - -/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG. -/// The chain produced by the load must only be used by the store's chain -/// operand, otherwise this may produce a cycle in the DAG. -static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address, - SDValue &Load) { - if (isLoadAllowedToSink(N, Chain) && - N.getOperand(1) == Address) { - Load = N; - return true; - } - return false; -} - -/// PreprocessForRMW - Preprocess the DAG to make instruction selection better. -/// This is only run if not in -O0 mode. -/// This allows the instruction selector to pick more read-modify-write -/// instructions. This is a common case: -/// -/// [Load chain] -/// ^ -/// | -/// [Load] -/// ^ ^ -/// | | -/// / \- -/// / | -/// [TokenFactor] [Op] -/// ^ ^ -/// | | -/// \ / -/// \ / -/// [Store] -/// -/// The fact the store's chain operand != load's chain will prevent the -/// (store (op (load))) instruction from being selected. 
We can transform it to: -/// -/// [Load chain] -/// ^ -/// | -/// [TokenFactor] -/// ^ -/// | -/// [Load] -/// ^ ^ -/// | | -/// | \- -/// | | -/// | [Op] -/// | ^ -/// | | -/// \ / -/// \ / -/// [Store] -/// -/// We also recognize the case where second operand of Op is load as well and -/// move it below token factor as well creating DAG as follows: -/// -/// [Load chain] -/// ^ -/// | -/// [TokenFactor] -/// ^ -/// | -/// [Load1] -/// ^ ^ -/// / | -/// / | -/// [Load2] | -/// ^ ^ | -/// | | | -/// | \-| -/// | | -/// | [Op] -/// | ^ -/// | | -/// \ / -/// \ / -/// [Store] -/// -/// This allows selection of mem-mem instructions. Yay! - -void MSP430DAGToDAGISel::PreprocessForRMW() { - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - if (!ISD::isNON_TRUNCStore(I)) - continue; - SDValue Chain = I->getOperand(0); - - if (Chain.getNode()->getOpcode() != ISD::TokenFactor) - continue; - - SDValue N1 = I->getOperand(1); - SDValue N2 = I->getOperand(2); - if ((N1.getValueType().isFloatingPoint() && - !N1.getValueType().isVector()) || - !N1.hasOneUse()) - continue; - - unsigned RModW = 0; - SDValue Load1, Load2; - unsigned Opcode = N1.getNode()->getOpcode(); - switch (Opcode) { - case ISD::ADD: - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case ISD::ADDC: - case ISD::ADDE: { - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - if (isRMWLoad(N10, Chain, N2, Load1)) { - if (isLoadAllowedToSink(N11, Chain)) { - Load2 = N11; - RModW = 2; - } else - RModW = 1; - } else if (isRMWLoad(N11, Chain, N2, Load1)) { - if (isLoadAllowedToSink(N10, Chain)) { - Load2 = N10; - RModW = 2; - } else - RModW = 1; - } - break; - } - case ISD::SUB: - case ISD::SUBC: - case ISD::SUBE: { - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - if (isRMWLoad(N10, Chain, N2, Load1)) { - if (isLoadAllowedToSink(N11, Chain)) { - Load2 = N11; - RModW = 2; - } else - RModW = 1; - } - break; - } - } - - 
NumLoadMoved += RModW; - if (RModW == 1) - MoveBelowTokenFactor(CurDAG, Load1, SDValue(I, 0), Chain); - else if (RModW == 2) { - MoveBelowTokenFactor2(CurDAG, Load1, Load2, SDValue(I, 0), Chain); - SDNode* Store = I; - RMWStores[Store] = Load2.getNode(); - } - } -} - - static bool isValidIndexedLoad(const LoadSDNode *LD) { ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD) @@ -681,46 +390,19 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op, } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void MSP430DAGToDAGISel::InstructionSelect() { - std::string BlockName; - if (ViewRMWDAGs) - BlockName = MF->getFunction()->getNameStr() + ":" + - BB->getBasicBlock()->getNameStr(); - - PreprocessForRMW(); - - if (ViewRMWDAGs) CurDAG->viewGraph("RMW preprocessed:" + BlockName); - - DEBUG(errs() << "Selection DAG after RMW preprocessing:\n"); - DEBUG(CurDAG->dump()); - - // Codegen the basic block. - DEBUG(errs() << "===== Instruction selection begins:\n"); - DEBUG(Indent = 0); - SelectRoot(*CurDAG); - DEBUG(errs() << "===== Instruction selection ends:\n"); - - CurDAG->RemoveDeadNodes(); - RMWStores.clear(); -} - SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) { DebugLoc dl = Node->getDebugLoc(); // Dump information about the Node being selected - DEBUG(errs().indent(Indent) << "Selecting: "); + DEBUG(errs() << "Selecting: "); DEBUG(Node->dump(CurDAG)); DEBUG(errs() << "\n"); - DEBUG(Indent += 2); // If we have a custom node, we already have selected! 
if (Node->isMachineOpcode()) { - DEBUG(errs().indent(Indent-2) << "== "; + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); - DEBUG(Indent -= 2); return NULL; } @@ -808,13 +490,12 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) { // Select the default instruction SDNode *ResNode = SelectCode(Node); - DEBUG(errs() << std::string(Indent-2, ' ') << "=> "); + DEBUG(errs() << "=> "); if (ResNode == NULL || ResNode == Node) DEBUG(Node->dump(CurDAG)); else DEBUG(ResNode->dump(CurDAG)); DEBUG(errs() << "\n"); - DEBUG(Indent -= 2); return ResNode; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 7281b37..e6c7e1e 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -795,18 +795,15 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { if (andCC) { // C = ~Z, thus Res = SRW & 1, no processing is required } else { - // Res = (SRW >> 1) & 1 + // Res = ~((SRW >> 1) & 1) Shift = true; + Invert = true; } break; case MSP430CC::COND_E: - if (andCC) { - // C = ~Z, thus Res = ~(SRW & 1) - } else { - // Res = ~((SRW >> 1) & 1) - Shift = true; - } - Invert = true; + Shift = true; + // C = ~Z for AND instruction, thus we can put Res = ~(SRW & 1), however, + // Res = (SRW >> 1) & 1 is 1 word shorter. 
break; } EVT VT = Op.getValueType(); diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index bb06f7b..144ba26 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -250,7 +250,7 @@ def MOV16ri : I16ri<0x0, [(set GR16:$dst, imm:$src)]>; } -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOV8rm : I8rm<0x0, (outs GR8:$dst), (ins memsrc:$src), "mov.b\t{$src, $dst}", diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index f1d4a67..c4746db 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -59,8 +59,6 @@ public: SelectionDAGISel(tm), TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {} - virtual void InstructionSelect(); - // Pass Name virtual const char *getPassName() const { return "MIPS DAG->DAG Pattern Instruction Selection"; @@ -98,29 +96,10 @@ private: inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } - - - #ifndef NDEBUG - unsigned Indent; - #endif }; } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void MipsDAGToDAGISel::InstructionSelect() { - // Codegen the basic block. - DEBUG(errs() << "===== Instruction selection begins:\n"); - DEBUG(Indent = 0); - - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - - DEBUG(errs() << "===== Instruction selection ends:\n"); - - CurDAG->RemoveDeadNodes(); -} /// getGlobalBaseReg - Output the instructions required to put the /// GOT address into a register. 
@@ -329,17 +308,11 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { DebugLoc dl = Node->getDebugLoc(); // Dump information about the Node being selected - DEBUG(errs().indent(Indent) << "Selecting: "; - Node->dump(CurDAG); - errs() << "\n"); - DEBUG(Indent += 2); + DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n"); // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { - DEBUG(errs().indent(Indent-2) << "== "; - Node->dump(CurDAG); - errs() << "\n"); - DEBUG(Indent -= 2); + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); return NULL; } @@ -547,14 +520,12 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { // Select the default instruction SDNode *ResNode = SelectCode(Node); - DEBUG(errs().indent(Indent-2) << "=> "); + DEBUG(errs() << "=> "); if (ResNode == NULL || ResNode == Node) DEBUG(Node->dump(CurDAG)); else DEBUG(ResNode->dump(CurDAG)); DEBUG(errs() << "\n"); - DEBUG(Indent -= 2); - return ResNode; } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index f16a805..cef3697 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -300,9 +300,8 @@ class JumpFR<bits<6> op, bits<6> func, string instr_asm>: // Jump and Link (Call) let isCall=1, hasDelaySlot=1, // All calls clobber the non-callee saved registers... 
- Defs = [AT, V0, V1, A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, - K0, K1, F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, - F14, F15, F16, F17, F18, F19], Uses = [GP] in { + Defs = [AT, V0, V1, A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + K0, K1, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9], Uses = [GP] in { class JumpLink<bits<6> op, string instr_asm>: FJ< op, (outs), @@ -593,8 +592,8 @@ def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)), (JAL tglobaladdr:$dst)>; def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), (JAL texternalsym:$dst)>; -def : Pat<(MipsJmpLink CPURegs:$dst), - (JALR CPURegs:$dst)>; +//def : Pat<(MipsJmpLink CPURegs:$dst), +// (JALR CPURegs:$dst)>; // hi/lo relocs def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp index b015edd..44a6cc0 100644 --- a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp +++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp @@ -158,6 +158,7 @@ bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // printOperand - print operand of insn. void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) { const MachineOperand &MO = MI->getOperand(opNum); + const Function *F = MI->getParent()->getParent()->getFunction(); switch (MO.getType()) { case MachineOperand::MO_Register: @@ -190,19 +191,18 @@ void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) { } case MachineOperand::MO_ExternalSymbol: { const char *Sname = MO.getSymbolName(); + std::string Printname = Sname; - // If its a libcall name, record it to decls section. - if (PAN::getSymbolTag(Sname) == PAN::LIBCALL) - LibcallDecls.push_back(Sname); - - // Record a call to intrinsic to print the extern declaration for it. 
- std::string Sym = Sname; - if (PAN::isMemIntrinsic(Sym)) { - Sym = PAN::addPrefix(Sym); - LibcallDecls.push_back(createESName(Sym)); + // Intrinsic stuff needs to be renamed if we are printing IL fn. + if (PAN::isIntrinsicStuff(Printname)) { + if (PAN::isISR(F->getSection())) { + Printname = PAN::Rename(Sname); + } + // Record these decls, we need to print them in asm as extern. + LibcallDecls.push_back(createESName(Printname)); } - O << Sym; + O << Printname; break; } case MachineOperand::MO_MachineBasicBlock: @@ -248,8 +248,6 @@ void PIC16AsmPrinter::printLibcallDecls() { for (std::list<const char*>::const_iterator I = LibcallDecls.begin(); I != LibcallDecls.end(); I++) { O << MAI->getExternDirective() << *I << "\n"; - O << MAI->getExternDirective() << PAN::getArgsLabel(*I) << "\n"; - O << MAI->getExternDirective() << PAN::getRetvalLabel(*I) << "\n"; } O << MAI->getCommentString() << "External decls for libcalls - END." <<"\n"; } diff --git a/lib/Target/PIC16/PIC16ABINames.h b/lib/Target/PIC16/PIC16ABINames.h index b0b9318..4c1a8da 100644 --- a/lib/Target/PIC16/PIC16ABINames.h +++ b/lib/Target/PIC16/PIC16ABINames.h @@ -178,18 +178,21 @@ namespace llvm { return Func1 + tag; } + // Get the retval label for the given function. static std::string getRetvalLabel(const std::string &Func) { std::string Func1 = addPrefix(Func); std::string tag = getTagName(RET_LABEL); return Func1 + tag; } + // Get the argument label for the given function. static std::string getArgsLabel(const std::string &Func) { std::string Func1 = addPrefix(Func); std::string tag = getTagName(ARGS_LABEL); return Func1 + tag; } + // Get the tempdata label for the given function. 
static std::string getTempdataLabel(const std::string &Func) { std::string Func1 = addPrefix(Func); std::string tag = getTagName(TEMPS_LABEL); @@ -263,6 +266,7 @@ namespace llvm { return false; } + inline static bool isMemIntrinsic (const std::string &Name) { if (Name.compare("@memcpy") == 0 || Name.compare("@memset") == 0 || Name.compare("@memmove") == 0) { @@ -272,6 +276,41 @@ namespace llvm { return false; } + // Currently names of libcalls are assigned during TargetLowering + // object construction. There is no provision to change the when the + // code for a function IL function being generated. + // So we have to change these names while printing assembly. + // We need to do that mainly for names related to intrinsics. This + // function returns true if a name needs to be cloned. + inline static bool isIntrinsicStuff(const std::string &Name) { + // Return true if the name contains LIBCALL marker, or a MemIntrinisc. + // these are mainly ARGS_LABEL, RET_LABEL, and the LIBCALL name itself. + if ((Name.find(getTagName(LIBCALL)) != std::string::npos) + || isMemIntrinsic(Name)) + return true; + + return false; + } + + // Rename the name for IL. + inline static std::string Rename(const std::string &Name) { + std::string Newname; + // If its a label (LIBCALL+Func+LABEL), change it to + // (LIBCALL+Func+IL+LABEL). + TAGS id = getSymbolTag(Name); + if (id == ARGS_LABEL || id == RET_LABEL) { + std::size_t pos = Name.find(getTagName(id)); + Newname = Name.substr(0, pos) + ".IL" + getTagName(id); + return Newname; + } + + // Else, just append IL to name. + return Name + ".IL"; + } + + + + inline static bool isLocalToFunc (std::string &Func, std::string &Var) { if (! isLocalName(Var)) return false; @@ -338,6 +377,22 @@ namespace llvm { inline static std::string getISRAddr(void) { return "0x4"; } + + // Returns the name of clone of a function. 
+ static std::string getCloneFnName(const std::string &Func) { + return (Func + ".IL"); + } + + // Returns the name of clone of a variable. + static std::string getCloneVarName(const std::string &Fn, + const std::string &Var) { + std::string cloneVarName = Var; + // These vars are named like fun.auto.var. + // Just replace the function name, with clone function name. + std::string cloneFnName = getCloneFnName(Fn); + cloneVarName.replace(cloneVarName.find(Fn), Fn.length(), cloneFnName); + return cloneVarName; + } }; // class PAN. } // end namespace llvm; diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp index c517b1b..877e4ff 100644 --- a/lib/Target/PIC16/PIC16DebugInfo.cpp +++ b/lib/Target/PIC16/PIC16DebugInfo.cpp @@ -419,7 +419,7 @@ void PIC16DbgInfo::EmitAuxEntry(const std::string VarName, int Aux[], int Num, if (TagName != "") O << ", " << TagName; for (int i = 0; i<Num; i++) - O << "," << Aux[i]; + O << "," << (Aux[i] && 0xff); } /// EmitSymbol - Emit .def for a symbol. Value is offset for the member. diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp index 82197ae..6cbd002 100644 --- a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp +++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp @@ -14,10 +14,7 @@ #define DEBUG_TYPE "pic16-isel" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "PIC16ISelDAGToDAG.h" -#include "llvm/Support/Debug.h" - using namespace llvm; /// createPIC16ISelDag - This pass converts a legalized DAG into a @@ -27,13 +24,6 @@ FunctionPass *llvm::createPIC16ISelDag(PIC16TargetMachine &TM) { } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void PIC16DAGToDAGISel::InstructionSelect() { - SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. 
SDNode* PIC16DAGToDAGISel::Select(SDNode *N) { diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.h b/lib/Target/PIC16/PIC16ISelDAGToDAG.h index 813a540..8ed5bf7 100644 --- a/lib/Target/PIC16/PIC16ISelDAGToDAG.h +++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.h @@ -19,6 +19,8 @@ #include "PIC16TargetMachine.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" #include "llvm/Intrinsics.h" using namespace llvm; @@ -46,8 +48,6 @@ public: return "PIC16 DAG->DAG Pattern Instruction Selection"; } - virtual void InstructionSelect(); - private: // Include the pieces autogenerated from the target description. #include "PIC16GenDAGISel.inc" diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp index d2fc8db..d17abb9 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.cpp +++ b/lib/Target/PIC16/PIC16ISelLowering.cpp @@ -419,8 +419,7 @@ PIC16TargetLowering::MakePIC16Libcall(PIC16ISD::PIC16Libcall Call, LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl, - DAG.GetOrdering(DAG.getEntryNode().getNode())); + Callee, Args, DAG, dl); return CallInfo.first; } @@ -1527,10 +1526,24 @@ bool PIC16TargetLowering::NeedToConvertToMemOp(SDValue Op, unsigned &MemOp, return true; if (isDirectLoad(Op.getOperand(1))) { - if (Op.getOperand(1).hasOneUse()) - return false; - else - MemOp = 1; + if (Op.getOperand(1).hasOneUse()) { + // Legal and profitable folding check uses the NodeId of DAG nodes. + // This NodeId is assigned by topological order. Therefore first + // assign topological order then perform legal and profitable check. + // Note:- Though this ordering is done before begining with legalization, + // newly added node during legalization process have NodeId=-1 (NewNode) + // therefore before performing any check proper ordering of the node is + // required. 
+ DAG.AssignTopologicalOrder(); + + // Direct load operands are folded in binary operations. But before folding + // verify if this folding is legal. Fold only if it is legal otherwise + // convert this direct load to a separate memory operation. + if(ISel->IsLegalToFold(Op.getOperand(1), Op.getNode(), Op.getNode())) + return false; + else + MemOp = 1; + } } return true; } diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp new file mode 100644 index 0000000..865da35 --- /dev/null +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp @@ -0,0 +1,299 @@ +//===-- PIC16Cloner.cpp - PIC16 LLVM Cloner for shared functions -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to clone all functions that are shared between +// the main line code (ML) and interrupt line code (IL). It clones all such +// shared functions and their automatic global vars by adding the .IL suffix. +// +// This pass is supposed to be run on the linked .bc module. +// It traveses the module call graph twice. Once starting from the main function +// and marking each reached function as "ML". Again, starting from the ISR +// and cloning any reachable function that was marked as "ML". After cloning +// the function, it remaps all the call sites in IL functions to call the +// cloned functions. 
+//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Pass.h" +#include "llvm/Module.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "PIC16Cloner.h" +#include "../PIC16ABINames.h" +#include <vector> + +using namespace llvm; +using std::vector; +using std::string; +using std::map; + +namespace llvm { + char PIC16Cloner::ID = 0; + + ModulePass *createPIC16ClonerPass() { return new PIC16Cloner(); } +} + +// We currently intend to run these passes in opt, which does not have any +// diagnostic support. So use these functions for now. In future +// we will probably write our own driver tool. +// +void PIC16Cloner::reportError(string ErrorString) { + errs() << "ERROR : " << ErrorString << "\n"; + exit(1); +} + +void PIC16Cloner:: +reportError (string ErrorString, vector<string> &Values) { + unsigned ValCount = Values.size(); + string TargetString; + for (unsigned i=0; i<ValCount; ++i) { + TargetString = "%"; + TargetString += ((char)i + '0'); + ErrorString.replace(ErrorString.find(TargetString), TargetString.length(), + Values[i]); + } + errs() << "ERROR : " << ErrorString << "\n"; + exit(1); +} + + +// Entry point +// +bool PIC16Cloner::runOnModule(Module &M) { + CallGraph &CG = getAnalysis<CallGraph>(); + + // Search for the "main" and "ISR" functions. + CallGraphNode *mainCGN = NULL, *isrCGN = NULL; + for (CallGraph::iterator it = CG.begin() ; it != CG.end(); it++) + { + // External calling node doesn't have any function associated with it. + if (! it->first) + continue; + + if (it->first->getName().str() == "main") { + mainCGN = it->second; + } + + if (PAN::isISR(it->first->getSection())) { + isrCGN = it->second; + } + + // Don't search further if we've found both. + if (mainCGN && isrCGN) + break; + } + + // We have nothing to do if any of the main or ISR is missing. + if (! mainCGN || ! 
isrCGN) return false; + + // Time for some diagnostics. + // See if the main itself is interrupt function then report an error. + if (PAN::isISR(mainCGN->getFunction()->getSection())) { + reportError("Function 'main' can't be interrupt function"); + } + + + // Mark all reachable functions from main as ML. + markCallGraph(mainCGN, "ML"); + + // And then all the functions reachable from ISR will be cloned. + cloneSharedFunctions(isrCGN); + + return true; +} + +// Mark all reachable functions from the given node, with the given mark. +// +void PIC16Cloner::markCallGraph(CallGraphNode *CGN, string StringMark) { + // Mark the top node first. + Function *thisF = CGN->getFunction(); + + thisF->setSection(StringMark); + + // Mark all the called functions + for(CallGraphNode::iterator cgn_it = CGN->begin(); + cgn_it != CGN->end(); ++cgn_it) { + Function *CalledF = cgn_it->second->getFunction(); + + // If calling an external function then CallGraphNode + // will not be associated with any function. + if (! CalledF) + continue; + + // Issue diagnostic if interrupt function is being called. + if (PAN::isISR(CalledF->getSection())) { + vector<string> Values; + Values.push_back(CalledF->getName().str()); + reportError("Interrupt function (%0) can't be called", Values); + } + + // Has already been mark + if (CalledF->getSection().find(StringMark) != string::npos) { + // Should we do anything here? + } else { + // Mark now + CalledF->setSection(StringMark); + } + + // Before going any further mark all the called function by current + // function. + markCallGraph(cgn_it->second ,StringMark); + } // end of loop of all called functions. +} + + +// For PIC16, automatic variables of a function are emitted as globals. +// Clone the auto variables of a function and put them in ValueMap, +// this ValueMap will be used while +// Cloning the code of function itself. +// +void PIC16Cloner::CloneAutos(Function *F) { + // We'll need to update module's globals list as well. 
So keep a reference + // handy. + Module *M = F->getParent(); + Module::GlobalListType &Globals = M->getGlobalList(); + + // Clear the leftovers in ValueMap by any previous cloning. + ValueMap.clear(); + + // Find the auto globals for this function and clone them, and put them + // in ValueMap. + std::string FnName = F->getName().str(); + std::string VarName, ClonedVarName; + for (Module::global_iterator I = M->global_begin(), E = M->global_end(); + I != E; ++I) { + VarName = I->getName().str(); + if (PAN::isLocalToFunc(FnName, VarName)) { + // Auto variable for current function found. Clone it. + GlobalVariable *GV = I; + + const Type *InitTy = GV->getInitializer()->getType(); + GlobalVariable *ClonedGV = + new GlobalVariable(InitTy, false, GV->getLinkage(), + GV->getInitializer()); + ClonedGV->setName(PAN::getCloneVarName(FnName, VarName)); + // Add these new globals to module's globals list. + Globals.push_back(ClonedGV); + + // Update ValueMap. + ValueMap[GV] = ClonedGV; + } + } +} + + +// Clone all functions that are reachable from ISR and are already +// marked as ML. +// +void PIC16Cloner::cloneSharedFunctions(CallGraphNode *CGN) { + + // Check all the called functions from ISR. + for(CallGraphNode::iterator cgn_it = CGN->begin(); + cgn_it != CGN->end(); ++cgn_it) { + Function *CalledF = cgn_it->second->getFunction(); + + // If calling an external function then CallGraphNode + // will not be associated with any function. + if (!CalledF) + continue; + + // Issue diagnostic if interrupt function is being called. + if (PAN::isISR(CalledF->getSection())) { + vector<string> Values; + Values.push_back(CalledF->getName().str()); + reportError("Interrupt function (%0) can't be called", Values); + } + + if (CalledF->getSection().find("ML") != string::npos) { + // Function is alternatively marked. It should be a shared one. + // Create IL copy. Passing called function as first argument + // and the caller as the second argument. 
+ + // Before making IL copy, first ensure that this function has a + // body. If the function does not have a body, it can't be cloned. + // Such a case may occur when the function has been declared + // in the C source code but its body exists in an assembly file. + if (!CalledF->isDeclaration()) { + Function *cf = cloneFunction(CalledF); + remapAllSites(CGN->getFunction(), CalledF, cf); + } else { + // It is called only from ISR. Still mark it as we need this info + // in code gen while calling intrinsics. Function is not marked. + CalledF->setSection("IL"); + } + } + // Before going any further clone all the shared functions reachable + // by current function. + cloneSharedFunctions(cgn_it->second); + } // end of loop of all called functions. +} + +// Clone the given function and return it. +// Note: it uses the ValueMap member of the class, which is already populated +// by cloneAutos by the time we reach here. +// FIXME: Should we just pass ValueMap's ref as a parameter here, rather +// than keeping the ValueMap as a member? +Function * +PIC16Cloner::cloneFunction(Function *OrgF) { + Function *ClonedF; + + // See if we already cloned it. Return that. + cloned_map_iterator cm_it = ClonedFunctionMap.find(OrgF); + if(cm_it != ClonedFunctionMap.end()) { + ClonedF = cm_it->second; + return ClonedF; + } + + // Clone does not exist. + // First clone the autos, and populate ValueMap. + CloneAutos(OrgF); + + // Now create the clone. + ClonedF = CloneFunction(OrgF, ValueMap); + + // The new function should be for interrupt line. Therefore should have + // the name suffixed with IL and section attribute marked with IL. + ClonedF->setName(PAN::getCloneFnName(OrgF->getName())); + ClonedF->setSection("IL"); + + // Add the newly created function to the module. + OrgF->getParent()->getFunctionList().push_back(ClonedF); + + // Update the ClonedFunctionMap to record this cloning activity. 
+ ClonedFunctionMap[OrgF] = ClonedF; + + return ClonedF; +} + + +// Remap the call sites of shared functions, that are in IL. +// Change the IL call site of a shared function to its clone. +// +void PIC16Cloner:: +remapAllSites(Function *Caller, Function *OrgF, Function *Clone) { + // First find the caller to update. If the caller itself is cloned + // then use the cloned caller. Otherwise use it. + cloned_map_iterator cm_it = ClonedFunctionMap.find(Caller); + if (cm_it != ClonedFunctionMap.end()) + Caller = cm_it->second; + + // For the lack of a better call site finding mechanism, iterate over + // all insns to find the uses of original fn. + for (Function::iterator BI = Caller->begin(); BI != Caller->end(); ++BI) { + BasicBlock &BB = *BI; + for (BasicBlock::iterator II = BB.begin(); II != BB.end(); ++II) { + if (II->getNumOperands() > 0 && II->getOperand(0) == OrgF) + II->setOperand(0, Clone); + } + } +} + + + diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h new file mode 100644 index 0000000..24c1152 --- /dev/null +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h @@ -0,0 +1,83 @@ +//===-- PIC16Cloner.h - PIC16 LLVM Cloner for shared functions --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains declaration of a cloner class clone all functions that +// are shared between the main line code (ML) and interrupt line code (IL). +// +//===----------------------------------------------------------------------===// + +#ifndef PIC16CLONER_H +#define PIC16CLONER_H + +#include "llvm/ADT/DenseMap.h" + +using namespace llvm; +using std::vector; +using std::string; +using std::map; + +namespace llvm { + // forward classes. 
+ class Value; + class Function; + class Module; + class ModulePass; + class CallGraph; + class CallGraphNode; + class AnalysisUsage; + + class PIC16Cloner : public ModulePass { + public: + static char ID; // Class identification + PIC16Cloner() : ModulePass(&ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<CallGraph>(); + } + virtual bool runOnModule(Module &M); + + private: // Functions + // Mark reachable functions for the MainLine or InterruptLine. + void markCallGraph(CallGraphNode *CGN, string StringMark); + + // Clone auto variables of function specified. + void CloneAutos(Function *F); + + // Clone the body of a function. + Function *cloneFunction(Function *F); + + // Clone all shared functions. + void cloneSharedFunctions(CallGraphNode *isrCGN); + + // Remap all call sites to the shared function. + void remapAllSites(Function *Caller, Function *OrgF, Function *Clone); + + // Error reporting for PIC16Pass + void reportError(string ErrorString, vector<string> &Values); + void reportError(string ErrorString); + + private: //data + // Records if the interrupt function has already been found. + // If more than one interrupt function is found then an error + // should be thrown. + bool foundISR; + + // This ValueMap maps the auto variables of the original functions with + // the corresponding cloned auto variable of the cloned function. + // This value map is passed during the function cloning so that all the + // uses of auto variables be updated properly. + DenseMap<const Value*, Value*> ValueMap; + + // Map of a already cloned functions. 
+ map<Function *, Function *> ClonedFunctionMap; + typedef map<Function *, Function *>::iterator cloned_map_iterator; + }; +} // End of anonymous namespace + +#endif diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp index 197c398..5ecb6aa 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp +++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp @@ -24,27 +24,27 @@ using namespace llvm; namespace llvm { - char PIC16FrameOverlay::ID = 0; - ModulePass *createPIC16OverlayPass() { return new PIC16FrameOverlay(); } + char PIC16Overlay::ID = 0; + ModulePass *createPIC16OverlayPass() { return new PIC16Overlay(); } } -void PIC16FrameOverlay::getAnalysisUsage(AnalysisUsage &AU) const { +void PIC16Overlay::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<CallGraph>(); } -void PIC16FrameOverlay::DFSTraverse(CallGraphNode *CGN, unsigned Depth) { +void PIC16Overlay::DFSTraverse(CallGraphNode *CGN, unsigned Depth) { // Do not set any color for external calling node. if (Depth != 0 && CGN->getFunction()) { unsigned Color = getColor(CGN->getFunction()); // Handle indirectly called functions - if (Color >= PIC16Overlay::StartIndirectCallColor || - Depth >= PIC16Overlay::StartIndirectCallColor) { + if (Color >= PIC16OVERLAY::StartIndirectCallColor || + Depth >= PIC16OVERLAY::StartIndirectCallColor) { // All functions called from an indirectly called function are given // an unique color. 
- if (Color < PIC16Overlay::StartIndirectCallColor && - Depth >= PIC16Overlay::StartIndirectCallColor) + if (Color < PIC16OVERLAY::StartIndirectCallColor && + Depth >= PIC16OVERLAY::StartIndirectCallColor) setColor(CGN->getFunction(), Depth); for (unsigned int i = 0; i < CGN->size(); i++) @@ -65,7 +65,7 @@ void PIC16FrameOverlay::DFSTraverse(CallGraphNode *CGN, unsigned Depth) { DFSTraverse((*CGN)[i], Depth+1); } -unsigned PIC16FrameOverlay::ModifyDepthForInterrupt(CallGraphNode *CGN, +unsigned PIC16Overlay::ModifyDepthForInterrupt(CallGraphNode *CGN, unsigned Depth) { Function *Fn = CGN->getFunction(); @@ -81,7 +81,7 @@ unsigned PIC16FrameOverlay::ModifyDepthForInterrupt(CallGraphNode *CGN, return Depth; } -void PIC16FrameOverlay::setColor(Function *Fn, unsigned Color) { +void PIC16Overlay::setColor(Function *Fn, unsigned Color) { std::string Section = ""; if (Fn->hasSection()) Section = Fn->getSection(); @@ -119,7 +119,7 @@ void PIC16FrameOverlay::setColor(Function *Fn, unsigned Color) { Fn->setSection(Section); } -unsigned PIC16FrameOverlay::getColor(Function *Fn) { +unsigned PIC16Overlay::getColor(Function *Fn) { int Color = 0; if (!Fn->hasSection()) return 0; @@ -150,7 +150,7 @@ unsigned PIC16FrameOverlay::getColor(Function *Fn) { return Color; } -bool PIC16FrameOverlay::runOnModule(Module &M) { +bool PIC16Overlay::runOnModule(Module &M) { CallGraph &CG = getAnalysis<CallGraph>(); CallGraphNode *ECN = CG.getExternalCallingNode(); @@ -164,7 +164,7 @@ bool PIC16FrameOverlay::runOnModule(Module &M) { return false; } -void PIC16FrameOverlay::MarkIndirectlyCalledFunctions(Module &M) { +void PIC16Overlay::MarkIndirectlyCalledFunctions(Module &M) { // If the use of a function is not a call instruction then this // function might be called indirectly. In that case give it // an unique color. 
diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h index d70c4e7..5a2551f 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h +++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h @@ -1,4 +1,4 @@ -//===-- PIC16FrameOverlay.h - Interface for PIC16 Frame Overlay -*- C++ -*-===// +//===-- PIC16Overlay.h - Interface for PIC16 Frame Overlay -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,30 +14,35 @@ #ifndef PIC16FRAMEOVERLAY_H #define PIC16FRAMEOVERLAY_H -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Pass.h" -#include "llvm/CallGraphSCCPass.h" using std::string; using namespace llvm; namespace llvm { - namespace PIC16Overlay { + // Forward declarations. + class Function; + class Module; + class ModulePass; + class AnalysisUsage; + class CallGraphNode; + class CallGraph; + + namespace PIC16OVERLAY { enum OverlayConsts { StartInterruptColor = 200, StartIndirectCallColor = 300 }; } - class PIC16FrameOverlay : public ModulePass { + class PIC16Overlay : public ModulePass { std::string OverlayStr; unsigned InterruptDepth; unsigned IndirectCallColor; public: static char ID; // Class identification - PIC16FrameOverlay() : ModulePass(&ID) { + PIC16Overlay() : ModulePass(&ID) { OverlayStr = "Overlay="; - InterruptDepth = PIC16Overlay::StartInterruptColor; - IndirectCallColor = PIC16Overlay::StartIndirectCallColor; + InterruptDepth = PIC16OVERLAY::StartInterruptColor; + IndirectCallColor = PIC16OVERLAY::StartIndirectCallColor; } virtual void getAnalysisUsage(AnalysisUsage &AU) const; diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 3a15f7e..66dfd4b 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -257,7 +257,7 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) { case PPC::STWX: case PPC::STWX8: case PPC::STWUX: case PPC::STW: case PPC::STW8: - case PPC::STWU: case PPC::STWU8: 
+ case PPC::STWU: case PPC::STVEWX: case PPC::STFIWX: case PPC::STWBRX: diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 004997f..9d79c0d 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -156,10 +156,6 @@ namespace { SDValue BuildSDIVSequence(SDNode *N); SDValue BuildUDIVSequence(SDNode *N); - /// InstructionSelect - This callback is invoked by - /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. - virtual void InstructionSelect(); - void InsertVRSaveCode(MachineFunction &MF); virtual const char *getPassName() const { @@ -184,14 +180,6 @@ private: }; } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void PPCDAGToDAGISel::InstructionSelect() { - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - /// InsertVRSaveCode - Once the entire function has been instruction selected, /// all virtual registers are created and all machine instructions are built, /// check to see if we need to save/restore VRSAVE. If so, do it. 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index e73af56..3d81afa 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1334,7 +1334,7 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) { false, false, false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__trampoline_setup", PtrVT), - Args, DAG, dl, DAG.GetOrdering(Chain.getNode())); + Args, DAG, dl); SDValue Ops[] = { CallResult.first, CallResult.second }; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 219efb9..a0781b9 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -366,7 +366,7 @@ def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>; def ADDME8 : XOForm_3<31, 234, 0, (outs G8RC:$rT), (ins G8RC:$rA), "addme $rT, $rA", IntGeneral, - [(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>; + [(set G8RC:$rT, (adde G8RC:$rA, -1))]>; def ADDZE8 : XOForm_3<31, 202, 0, (outs G8RC:$rT), (ins G8RC:$rA), "addze $rT, $rA", IntGeneral, [(set G8RC:$rT, (adde G8RC:$rA, 0))]>; @@ -375,7 +375,7 @@ def SUBFE8 : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>; def SUBFME8 : XOForm_3<31, 232, 0, (outs G8RC:$rT), (ins G8RC:$rA), "subfme $rT, $rA", IntGeneral, - [(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>; + [(set G8RC:$rT, (sube -1, G8RC:$rA))]>; def SUBFZE8 : XOForm_3<31, 200, 0, (outs G8RC:$rT), (ins G8RC:$rA), "subfze $rT, $rA", IntGeneral, [(set G8RC:$rT, (sube 0, G8RC:$rA))]>; @@ -635,13 +635,6 @@ def STHU8 : DForm_1<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; -def STWU8 : DForm_1<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS, - symbolLo:$ptroff, 
ptr_rc:$ptrreg), - "stwu $rS, $ptroff($ptrreg)", LdStGeneral, - [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, - iaddroff:$ptroff))]>, - RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; - def STDU : DSForm_1<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, s16immX4:$ptroff, ptr_rc:$ptrreg), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 3db623a..9895bea 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -19,6 +19,7 @@ #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -73,8 +74,7 @@ bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI, destReg = MI.getOperand(0).getReg(); return true; } - } else if (oc == PPC::FMRS || oc == PPC::FMRD || - oc == PPC::FMRSD) { // fmr r1, r2 + } else if (oc == PPC::FMR || oc == PPC::FMRSD) { // fmr r1, r2 assert(MI.getNumOperands() >= 2 && MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && @@ -344,10 +344,9 @@ bool PPCInstrInfo::copyRegToReg(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg); } else if (DestRC == PPC::G8RCRegisterClass) { BuildMI(MBB, MI, DL, get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg); - } else if (DestRC == PPC::F4RCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::FMRS), DestReg).addReg(SrcReg); - } else if (DestRC == PPC::F8RCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::FMRD), DestReg).addReg(SrcReg); + } else if (DestRC == PPC::F4RCRegisterClass || + DestRC == PPC::F8RCRegisterClass) { + BuildMI(MBB, MI, DL, get(PPC::FMR), DestReg).addReg(SrcReg); } else if (DestRC == PPC::CRRCRegisterClass) { BuildMI(MBB, MI, DL, get(PPC::MCRF), DestReg).addReg(SrcReg); } else if (DestRC == PPC::VRRCRegisterClass) { @@ -688,33 +687,21 @@ 
MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, getUndefRegState(isUndef)), FrameIndex); } - } else if (Opc == PPC::FMRD) { - if (OpNum == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFD)) - .addReg(InReg, - getKillRegState(isKill) | - getUndefRegState(isUndef)), - FrameIndex); - } else { // move -> load - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFD)) - .addReg(OutReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)), - FrameIndex); - } - } else if (Opc == PPC::FMRS) { + } else if (Opc == PPC::FMR || Opc == PPC::FMRSD) { + // The register may be F4RC or F8RC, and that determines the memory op. + unsigned OrigReg = MI->getOperand(OpNum).getReg(); + // We cannot tell the register class from a physreg alone. + if (TargetRegisterInfo::isPhysicalRegister(OrigReg)) + return NULL; + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(OrigReg); + const bool is64 = RC == PPC::F8RCRegisterClass; + if (OpNum == 0) { // move -> store unsigned InReg = MI->getOperand(1).getReg(); bool isKill = MI->getOperand(1).isKill(); bool isUndef = MI->getOperand(1).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFS)) + NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), + get(is64 ? 
PPC::STFD : PPC::STFS)) .addReg(InReg, getKillRegState(isKill) | getUndefRegState(isUndef)), @@ -723,7 +710,8 @@ MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned OutReg = MI->getOperand(0).getReg(); bool isDead = MI->getOperand(0).isDead(); bool isUndef = MI->getOperand(0).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFS)) + NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), + get(is64 ? PPC::LFD : PPC::LFS)) .addReg(OutReg, RegState::Define | getDeadRegState(isDead) | @@ -749,7 +737,7 @@ bool PPCInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, else if ((Opc == PPC::OR8 && MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) return true; - else if (Opc == PPC::FMRD || Opc == PPC::FMRS) + else if (Opc == PPC::FMR || Opc == PPC::FMRSD) return true; return false; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 842f8ee..845cd8f 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1019,20 +1019,16 @@ let Uses = [RM] in { } } -/// FMR is split into 3 versions, one for 4/8 byte FP, and one for extending. +/// FMR is split into 2 versions, one for 4/8 byte FP, and one for extending. /// /// Note that these are defined as pseudo-ops on the PPC970 because they are /// often coalesced away and we don't want the dispatch group builder to think /// that they will fill slots (which could cause the load of a LSU reject to /// sneak into a d-group with a store). 
-def FMRS : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB), - "fmr $frD, $frB", FPGeneral, - []>, // (set F4RC:$frD, F4RC:$frB) - PPC970_Unit_Pseudo; -def FMRD : XForm_26<63, 72, (outs F8RC:$frD), (ins F8RC:$frB), - "fmr $frD, $frB", FPGeneral, - []>, // (set F8RC:$frD, F8RC:$frB) - PPC970_Unit_Pseudo; +def FMR : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB), + "fmr $frD, $frB", FPGeneral, + []>, // (set F4RC:$frD, F4RC:$frB) + PPC970_Unit_Pseudo; def FMRSD : XForm_26<63, 72, (outs F8RC:$frD), (ins F4RC:$frB), "fmr $frD, $frB", FPGeneral, [(set F8RC:$frD, (fextend F4RC:$frB))]>, @@ -1215,7 +1211,7 @@ def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>; def ADDME : XOForm_3<31, 234, 0, (outs GPRC:$rT), (ins GPRC:$rA), "addme $rT, $rA", IntGeneral, - [(set GPRC:$rT, (adde GPRC:$rA, immAllOnes))]>; + [(set GPRC:$rT, (adde GPRC:$rA, -1))]>; def ADDZE : XOForm_3<31, 202, 0, (outs GPRC:$rT), (ins GPRC:$rA), "addze $rT, $rA", IntGeneral, [(set GPRC:$rT, (adde GPRC:$rA, 0))]>; @@ -1224,7 +1220,7 @@ def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>; def SUBFME : XOForm_3<31, 232, 0, (outs GPRC:$rT), (ins GPRC:$rA), "subfme $rT, $rA", IntGeneral, - [(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>; + [(set GPRC:$rT, (sube -1, GPRC:$rA))]>; def SUBFZE : XOForm_3<31, 200, 0, (outs GPRC:$rT), (ins GPRC:$rA), "subfze $rT, $rA", IntGeneral, [(set GPRC:$rT, (sube 0, GPRC:$rA))]>; @@ -1480,11 +1476,11 @@ def : Pat<(extloadf32 xaddr:$src), (FMRSD (LFSX xaddr:$src))>; // Memory barriers -def : Pat<(membarrier (i32 imm:$ll), - (i32 imm:$ls), - (i32 imm:$sl), - (i32 imm:$ss), - (i32 imm:$device)), +def : Pat<(membarrier (i32 imm /*ll*/), + (i32 imm /*ls*/), + (i32 imm /*sl*/), + (i32 imm /*ss*/), + (i32 imm /*device*/)), (SYNC)>; include "PPCInstrAltivec.td" diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td 
b/lib/Target/PowerPC/PPCRegisterInfo.td index 049e893..1cb7340 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -287,10 +287,8 @@ def GPRC : RegisterClass<"PPC", [i32], 32, GPRCClass::allocation_order_begin(const MachineFunction &MF) const { // 32-bit SVR4 ABI: r2 is reserved for the OS. // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer. - if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin()) - return begin()+1; - - return begin(); + // Darwin: R2 is reserved for CR save/restore sequence. + return begin()+1; } GPRCClass::iterator GPRCClass::allocation_order_end(const MachineFunction &MF) const { @@ -325,10 +323,8 @@ def G8RC : RegisterClass<"PPC", [i64], 64, G8RCClass::iterator G8RCClass::allocation_order_begin(const MachineFunction &MF) const { // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer. - if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin()) - return begin()+1; - - return begin(); + // Darwin: r2 is reserved for CR save/restore sequence. + return begin()+1; } G8RCClass::iterator G8RCClass::allocation_order_end(const MachineFunction &MF) const { diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt index e49bda0..3465779 100644 --- a/lib/Target/PowerPC/README.txt +++ b/lib/Target/PowerPC/README.txt @@ -889,7 +889,7 @@ entry: ; recognize a more elaborate tree than a simple SETxx. 
define double @test_FNEG_sel(double %A, double %B, double %C) { - %D = sub double -0.000000e+00, %A ; <double> [#uses=1] + %D = fsub double -0.000000e+00, %A ; <double> [#uses=1] %Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1] %E = select i1 %Cond, double %B, double %C ; <double> [#uses=1] ret double %E diff --git a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp index 9a2ce6b..d6b45be 100644 --- a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp @@ -56,6 +56,9 @@ namespace { unsigned AsmVariant, const char *ExtraCode); bool printGetPCX(const MachineInstr *MI, unsigned OpNo); + + virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) + const; }; } // end of anonymous namespace @@ -140,18 +143,19 @@ bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum) { break; } + unsigned mfNum = MI->getParent()->getParent()->getFunctionNumber(); unsigned bbNum = MI->getParent()->getNumber(); - O << '\n' << ".LLGETPCH" << bbNum << ":\n"; - O << "\tcall\t.LLGETPC" << bbNum << '\n' ; + O << '\n' << ".LLGETPCH" << mfNum << '_' << bbNum << ":\n"; + O << "\tcall\t.LLGETPC" << mfNum << '_' << bbNum << '\n' ; O << "\t sethi\t" - << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), " + << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum << ")), " << operand << '\n' ; - O << ".LLGETPC" << bbNum << ":\n" ; + O << ".LLGETPC" << mfNum << '_' << bbNum << ":\n" ; O << "\tor\t" << operand - << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), " + << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum << ")), " << operand << '\n'; O << "\tadd\t" << operand << ", %o7, " << operand << '\n'; @@ -197,6 +201,38 @@ bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } +/// isBlockOnlyReachableByFallthough - Return true if the basic block has +/// exactly one predecessor 
and the control transfer mechanism between +/// the predecessor and this block is a fall-through. +/// Override AsmPrinter implementation to handle delay slots +bool SparcAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) + const { + // If this is a landing pad, it isn't a fall through. If it has no preds, + // then nothing falls through to it. + if (MBB->isLandingPad() || MBB->pred_empty()) + return false; + + // If there isn't exactly one predecessor, it can't be a fall through. + MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI; + ++PI2; + if (PI2 != MBB->pred_end()) + return false; + + // The predecessor has to be immediately before this block. + const MachineBasicBlock *Pred = *PI; + + if (!Pred->isLayoutSuccessor(MBB)) + return false; + + // Check if the last terminator is an unconditional branch + MachineBasicBlock::const_iterator I = Pred->end(); + while( I != Pred->begin() && !(--I)->getDesc().isTerminator() ) + ; /* Noop */ + return I == Pred->end() || !I->getDesc().isBarrier(); +} + + + // Force static initialization. extern "C" void LLVMInitializeSparcAsmPrinter() { RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget); diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt index cc24abf..b4991fe 100644 --- a/lib/Target/Sparc/README.txt +++ b/lib/Target/Sparc/README.txt @@ -56,3 +56,4 @@ int %t1(int %a, int %b) { leaf fns. * Fill delay slots +* Implement JIT support diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index e1b3299..a7d1805 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -35,7 +35,6 @@ class SparcDAGToDAGISel : public SelectionDAGISel { /// make the right decision when generating code for different targets. 
const SparcSubtarget &Subtarget; SparcTargetMachine& TM; - MachineBasicBlock *CurBB; public: explicit SparcDAGToDAGISel(SparcTargetMachine &tm) : SelectionDAGISel(tm), @@ -56,10 +55,6 @@ public: char ConstraintCode, std::vector<SDValue> &OutOps); - /// InstructionSelect - This callback is invoked by - /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "SPARC DAG->DAG Pattern Instruction Selection"; } @@ -72,17 +67,8 @@ private: }; } // end anonymous namespace -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void SparcDAGToDAGISel::InstructionSelect() { - CurBB = BB; - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - CurDAG->RemoveDeadNodes(); -} - SDNode* SparcDAGToDAGISel::getGlobalBaseReg() { - MachineFunction *MF = CurBB->getParent(); + MachineFunction *MF = BB->getParent(); unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF); return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); } diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h index e457235..56d8708 100644 --- a/lib/Target/Sparc/SparcMachineFunctionInfo.h +++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h @@ -22,7 +22,7 @@ namespace llvm { unsigned GlobalBaseReg; public: SparcMachineFunctionInfo() : GlobalBaseReg(0) {} - SparcMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0) {} + explicit SparcMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0) {} unsigned getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 7f0d9fb..8152e1d 100644 --- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -100,8 
+100,6 @@ namespace { Lowering(*TM.getTargetLowering()), Subtarget(*TM.getSubtargetImpl()) { } - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "SystemZ DAG->DAG Pattern Instruction Selection"; } @@ -152,10 +150,6 @@ namespace { bool MatchAddressBase(SDValue N, SystemZRRIAddressMode &AM); bool MatchAddressRI(SDValue N, SystemZRRIAddressMode &AM, bool is12Bit); - - #ifndef NDEBUG - unsigned Indent; - #endif }; } // end anonymous namespace @@ -599,35 +593,17 @@ bool SystemZDAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, return false; } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void SystemZDAGToDAGISel::InstructionSelect() { - // Codegen the basic block. - DEBUG(errs() << "===== Instruction selection begins:\n"); - DEBUG(Indent = 0); - SelectRoot(*CurDAG); - DEBUG(errs() << "===== Instruction selection ends:\n"); - - CurDAG->RemoveDeadNodes(); -} - SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); DebugLoc dl = Node->getDebugLoc(); unsigned Opcode = Node->getOpcode(); // Dump information about the Node being selected - DEBUG(errs().indent(Indent) << "Selecting: "; - Node->dump(CurDAG); - errs() << "\n"); - DEBUG(Indent += 2); + DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n"); // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { - DEBUG(errs().indent(Indent-2) << "== "; - Node->dump(CurDAG); - errs() << "\n"); - DEBUG(Indent -= 2); + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); return NULL; // Already selected. 
} @@ -693,9 +669,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { MVT::i32)); ReplaceUses(SDValue(Node, 0), SDValue(Div, 0)); - DEBUG(errs().indent(Indent-2) << "=> "; - Result->dump(CurDAG); - errs() << "\n"); + DEBUG(errs() << "=> "; Result->dump(CurDAG); errs() << "\n"); } // Copy the remainder (even subreg) result, if it is needed. @@ -708,15 +682,9 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { MVT::i32)); ReplaceUses(SDValue(Node, 1), SDValue(Rem, 0)); - DEBUG(errs().indent(Indent-2) << "=> "; - Result->dump(CurDAG); - errs() << "\n"); + DEBUG(errs() << "=> "; Result->dump(CurDAG); errs() << "\n"); } -#ifndef NDEBUG - Indent -= 2; -#endif - return NULL; } case ISD::UDIVREM: { @@ -782,9 +750,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { CurDAG->getTargetConstant(SubRegIdx, MVT::i32)); ReplaceUses(SDValue(Node, 0), SDValue(Div, 0)); - DEBUG(errs().indent(Indent-2) << "=> "; - Result->dump(CurDAG); - errs() << "\n"); + DEBUG(errs() << "=> "; Result->dump(CurDAG); errs() << "\n"); } // Copy the remainder (even subreg) result, if it is needed. 
@@ -796,15 +762,9 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { CurDAG->getTargetConstant(SubRegIdx, MVT::i32)); ReplaceUses(SDValue(Node, 1), SDValue(Rem, 0)); - DEBUG(errs().indent(Indent-2) << "=> "; - Result->dump(CurDAG); - errs() << "\n"); + DEBUG(errs() << "=> "; Result->dump(CurDAG); errs() << "\n"); } -#ifndef NDEBUG - Indent -= 2; -#endif - return NULL; } } @@ -812,14 +772,12 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { // Select the default instruction SDNode *ResNode = SelectCode(Node); - DEBUG(errs().indent(Indent-2) << "=> "; + DEBUG(errs() << "=> "; if (ResNode == NULL || ResNode == Node) Node->dump(CurDAG); else ResNode->dump(CurDAG); errs() << "\n"; ); - DEBUG(Indent -= 2); - return ResNode; } diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 336e20e..f46840c 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -58,7 +58,7 @@ def FMOV64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src), []>; } -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def FMOV32rm : Pseudo<(outs FP32:$dst), (ins rriaddr12:$src), "le\t{$dst, $src}", [(set FP32:$dst, (load rriaddr12:$src))]>; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 1891bba..a44f6d9 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -257,7 +257,7 @@ def MOV64rihi32 : RILI<0xEC0, (outs GR64:$dst), (ins i64imm:$src), [(set GR64:$dst, i64hi32:$src)]>; } -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOV32rm : RXI<0x58, (outs GR32:$dst), (ins rriaddr12:$src), "l\t{$dst, $src}", diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index e47d419..fd6e330 100644 --- 
a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -33,7 +33,8 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo { public: SystemZMachineFunctionInfo() : CalleeSavedFrameSize(0) {} - SystemZMachineFunctionInfo(MachineFunction &MF) : CalleeSavedFrameSize(0) {} + explicit SystemZMachineFunctionInfo(MachineFunction &MF) + : CalleeSavedFrameSize(0) {} unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index 295b30f..9a16808 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -580,7 +580,7 @@ const IntegerType *TargetData::getIntPtrType(LLVMContext &C) const { uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices, unsigned NumIndices) const { const Type *Ty = ptrTy; - assert(isa<PointerType>(Ty) && "Illegal argument for getIndexedOffset()"); + assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()"); uint64_t Result = 0; generic_gep_type_iterator<Value* const*> diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 0c105e9..82619c7 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -307,7 +307,7 @@ getSymbolForDwarfReference(const MCSymbol *Sym, MachineModuleInfo *MMI, switch (Encoding & 0xF0) { default: - llvm_report_error("Do not support this DWARF encoding yet!"); + llvm_report_error("We do not support this DWARF encoding yet!"); break; case dwarf::DW_EH_PE_absptr: // Do nothing special diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp index 1a35a49..734a545 100644 --- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp @@ -73,7 +73,7 @@ void 
X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo) { O << '$' << Op.getImm(); if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) - *CommentStream << format("imm = 0x%X\n", Op.getImm()); + *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 61f26a7..eed3b45 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv) tablegen(X86GenSubtarget.inc -gen-subtarget) set(sources + X86AsmBackend.cpp X86CodeEmitter.cpp X86COFFMachineModuleInfo.cpp X86ELFWriterInfo.cpp diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 19eb05e..e5f84e8 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -67,8 +67,8 @@ no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit [ %tmp.34.i18, %no_exit.i7 ] %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] - %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 + %tmp.28.i16 = fadd double %tmp.0.0.0.i10, 0.000000e+00 + %tmp.34.i18 = fadd double %tmp.0.1.0.i9, 0.000000e+00 br i1 false, label %Compute_Tree.exit23, label %no_exit.i7 Compute_Tree.exit23: ; preds = %no_exit.i7 @@ -97,7 +97,7 @@ pcmp/pand/pandn/por to do a selection instead of a conditional branch: double %X(double %Y, double %Z, double %A, double %B) { %C = setlt double %A, %B - %z = add double %Z, 0.0 ;; select operand is not a load + %z = fadd double %Z, 0.0 ;; select operand is not a load %D = select bool %C, double %Y, double %z ret double %D } @@ -545,7 +545,7 @@ eliminates a constant pool load. 
For example, consider: define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { entry: - %tmp6 = sub float -0.000000e+00, %z.1 ; <float> [#uses=1] + %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly ret i64 %tmp20 } diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 3c6138b..d4545a6 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -227,11 +227,6 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. //===---------------------------------------------------------------------===// -Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / -FR64 to VR128. - -//===---------------------------------------------------------------------===// - Adding to the list of cmp / test poor codegen issues: int test(__m128 *A, __m128 *B) { diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1a1e447..ba0ee6c 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -19,13 +19,15 @@ namespace llvm { -class X86TargetMachine; class FunctionPass; -class MachineCodeEmitter; +class JITCodeEmitter; +class MCAssembler; class MCCodeEmitter; class MCContext; -class JITCodeEmitter; +class MachineCodeEmitter; class Target; +class TargetAsmBackend; +class X86TargetMachine; class formatted_raw_ostream; /// createX86ISelDag - This pass converts a legalized DAG into a @@ -55,6 +57,9 @@ MCCodeEmitter *createX86_32MCCodeEmitter(const Target &, TargetMachine &TM, MCCodeEmitter *createX86_64MCCodeEmitter(const Target &, TargetMachine &TM, MCContext &Ctx); +TargetAsmBackend *createX86_32AsmBackend(const Target &, MCAssembler &); +TargetAsmBackend *createX86_64AsmBackend(const Target &, MCAssembler &); + /// createX86EmitCodeToMemory - Returns a pass that converts a register /// allocated function into raw machine code in a dynamically /// allocated chunk of memory. 
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp new file mode 100644 index 0000000..e6654ef --- /dev/null +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -0,0 +1,34 @@ +//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetAsmBackend.h" +#include "X86.h" +#include "llvm/Target/TargetRegistry.h" +#include "llvm/Target/TargetAsmBackend.h" +using namespace llvm; + +namespace { + +class X86AsmBackend : public TargetAsmBackend { +public: + X86AsmBackend(const Target &T, MCAssembler &A) + : TargetAsmBackend(T) {} +}; + +} + +TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, + MCAssembler &A) { + return new X86AsmBackend(T, A); +} + +TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, + MCAssembler &A) { + return new X86AsmBackend(T, A); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 69a9d60..17366ee 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1161,6 +1161,8 @@ bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) { if (!X86SelectAddress(DI->getAddress(), AM)) return false; const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. addFullAddress(BuildMI(MBB, DL, II), AM).addImm(0). 
addMetadata(DI->getVariable()); return true; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 7b349f6..08030e0 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -12,15 +12,6 @@ // //===----------------------------------------------------------------------===// -// Force NDEBUG on in any optimized build on Darwin. -// -// FIXME: This is a huge hack, to work around ridiculously awful compile times -// on this file with gcc-4.2 on Darwin, in Release mode. -#if (!defined(__llvm__) && defined(__APPLE__) && \ - defined(__OPTIMIZE__) && !defined(NDEBUG)) -#define NDEBUG -#endif - #define DEBUG_TYPE "x86-isel" #include "X86.h" #include "X86InstrBuilder.h" @@ -177,15 +168,11 @@ namespace { return "X86 DAG->DAG Instruction Selection"; } - /// InstructionSelect - This callback is invoked by - /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. - virtual void InstructionSelect(); - virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF); virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const; - virtual bool IsLegalToFold(SDValue N, SDNode *U, SDNode *Root) const; + virtual void PreprocessISelDAG(); // Include the pieces autogenerated from the target description. 
#include "X86GenDAGISel.inc" @@ -209,18 +196,17 @@ namespace { SDValue &Scale, SDValue &Index, SDValue &Disp); bool SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp); - bool SelectScalarSSELoad(SDNode *Op, SDValue Pred, - SDValue N, SDValue &Base, SDValue &Scale, + bool SelectScalarSSELoad(SDNode *Root, SDValue N, + SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, - SDValue &InChain, SDValue &OutChain); + SDValue &NodeWithChain); + bool TryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - void PreprocessForRMW(); - void PreprocessForFPConvert(); - + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, @@ -296,10 +282,6 @@ namespace { const X86InstrInfo *getInstrInfo() { return getTargetMachine().getInstrInfo(); } - -#ifndef NDEBUG - unsigned Indent; -#endif }; } @@ -367,65 +349,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } - -bool X86DAGToDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root) const { - if (OptLevel == CodeGenOpt::None) return false; - - // Proceed to 'generic' cycle finder code - return SelectionDAGISel::IsLegalToFold(N, U, Root); -} - -/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand -/// and move load below the TokenFactor. Replace store's chain operand with -/// load's chain result. 
-static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load, - SDValue Store, SDValue TF) { - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i) - if (Load.getNode() == TF.getOperand(i).getNode()) - Ops.push_back(Load.getOperand(0)); - else - Ops.push_back(TF.getOperand(i)); - SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size()); - SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF, - Load.getOperand(1), - Load.getOperand(2)); - CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1), - Store.getOperand(2), Store.getOperand(3)); -} - -/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG. The -/// chain produced by the load must only be used by the store's chain operand, -/// otherwise this may produce a cycle in the DAG. -/// -static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address, - SDValue &Load) { - if (N.getOpcode() == ISD::BIT_CONVERT) { - if (!N.hasOneUse()) - return false; - N = N.getOperand(0); - } - - LoadSDNode *LD = dyn_cast<LoadSDNode>(N); - if (!LD || LD->isVolatile()) - return false; - if (LD->getAddressingMode() != ISD::UNINDEXED) - return false; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD) - return false; - - if (N.hasOneUse() && - LD->hasNUsesOfValue(1, 1) && - N.getOperand(1) == Address && - LD->isOperandOf(Chain.getNode())) { - Load = N; - return true; - } - return false; -} - /// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain /// operand and move load below the call's chain operand. static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load, @@ -489,51 +412,14 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain) { return false; } - -/// PreprocessForRMW - Preprocess the DAG to make instruction selection better. -/// This is only run if not in -O0 mode. 
-/// This allows the instruction selector to pick more read-modify-write -/// instructions. This is a common case: -/// -/// [Load chain] -/// ^ -/// | -/// [Load] -/// ^ ^ -/// | | -/// / \- -/// / | -/// [TokenFactor] [Op] -/// ^ ^ -/// | | -/// \ / -/// \ / -/// [Store] -/// -/// The fact the store's chain operand != load's chain will prevent the -/// (store (op (load))) instruction from being selected. We can transform it to: -/// -/// [Load chain] -/// ^ -/// | -/// [TokenFactor] -/// ^ -/// | -/// [Load] -/// ^ ^ -/// | | -/// | \- -/// | | -/// | [Op] -/// | ^ -/// | | -/// \ / -/// \ / -/// [Store] -void X86DAGToDAGISel::PreprocessForRMW() { +void X86DAGToDAGISel::PreprocessISelDAG() { + OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize); + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - if (I->getOpcode() == X86ISD::CALL) { + E = CurDAG->allnodes_end(); I != E; ) { + SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + + if (OptLevel != CodeGenOpt::None && N->getOpcode() == X86ISD::CALL) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. 
/// @@ -553,85 +439,23 @@ void X86DAGToDAGISel::PreprocessForRMW() { /// \ / /// \ / /// [CALL] - SDValue Chain = I->getOperand(0); - SDValue Load = I->getOperand(1); + SDValue Chain = N->getOperand(0); + SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain)) continue; - MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain); + MoveBelowCallSeqStart(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } - - if (!ISD::isNON_TRUNCStore(I)) - continue; - SDValue Chain = I->getOperand(0); - - if (Chain.getNode()->getOpcode() != ISD::TokenFactor) - continue; - - SDValue N1 = I->getOperand(1); - SDValue N2 = I->getOperand(2); - if ((N1.getValueType().isFloatingPoint() && - !N1.getValueType().isVector()) || - !N1.hasOneUse()) - continue; - - bool RModW = false; - SDValue Load; - unsigned Opcode = N1.getNode()->getOpcode(); - switch (Opcode) { - case ISD::ADD: - case ISD::MUL: - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case ISD::ADDC: - case ISD::ADDE: - case ISD::VECTOR_SHUFFLE: { - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - RModW = isRMWLoad(N10, Chain, N2, Load); - if (!RModW) - RModW = isRMWLoad(N11, Chain, N2, Load); - break; - } - case ISD::SUB: - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - case ISD::ROTL: - case ISD::ROTR: - case ISD::SUBC: - case ISD::SUBE: - case X86ISD::SHLD: - case X86ISD::SHRD: { - SDValue N10 = N1.getOperand(0); - RModW = isRMWLoad(N10, Chain, N2, Load); - break; - } - } - - if (RModW) { - MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain); - ++NumLoadMoved; - checkForCycles(I); - } - } -} - - -/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend -/// nodes that target the FP stack to be store and load to the stack. This is a -/// gross hack. We would like to simply mark these as being illegal, but when -/// we do that, legalize produces these when it expands calls, then expands -/// these in the same legalize pass. 
We would like dag combine to be able to -/// hack on these between the call expansion and the node legalization. As such -/// this pass basically does "really late" legalization of these inline with the -/// X86 isel pass. -void X86DAGToDAGISel::PreprocessForFPConvert() { - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + + // Lower fpround and fpextend nodes that target the FP stack to be store and + // load to the stack. This is a gross hack. We would like to simply mark + // these as being illegal, but when we do that, legalize produces these when + // it expands calls, then expands these in the same legalize pass. We would + // like dag combine to be able to hack on these between the call expansion + // and the node legalization. As such this pass basically does "really + // late" legalization of these inline with the X86 isel pass. + // FIXME: This should only happen when not compiled with -O0. if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) continue; @@ -687,30 +511,6 @@ void X86DAGToDAGISel::PreprocessForFPConvert() { } } -/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel -/// when it has created a SelectionDAG for us to codegen. -void X86DAGToDAGISel::InstructionSelect() { - const Function *F = MF->getFunction(); - OptForSize = F->hasFnAttr(Attribute::OptimizeForSize); - - if (OptLevel != CodeGenOpt::None) - PreprocessForRMW(); - - // FIXME: This should only happen when not compiled with -O0. - PreprocessForFPConvert(); - - // Codegen the basic block. 
-#ifndef NDEBUG - DEBUG(dbgs() << "===== Instruction selection begins:\n"); - Indent = 0; -#endif - SelectRoot(*CurDAG); -#ifndef NDEBUG - DEBUG(dbgs() << "===== Instruction selection ends:\n"); -#endif - - CurDAG->RemoveDeadNodes(); -} /// EmitSpecialCodeForMain - Emit any code that needs to be executed only in /// the main function. @@ -1317,22 +1117,24 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base, /// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to /// match a load whose top elements are either undef or zeros. The load flavor /// is derived from the type of N, which is either v4f32 or v2f64. -bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Op, SDValue Pred, +/// +/// We also return: +/// PatternChainNode: this is the matched node that has a chain input and +/// output. +bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, - SDValue &InChain, - SDValue &OutChain) { + SDValue &PatternNodeWithChain) { if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) { - InChain = N.getOperand(0).getValue(1); - if (ISD::isNON_EXTLoad(InChain.getNode()) && - InChain.getValue(0).hasOneUse() && - IsProfitableToFold(N, Pred.getNode(), Op) && - IsLegalToFold(N, Pred.getNode(), Op)) { - LoadSDNode *LD = cast<LoadSDNode>(InChain); - if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + PatternNodeWithChain = N.getOperand(0); + if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && + PatternNodeWithChain.hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root)) { + LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp,Segment)) return false; - OutChain = LD->getChain(); return true; } } @@ -1344,13 +1146,14 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Op, SDValue Pred, 
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && N.getOperand(0).getNode()->hasOneUse() && ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && - N.getOperand(0).getOperand(0).hasOneUse()) { + N.getOperand(0).getOperand(0).hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root)) { // Okay, this is a zero extending load. Fold it. LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; - OutChain = LD->getChain(); - InChain = SDValue(LD, 1); + PatternNodeWithChain = SDValue(LD, 0); return true; } return false; @@ -1424,7 +1227,6 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N, bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp) { - assert(Op->getOpcode() == X86ISD::TLSADDR); assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); @@ -1451,11 +1253,12 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (ISD::isNON_EXTLoad(N.getNode()) && - IsProfitableToFold(N, P, P) && - IsLegalToFold(N, P, P)) - return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment); - return false; + if (!ISD::isNON_EXTLoad(N.getNode()) || + !IsProfitableToFold(N, P, P) || + !IsLegalToFold(N, P, P)) + return false; + + return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment); } /// getGlobalBaseReg - Return an SDNode that returns the value of @@ -1558,7 +1361,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_DEC16m; else if (isSub) { if (isCN) { - if (Predicate_i16immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) 
Opc = X86::LOCK_SUB16mi8; else Opc = X86::LOCK_SUB16mi; @@ -1566,7 +1369,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_SUB16mr; } else { if (isCN) { - if (Predicate_i16immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) Opc = X86::LOCK_ADD16mi8; else Opc = X86::LOCK_ADD16mi; @@ -1581,7 +1384,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_DEC32m; else if (isSub) { if (isCN) { - if (Predicate_i32immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) Opc = X86::LOCK_SUB32mi8; else Opc = X86::LOCK_SUB32mi; @@ -1589,7 +1392,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_SUB32mr; } else { if (isCN) { - if (Predicate_i32immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) Opc = X86::LOCK_ADD32mi8; else Opc = X86::LOCK_ADD32mi; @@ -1605,7 +1408,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { else if (isSub) { Opc = X86::LOCK_SUB64mr; if (isCN) { - if (Predicate_i64immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) Opc = X86::LOCK_SUB64mi8; else if (Predicate_i64immSExt32(Val.getNode())) Opc = X86::LOCK_SUB64mi32; @@ -1613,7 +1416,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { } else { Opc = X86::LOCK_ADD64mr; if (isCN) { - if (Predicate_i64immSExt8(Val.getNode())) + if (Predicate_immSext8(Val.getNode())) Opc = X86::LOCK_ADD64mi8; else if (Predicate_i64immSExt32(Val.getNode())) Opc = X86::LOCK_ADD64mi32; @@ -1710,24 +1513,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { unsigned Opcode = Node->getOpcode(); DebugLoc dl = Node->getDebugLoc(); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent, ' ') << "Selecting: "; - Node->dump(CurDAG); - dbgs() << '\n'; - }); - Indent += 2; -#endif + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); if (Node->isMachineOpcode()) { -#ifndef NDEBUG - DEBUG({ - dbgs() << 
std::string(Indent-2, ' ') << "== "; - Node->dump(CurDAG); - dbgs() << '\n'; - }); - Indent -= 2; -#endif + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); return NULL; // Already selected. } @@ -1823,13 +1612,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent-2, ' ') << "=> "; - Result.getNode()->dump(CurDAG); - dbgs() << '\n'; - }); -#endif + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { @@ -1852,19 +1635,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { InFlag = Result.getValue(2); } ReplaceUses(SDValue(Node, 1), Result); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent-2, ' ') << "=> "; - Result.getNode()->dump(CurDAG); - dbgs() << '\n'; - }); -#endif + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } -#ifndef NDEBUG - Indent -= 2; -#endif - return NULL; } @@ -1979,13 +1752,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent-2, ' ') << "=> "; - Result.getNode()->dump(CurDAG); - dbgs() << '\n'; - }); -#endif + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the remainder (high) result, if it is needed. 
if (!SDValue(Node, 1).use_empty()) { @@ -2009,19 +1776,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { InFlag = Result.getValue(2); } ReplaceUses(SDValue(Node, 1), Result); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent-2, ' ') << "=> "; - Result.getNode()->dump(CurDAG); - dbgs() << '\n'; - }); -#endif + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } - -#ifndef NDEBUG - Indent -= 2; -#endif - return NULL; } @@ -2134,17 +1890,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *ResNode = SelectCode(Node); -#ifndef NDEBUG - DEBUG({ - dbgs() << std::string(Indent-2, ' ') << "=> "; - if (ResNode == NULL || ResNode == Node) - Node->dump(CurDAG); - else - ResNode->dump(CurDAG); - dbgs() << '\n'; - }); - Indent -= 2; -#endif + DEBUG(dbgs() << "=> "; + if (ResNode == NULL || ResNode == Node) + Node->dump(CurDAG); + else + ResNode->dump(CurDAG); + dbgs() << '\n'); return ResNode; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9974d8c..e2b8193 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -73,7 +73,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { case X86Subtarget::isDarwin: if (TM.getSubtarget<X86Subtarget>().is64Bit()) return new X8664_MachoTargetObjectFile(); - return new X8632_MachoTargetObjectFile(); + return new TargetLoweringObjectFileMachO(); case X86Subtarget::isELF: if (TM.getSubtarget<X86Subtarget>().is64Bit()) return new X8664_ELFTargetObjectFile(TM); @@ -990,6 +990,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); @@ -1743,7 +1744,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, // Calculate the new stack slot for the return 
address. int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, @@ -2376,7 +2377,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { // Set up a frame object for the return address. uint64_t SlotSize = TD->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, - true, false); + false, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -4816,8 +4817,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && isa<ConstantSDNode>(N2)) { - unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB - : X86ISD::PINSRW; + unsigned Opc; + if (VT == MVT::v8i16) + Opc = X86ISD::PINSRW; + else if (VT == MVT::v4i16) + Opc = X86ISD::MMX_PINSRW; + else if (VT == MVT::v16i8) + Opc = X86ISD::PINSRB; + else + Opc = X86ISD::PINSRB; + // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. if (N1.getValueType() != MVT::i32) @@ -4868,7 +4877,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); - return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); + return DAG.getNode(VT == MVT::v8i16 ? 
X86ISD::PINSRW : X86ISD::MMX_PINSRW, + dl, VT, N0, N1, N2); } return SDValue(); } @@ -5244,7 +5254,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); - SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, MVT::i8)); SDValue Hi, Lo; @@ -5873,26 +5883,31 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. -static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC, +static SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) { + SDValue Op0 = And.getOperand(0); + SDValue Op1 = And.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + SDValue LHS, RHS; - if (Op0.getOperand(1).getOpcode() == ISD::SHL) { - if (ConstantSDNode *Op010C = - dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) - if (Op010C->getZExtValue() == 1) { - LHS = Op0.getOperand(0); - RHS = Op0.getOperand(1).getOperand(1); + if (Op1.getOpcode() == ISD::SHL) { + if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) + if (And10C->getZExtValue() == 1) { + LHS = Op0; + RHS = Op1.getOperand(1); } - } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { - if (ConstantSDNode *Op000C = - dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) - if (Op000C->getZExtValue() == 1) { - LHS = Op0.getOperand(1); - RHS = Op0.getOperand(0).getOperand(1); + } else if (Op0.getOpcode() == ISD::SHL) { + if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) + if (And00C->getZExtValue() == 1) { + LHS = Op1; + RHS = Op0.getOperand(1); } - } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { - ConstantSDNode *AndRHS = 
cast<ConstantSDNode>(Op0.getOperand(1)); - SDValue AndLHS = Op0.getOperand(0); + } else if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + SDValue AndLHS = Op0; if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); @@ -5942,6 +5957,21 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { return NewSetCC; } + // Look for "(setcc) == / != 1" to avoid unnecessary setcc. + if (Op0.getOpcode() == X86ISD::SETCC && + Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (Invert) + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) @@ -6444,8 +6474,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl, - DAG.GetOrdering(Chain.getNode())); + DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); return CallResult.second; } @@ -7662,6 +7691,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::FMAX: return 
"X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -7769,7 +7799,7 @@ bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); if (NumBits1 <= NumBits2) return false; - return Subtarget->is64Bit() || NumBits1 < 64; + return true; } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { @@ -7779,7 +7809,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { unsigned NumBits2 = VT2.getSizeInBits(); if (NumBits1 <= NumBits2) return false; - return Subtarget->is64Bit() || NumBits1 < 64; + return true; } bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { @@ -8792,10 +8822,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(2); // If we have SSE[12] support, try to form min/max nodes. SSE min/max - // instructions have the peculiarity that if either operand is a NaN, - // they chose what we call the RHS operand (and as such are not symmetric). - // It happens that this matches the semantics of the common C idiom - // x<y?x:y and related forms, so we can recognize these cases. + // instructions match the semantics of the common C idiom x<y?x:y but not + // x<=y?x:y, because of how they handle negative zero (which can be + // ignored in unsafe-math mode). if (Subtarget->hasSSE2() && (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && Cond.getOpcode() == ISD::SETCC) { @@ -8803,36 +8832,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode = 0; // Check for x CC y ? x : y. - if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: - // This can be a min if we can prove that at least one of the operands - // is not a nan. 
- if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(RHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(LHS)) + // Converting this to a min would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!FiniteOnlyFPMath() && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { + if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; + std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: - // This can be a min if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(LHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(RHS)) - break; - } + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly. + if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: - // This can be a min, but if either operand is a NaN we need it to - // preserve the original LHS. + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: @@ -8841,32 +8868,29 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETOGE: - // This can be a max if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(LHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(RHS)) - break; - } + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly. 
+ if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: - // This can be a max if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(RHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(LHS)) + // Converting this to a max would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!FiniteOnlyFPMath() && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { + if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; + std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: - // This can be a max, but if either operand is a NaN we need it to - // preserve the original LHS. + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: @@ -8875,36 +8899,33 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; } // Check for x CC y ? y : x -- a min/max with reversed arms. - } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: - // This can be a min if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(RHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(LHS)) + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. 
+ if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { + if (!FiniteOnlyFPMath() && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; + std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: - // This can be a min if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(LHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(RHS)) - break; - } + // Converting this to a min would handle NaNs incorrectly. + if (!UnsafeFPMath && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: - // This can be a min, but if either operand is a NaN we need it to - // preserve the original LHS. + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOGT: case ISD::SETGT: @@ -8913,32 +8934,28 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETULT: - // This can be a max if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(LHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. - std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(RHS)) - break; - } + // Converting this to a max would handle NaNs incorrectly. + if (!FiniteOnlyFPMath() && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: - // This can be a max if we can prove that at least one of the operands - // is not a nan. - if (!FiniteOnlyFPMath()) { - if (DAG.isKnownNeverNaN(RHS)) { - // Put the potential NaN in the RHS so that SSE will preserve it. 
- std::swap(LHS, RHS); - } else if (!DAG.isKnownNeverNaN(LHS)) + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { + if (!FiniteOnlyFPMath() && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; + std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: - // This can be a max, but if either operand is a NaN we need it to - // preserve the original LHS. + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); case ISD::SETOLT: case ISD::SETLT: @@ -9157,16 +9174,64 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// PerformANDCombine - Look for SSE and instructions of this form: +/// (and x, (build_vector signbit,signbit,signbit,signbit)). If there +/// exists a use of a build_vector that's the bitwise complement of the mask, +/// then transform the node to +/// (and (xor x, (build_vector -1,-1,-1,-1)), (build_vector ~sb,~sb,~sb,~sb)). 
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + if (!VT.isVector() || !VT.isInteger()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + if (N0.getOpcode() == ISD::XOR || !N1.hasOneUse()) + return SDValue(); + + if (N1.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + SmallVector<SDValue, 8> Mask; + Mask.reserve(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + SDValue Arg = N1.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) { + Mask.push_back(Arg); + continue; + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Arg); + if (!C) + return SDValue(); + if (!C->getAPIntValue().isSignBit() && + !C->getAPIntValue().isMaxSignedValue()) + return SDValue(); + Mask.push_back(DAG.getConstant(~C->getAPIntValue(), EltVT)); + } + N1 = DAG.getNode(ISD::BUILD_VECTOR, N1.getDebugLoc(), VT, + &Mask[0], NumElts); + if (!N1.use_empty()) { + unsigned Bits = EltVT.getSizeInBits(); + Mask.clear(); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(DAG.getConstant(APInt::getAllOnesValue(Bits), EltVT)); + SDValue NewMask = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + VT, &Mask[0], NumElts); + return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, + N0, NewMask), N1); + } + } + + return SDValue(); +} /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - if (DAG.getMachineFunction(). 
- getFunction()->hasFnAttr(Attribute::OptimizeForSize)) - return SDValue(); - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -9305,7 +9370,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { - unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); + unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } @@ -9690,6 +9755,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case ISD::AND: return PerformANDCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index cf0eb40..ffaf1cf 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -180,7 +180,7 @@ namespace llvm { /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. - PINSRW, + PINSRW, MMX_PINSRW, /// PSHUFB - Shuffle 16 8-bit values within a vector. PSHUFB, diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 4ea3739..8462255 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -59,10 +59,11 @@ def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr", // Pattern fragments. // -def i64immSExt8 : PatLeaf<(i64 imm), [{ - // i64immSExt8 predicate - True if the 64-bit immediate fits in a 8-bit - // sign extended field. 
- return (int64_t)N->getZExtValue() == (int8_t)N->getZExtValue(); +def i64immSExt8 : PatLeaf<(i64 immSext8)>; + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((unsigned)N->getZExtValue()); }]>; def i64immSExt32 : PatLeaf<(i64 imm), [{ @@ -71,6 +72,7 @@ def i64immSExt32 : PatLeaf<(i64 imm), [{ return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue(); }]>; + def i64immZExt32 : PatLeaf<(i64 imm), [{ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsignedsign extended field. @@ -325,7 +327,7 @@ def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (load addr:$src))]>; @@ -556,7 +558,7 @@ def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2), addr:$dst)]>; def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2), "adc{q}\t{$src2, $dst|$dst, $src2}", - [(store (adde (load addr:$dst), i64immSExt8:$src2), + [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>; } // Uses = [EFLAGS] @@ -1981,7 +1983,7 @@ def : Pat<(and GR64:$src, i64immZExt32:$imm), (i64 0), (AND32ri (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit), - imm:$imm), + (i32 (GetLo32XForm imm:$imm))), x86_subreg_32bit)>; // r & (2^32-1) ==> movz @@ -2105,34 +2107,34 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; // (shl x (and y, 63)) ==> (shl x, y) -def : Pat<(shl GR64:$src1, (and CL:$amt, 63)), +def : Pat<(shl GR64:$src1, (and CL, 63)), (SHL64rCL GR64:$src1)>; -def : Pat<(store (shl (loadi64 
addr:$dst), (and CL:$amt, 63)), addr:$dst), +def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SHL64mCL addr:$dst)>; -def : Pat<(srl GR64:$src1, (and CL:$amt, 63)), +def : Pat<(srl GR64:$src1, (and CL, 63)), (SHR64rCL GR64:$src1)>; -def : Pat<(store (srl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst), +def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SHR64mCL addr:$dst)>; -def : Pat<(sra GR64:$src1, (and CL:$amt, 63)), +def : Pat<(sra GR64:$src1, (and CL, 63)), (SAR64rCL GR64:$src1)>; -def : Pat<(store (sra (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst), +def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SAR64mCL addr:$dst)>; // Double shift patterns -def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)), +def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm)), (SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>; def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1), - GR64:$src2, (i8 imm:$amt2)), addr:$dst), + GR64:$src2, (i8 imm)), addr:$dst), (SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>; -def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)), +def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm)), (SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>; def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1), - GR64:$src2, (i8 imm:$amt2)), addr:$dst), + GR64:$src2, (i8 imm)), addr:$dst), (SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>; // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index e22a903..ae24bfb 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -397,7 +397,7 @@ def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), let canFoldAsLoad = 1 in { def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP32:$dst, (loadf32 addr:$src))]>; -let isReMaterializable = 1, mayHaveSideEffects = 1 in +let isReMaterializable = 1 in def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP64:$dst, (loadf64 addr:$src))]>; def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index a0d0312..39bda04 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -276,11 +276,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVDQArr, X86::MOVDQAmr, 0, 16 }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 }, { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 }, - { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0, 0 }, - { X86::MOVSDrr, X86::MOVSDmr, 0, 0 }, { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 }, { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 }, - { X86::MOVSSrr, X86::MOVSSmr, 0, 0 }, { X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 }, { X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 }, { X86::MUL16r, X86::MUL16m, 1, 0 }, @@ -389,12 +386,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, { X86::MOVDQArr, X86::MOVDQArm, 16 }, - { X86::MOVSD2PDrr, X86::MOVSD2PDrm, 0 }, - { X86::MOVSDrr, X86::MOVSDrm, 0 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 }, { X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 }, - { X86::MOVSS2PSrr, X86::MOVSS2PSrm, 0 }, - { X86::MOVSSrr, X86::MOVSSrm, 0 }, { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, @@ -682,23 +675,20 @@ bool 
X86InstrInfo::isMoveInstr(const MachineInstr& MI, case X86::MOV16rr: case X86::MOV32rr: case X86::MOV64rr: - case X86::MOVSSrr: - case X86::MOVSDrr: // FP Stack register class copies case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080: case X86::MOV_Fp3264: case X86::MOV_Fp3280: case X86::MOV_Fp6432: case X86::MOV_Fp8032: - + + // Note that MOVSSrr and MOVSDrr are not considered copies. FR32 and FR64 + // copies are done with FsMOVAPSrr and FsMOVAPDrr. + case X86::FsMOVAPSrr: case X86::FsMOVAPDrr: case X86::MOVAPSrr: case X86::MOVAPDrr: case X86::MOVDQArr: - case X86::MOVSS2PSrr: - case X86::MOVSD2PDrr: - case X86::MOVPS2SSrr: - case X86::MOVPD2SDrr: case X86::MMX_MOVQ64rr: assert(MI.getNumOperands() >= 2 && MI.getOperand(0).isReg() && @@ -1083,7 +1073,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, case X86::MOV8r0: Opc = X86::MOV8ri; break; case X86::MOV16r0: Opc = X86::MOV16ri; break; case X86::MOV32r0: Opc = X86::MOV32ri; break; - case X86::MOV64r0: Opc = X86::MOV64ri; break; + case X86::MOV64r0: Opc = X86::MOV64ri64i32; break; } Clone = false; } @@ -1860,7 +1850,7 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB, CommonRC = SrcRC; else if (!DestRC->hasSubClass(SrcRC)) { // Neither of GR64_NOREX or GR64_NOSP is a superclass of the other, - // but we want to copy then as GR64. Similarly, for GR32_NOREX and + // but we want to copy them as GR64. Similarly, for GR32_NOREX and // GR32_NOSP, copy as GR32. if (SrcRC->hasSuperClass(&X86::GR64RegClass) && DestRC->hasSuperClass(&X86::GR64RegClass)) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 25cd297..cfe71a5 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -343,18 +343,37 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. 
COND_PE def X86_COND_S : PatLeaf<(i8 15)>; -def i16immSExt8 : PatLeaf<(i16 imm), [{ - // i16immSExt8 predicate - True if the 16-bit immediate fits in a 8-bit - // sign extended field. - return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue(); +def immSext8 : PatLeaf<(imm), [{ + return N->getSExtValue() == (int8_t)N->getSExtValue(); }]>; -def i32immSExt8 : PatLeaf<(i32 imm), [{ - // i32immSExt8 predicate - True if the 32-bit immediate fits in a 8-bit - // sign extended field. - return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue(); +def i16immSExt8 : PatLeaf<(i16 immSext8)>; +def i32immSExt8 : PatLeaf<(i32 immSext8)>; + +/// Load patterns: these constraint the match to the right address space. +def dsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + if (PT->getAddressSpace() > 255) + return false; + return true; +}]>; + +def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + return PT->getAddressSpace() == 256; + return false; +}]>; + +def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + return PT->getAddressSpace() == 257; + return false; }]>; + // Helper fragments for loads. // It's always safe to treat a anyext i16 load as a i32 load if the i16 is // known to be 32-bit aligned or better. Ditto for i8 to i16. 
@@ -372,8 +391,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ return false; }]>; -def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), -[{ +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ LoadSDNode *LD = cast<LoadSDNode>(N); if (const Value *Src = LD->getSrcValue()) if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) @@ -399,72 +417,11 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ return false; }]>; -def nvloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ - LoadSDNode *LD = cast<LoadSDNode>(N); - if (const Value *Src = LD->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - if (LD->isVolatile()) - return false; - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType == ISD::NON_EXTLOAD) - return true; - if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 4; - return false; -}]>; - -def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - return PT->getAddressSpace() == 256; - return false; -}]>; - -def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - return PT->getAddressSpace() == 257; - return false; -}]>; - -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr)), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - return true; -}]>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr)), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = 
dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - return true; -}]>; - -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr)), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - return true; -}]>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr)), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - return true; -}]>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr)), [{ - if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue()) - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) - if (PT->getAddressSpace() > 255) - return false; - return true; -}]>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (dsload node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (dsload node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (dsload node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (dsload node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (dsload node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; @@ -1037,7 +994,7 @@ def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (loadi8 addr:$src))]>; @@ -1071,7 +1028,7 @@ def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), 
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; let mayLoad = 1, - canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in + canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; @@ -1156,7 +1113,7 @@ def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), } // neverHasSideEffects // unsigned division/remainder -let Defs = [AL,AH,EFLAGS], Uses = [AX] in +let Defs = [AX,EFLAGS], Uses = [AX] in def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "div{b}\t$src", []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in @@ -4442,12 +4399,6 @@ def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>; def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>; -// (and (i32 load), 255) -> (zextload i8) -def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))), - (MOVZX32rm8 addr:$src)>; -def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 65535))), - (MOVZX32rm16 addr:$src)>; - //===----------------------------------------------------------------------===// // Some peepholes //===----------------------------------------------------------------------===// @@ -4543,43 +4494,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; // (shl x (and y, 31)) ==> (shl x, y) -def : Pat<(shl GR8:$src1, (and CL:$amt, 31)), +def : Pat<(shl GR8:$src1, (and CL, 31)), (SHL8rCL GR8:$src1)>; -def : Pat<(shl GR16:$src1, (and CL:$amt, 31)), +def : Pat<(shl GR16:$src1, (and CL, 31)), (SHL16rCL GR16:$src1)>; -def : Pat<(shl GR32:$src1, (and CL:$amt, 31)), +def : Pat<(shl GR32:$src1, (and CL, 31)), (SHL32rCL GR32:$src1)>; -def : Pat<(store (shl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), (SHL8mCL addr:$dst)>; -def : Pat<(store 
(shl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), (SHL16mCL addr:$dst)>; -def : Pat<(store (shl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), (SHL32mCL addr:$dst)>; -def : Pat<(srl GR8:$src1, (and CL:$amt, 31)), +def : Pat<(srl GR8:$src1, (and CL, 31)), (SHR8rCL GR8:$src1)>; -def : Pat<(srl GR16:$src1, (and CL:$amt, 31)), +def : Pat<(srl GR16:$src1, (and CL, 31)), (SHR16rCL GR16:$src1)>; -def : Pat<(srl GR32:$src1, (and CL:$amt, 31)), +def : Pat<(srl GR32:$src1, (and CL, 31)), (SHR32rCL GR32:$src1)>; -def : Pat<(store (srl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), (SHR8mCL addr:$dst)>; -def : Pat<(store (srl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), (SHR16mCL addr:$dst)>; -def : Pat<(store (srl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), (SHR32mCL addr:$dst)>; -def : Pat<(sra GR8:$src1, (and CL:$amt, 31)), +def : Pat<(sra GR8:$src1, (and CL, 31)), (SAR8rCL GR8:$src1)>; -def : Pat<(sra GR16:$src1, (and CL:$amt, 31)), +def : Pat<(sra GR16:$src1, (and CL, 31)), (SAR16rCL GR16:$src1)>; -def : Pat<(sra GR32:$src1, (and CL:$amt, 31)), +def : Pat<(sra GR32:$src1, (and CL, 31)), (SAR32rCL GR32:$src1)>; -def : Pat<(store (sra (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst), (SAR8mCL addr:$dst)>; -def : Pat<(store (sra (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst), (SAR16mCL addr:$dst)>; -def : Pat<(store (sra (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst), +def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst), 
(SAR32mCL addr:$dst)>; // (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c) @@ -4600,11 +4551,11 @@ def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))), addr:$dst), (SHRD32mrCL addr:$dst, GR32:$src2)>; -def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)), +def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm/*:$amt2*/)), (SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>; def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1), - GR32:$src2, (i8 imm:$amt2)), addr:$dst), + GR32:$src2, (i8 imm/*:$amt2*/)), addr:$dst), (SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>; // (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c) @@ -4625,11 +4576,11 @@ def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))), addr:$dst), (SHLD32mrCL addr:$dst, GR32:$src2)>; -def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)), +def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm/*:$amt2*/)), (SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>; def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1), - GR32:$src2, (i8 imm:$amt2)), addr:$dst), + GR32:$src2, (i8 imm/*:$amt2*/)), addr:$dst), (SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>; // (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c) @@ -4650,11 +4601,11 @@ def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))), addr:$dst), (SHRD16mrCL addr:$dst, GR16:$src2)>; -def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)), +def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm/*:$amt2*/)), (SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>; def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1), - GR16:$src2, (i8 imm:$amt2)), addr:$dst), + GR16:$src2, (i8 imm/*:$amt2*/)), addr:$dst), (SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>; // (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c) @@ -4675,11 +4626,11 @@ def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))), 
addr:$dst), (SHLD16mrCL addr:$dst, GR16:$src2)>; -def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)), +def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm/*:$amt2*/)), (SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>; def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1), - GR16:$src2, (i8 imm:$amt2)), addr:$dst), + GR16:$src2, (i8 imm/*:$amt2*/)), addr:$dst), (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>; // (anyext (setcc_carry)) -> (setcc_carry) diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 89f020c..c8e0723 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -141,7 +141,7 @@ def MMX_MOVD64rrv164 : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (load_mmx addr:$src))]>; @@ -426,13 +426,15 @@ def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src), // Extract / Insert -def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; -def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; +def MMX_X86pinsrw : SDNode<"X86ISD::MMX_PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v4i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; + def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1), + [(set GR32:$dst, (X86pextrw (v4i16 VR64:$src1), (iPTR imm:$src2)))]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg, @@ -597,13 +599,6 
@@ let AddedComplexity = 10 in { (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>; } -// Patterns to perform vector shuffling with a zeroed out vector. -let AddedComplexity = 20 in { - def : Pat<(bc_v2i32 (mmx_unpckl immAllZerosV, - (v2i32 (scalar_to_vector (load_mmx addr:$src))))), - (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>; -} - // Some special case PANDN patterns. // FIXME: Get rid of these. def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9b2140f..2743dba 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -160,6 +160,32 @@ def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>; def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>; def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>; +// MOVNT Support +// Like 'store', but requires the non-temporal bit to be set +def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal(); + return false; +}]>; + +def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED && + ST->getAlignment() >= 16; + return false; +}]>; + +def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && + ST->getAlignment() < 16; + return false; +}]>; + def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; @@ -344,18 +370,56 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in { // SSE1 Instructions 
//===----------------------------------------------------------------------===// -// Move Instructions -let neverHasSideEffects = 1 in -def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movss\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +// Move Instructions. Register-to-register movss is not used for FR32 +// register copies because it's a partial register update; FsMOVAPSrr is +// used instead. Register-to-register movss is not modeled as an INSERT_SUBREG +// because INSERT_SUBREG requires that the insert be implementable in terms of +// a copy, and just mentioned, we don't use movss for copies. +let Constraints = "$src1 = $dst" in +def MOVSSrr : SSI<0x10, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), + "movss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (movl VR128:$src1, (scalar_to_vector FR32:$src2)))]>; + +// Extract the low 32-bit value from one vector and insert it into another. +let AddedComplexity = 15 in +def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, + (EXTRACT_SUBREG (v4f32 VR128:$src2), x86_subreg_ss))>; + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, x86_subreg_ss)>; + +// Loading from memory automatically zeroing upper bits. +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), "movss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (loadf32 addr:$src))]>; + +// MOVSSrm zeros the high parts of the register; represent this +// with SUBREG_TO_REG. 
+let AddedComplexity = 20 in { +def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>; +def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>; +def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>; +} + +// Store scalar value to memory. def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>; +// Extract and store. +def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>; + // Conversion instructions def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), "cvttss2si\t{$src, $dst|$dst, $src}", @@ -518,7 +582,7 @@ def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), // Alias instruction to load FR32 from f128mem using movaps. Upper bits are // disregarded. 
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), "movaps\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; @@ -715,7 +779,7 @@ defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin, let neverHasSideEffects = 1 in def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "movaps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>; @@ -727,7 +791,7 @@ def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), let neverHasSideEffects = 1 in def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "movups\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (loadv4f32 addr:$src))]>; @@ -736,7 +800,7 @@ def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), [(store (v4f32 VR128:$src), addr:$dst)]>; // Intrinsic forms of MOVUPS load and store -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "movups\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; @@ -796,9 +860,9 @@ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), let AddedComplexity = 20 in { def : Pat<(v4f32 (movddup VR128:$src, (undef))), - (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; + 
(MOVLHPSrr VR128:$src, VR128:$src)>; def : Pat<(v2i64 (movddup VR128:$src, (undef))), - (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; + (MOVLHPSrr VR128:$src, VR128:$src)>; } @@ -1013,10 +1077,33 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; // Non-temporal stores -def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; + +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQ_64mr VR128:$src, addr:$dst)>; + +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; + +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; +} + // Load, store, and memory fence def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>; @@ -1035,84 +1122,73 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; -let Predicates = [HasSSE1] in { - def : Pat<(v2i64 immAllZerosV), (V_SET0)>; - def : Pat<(v8i16 immAllZerosV), (V_SET0)>; - def : Pat<(v16i8 
immAllZerosV), (V_SET0)>; - def : Pat<(v2f64 immAllZerosV), (V_SET0)>; - def : Pat<(v4f32 immAllZerosV), (V_SET0)>; -} - -// FR32 to 128-bit vector conversion. -let isAsCheapAsAMove = 1 in -def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src), - "movss\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v4f32 (scalar_to_vector FR32:$src)))]>; -def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src), - "movss\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>; - -// FIXME: may not be able to eliminate this movss with coalescing the src and -// dest register classes are different. We really want to write this pattern -// like this: -// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), -// (f32 FR32:$src)>; -let isAsCheapAsAMove = 1 in -def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src), - "movss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (vector_extract (v4f32 VR128:$src), - (iPTR 0)))]>; -def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), - "movss\t{$src, $dst|$dst, $src}", - [(store (f32 (vector_extract (v4f32 VR128:$src), - (iPTR 0))), addr:$dst)]>; - - -// Move to lower bits of a VR128, leaving upper bits alone. -// Three operand (but two address) aliases. 
-let Constraints = "$src1 = $dst" in { -let neverHasSideEffects = 1 in - def MOVLSS2PSrr : SSI<0x10, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $dst|$dst, $src2}", []>; - - let AddedComplexity = 15 in - def MOVLPSrr : SSI<0x10, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "movss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (movl VR128:$src1, VR128:$src2)))]>; -} +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +def : Pat<(v4f32 immAllZerosV), (V_SET0)>; -// Move to lower bits of a VR128 and zeroing upper bits. -// Loading from memory automatically zeroing upper bits. -let AddedComplexity = 20 in -def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src), - "movss\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector - (loadf32 addr:$src))))))]>; - -def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (MOVZSS2PSrm addr:$src)>; +def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>; //===---------------------------------------------------------------------===// // SSE2 Instructions //===---------------------------------------------------------------------===// -// Move Instructions -let neverHasSideEffects = 1 in -def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +// Move Instructions. Register-to-register movsd is not used for FR64 +// register copies because it's a partial register update; FsMOVAPDrr is +// used instead. 
Register-to-register movsd is not modeled as an INSERT_SUBREG +// because INSERT_SUBREG requires that the insert be implementable in terms of +// a copy, and just mentioned, we don't use movsd for copies. +let Constraints = "$src1 = $dst" in +def MOVSDrr : SDI<0x10, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, FR64:$src2), + "movsd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (movl VR128:$src1, (scalar_to_vector FR64:$src2)))]>; + +// Extract the low 64-bit value from one vector and insert it into another. +let AddedComplexity = 15 in +def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, + (EXTRACT_SUBREG (v2f64 VR128:$src2), x86_subreg_sd))>; + +// Implicitly promote a 64-bit scalar to a vector. +def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, x86_subreg_sd)>; + +// Loading from memory automatically zeroing upper bits. +let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 20 in def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), "movsd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (loadf64 addr:$src))]>; + +// MOVSDrm zeros the high parts of the register; represent this +// with SUBREG_TO_REG. +let AddedComplexity = 20 in { +def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>; +def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>; +def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>; +def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>; +def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>; +} + +// Store scalar value to memory. 
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; +// Extract and store. +def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), x86_subreg_sd))>; + // Conversion instructions def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src), "cvttsd2si\t{$src, $dst|$dst, $src}", @@ -1166,7 +1242,8 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), Requires<[HasSSE2, OptForSize]>; def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>; + (CVTSS2SDrr (MOVSSrm addr:$src))>, + Requires<[HasSSE2, OptForSpeed]>; // Match intrinsics which expect XMM operand(s). def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), @@ -1285,7 +1362,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), // Alias instruction to load FR64 from f128mem using movapd. Upper bits are // disregarded. 
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), "movapd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; @@ -1483,7 +1560,7 @@ defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin, let neverHasSideEffects = 1 in def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "movapd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>; @@ -2298,17 +2375,30 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; // Non-temporal stores -def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; -def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; -def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), +def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; +def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, TB, Requires<[HasSSE2]>; +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPDmr : 
PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; + +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTDQmr VR128:$src, addr:$dst)>; +} + // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, @@ -2321,11 +2411,11 @@ def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; //TODO: custom lower this so as to never even generate the noop -def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss), +def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 0)), (NOOP)>; def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; -def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss), +def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), (MFENCE)>; // Alias instructions that map zero vector to pxor / xorp* for sse. @@ -2337,17 +2427,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; -// FR64 to 128-bit vector conversion. 
-let isAsCheapAsAMove = 1 in -def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2f64 (scalar_to_vector FR64:$src)))]>; -def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>; - def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2376,20 +2455,9 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)]>; -// FIXME: may not be able to eliminate this movss with coalescing the src and -// dest register classes are different. We really want to write this pattern -// like this: -// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), -// (f32 FR32:$src)>; -let isAsCheapAsAMove = 1 in -def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (vector_extract (v2f64 VR128:$src), - (iPTR 0)))]>; -def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), - (iPTR 0))), addr:$dst)]>; +def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), x86_subreg_sd))>; + def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2406,44 +2474,11 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; - -// Move to lower bits of a VR128, leaving upper bits alone. -// Three operand (but two address) aliases. 
-let Constraints = "$src1 = $dst" in { - let neverHasSideEffects = 1 in - def MOVLSD2PDrr : SDI<0x10, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, FR64:$src2), - "movsd\t{$src2, $dst|$dst, $src2}", []>; - - let AddedComplexity = 15 in - def MOVLPDrr : SDI<0x10, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "movsd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2f64 (movl VR128:$src1, VR128:$src2)))]>; -} - // Store / copy lower 64-bits of a XMM register. def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -// Move to lower bits of a VR128 and zeroing upper bits. -// Loading from memory automatically zeroing upper bits. -let AddedComplexity = 20 in { -def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2f64 (X86vzmovl (v2f64 (scalar_to_vector - (loadf64 addr:$src))))))]>; - -def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (MOVZSD2PDrm addr:$src)>; -def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (MOVZSD2PDrm addr:$src)>; -def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>; -} - // movd / movq to XMM register zero-extends let AddedComplexity = 15 in { def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), @@ -2989,13 +3024,15 @@ let Predicates = [HasSSE2] in { let AddedComplexity = 15 in { // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. 
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>; + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>; + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>; + (MOVSSrr (v4f32 (V_SET0)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss)))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>; + (MOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v4i32 VR128:$src), x86_subreg_ss))>; } // Splat v2f64 / v2i64 @@ -3013,8 +3050,7 @@ def : Pat<(unpckh (v2i64 VR128:$src), (undef)), // Special unary SHUFPSrri case. def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))), (SHUFPSrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>, - Requires<[HasSSE1]>; + (SHUFFLE_get_shuf_imm VR128:$src3))>; let AddedComplexity = 5 in def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))), (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>, @@ -3060,13 +3096,13 @@ def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))), } let AddedComplexity = 10 in { def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))), - (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; + (UNPCKLPSrr VR128:$src, VR128:$src)>; def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))), - (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKLBWrr VR128:$src, VR128:$src)>; def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))), - (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKLWDrr VR128:$src, VR128:$src)>; def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))), - (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKLDQrr VR128:$src, VR128:$src)>; } // vector_shuffle v1, <undef>, <2, 2, 3, 3, ...> @@ -3080,13 
+3116,13 @@ def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))), } let AddedComplexity = 10 in { def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))), - (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; + (UNPCKHPSrr VR128:$src, VR128:$src)>; def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))), - (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKHBWrr VR128:$src, VR128:$src)>; def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))), - (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKHWDrr VR128:$src, VR128:$src)>; def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))), - (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + (PUNPCKHDQrr VR128:$src, VR128:$src)>; } let AddedComplexity = 20 in { @@ -3108,45 +3144,49 @@ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))), let AddedComplexity = 20 in { // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; + (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + (MOVLPDrm VR128:$src1, addr:$src2)>; } // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>; + (MOVLPSmr addr:$src1, VR128:$src2)>; def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVLPDmr addr:$src1, VR128:$src2)>; def : 
Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>; + (MOVLPSmr addr:$src1, VR128:$src2)>; def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVLPDmr addr:$src1, VR128:$src2)>; let AddedComplexity = 15 in { // Setting the lowest element in the vector. def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), - (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), x86_subreg_ss))>; def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), - (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), x86_subreg_sd))>; -// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd) +// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), - (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, x86_subreg_sd))>, + Requires<[HasSSE2]>; def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), - (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, x86_subreg_sd))>, + Requires<[HasSSE2]>; } // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but // fall back to this for SSE1) def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), (SHUFPSrri VR128:$src2, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>, Requires<[HasSSE1]>; + (SHUFFLE_get_shuf_imm VR128:$src3))>; // Set lowest element and zero upper elements. let AddedComplexity = 15 in @@ -3188,30 +3228,30 @@ def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))), // Use movaps / movups for SSE integer load / store (one byte shorter). 
def : Pat<(alignedloadv4i32 addr:$src), - (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>; + (MOVAPSrm addr:$src)>; def : Pat<(loadv4i32 addr:$src), - (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>; + (MOVUPSrm addr:$src)>; def : Pat<(alignedloadv2i64 addr:$src), - (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>; + (MOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), - (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>; + (MOVUPSrm addr:$src)>; def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVAPSmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVAPSmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVAPSmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVAPSmr addr:$dst, VR128:$src)>; def : Pat<(store (v2i64 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVUPSmr addr:$dst, VR128:$src)>; def : Pat<(store (v4i32 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVUPSmr addr:$dst, VR128:$src)>; def : Pat<(store (v8i16 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVUPSmr addr:$dst, VR128:$src)>; def : Pat<(store (v16i8 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVUPSmr addr:$dst, VR128:$src)>; //===----------------------------------------------------------------------===// // SSE4.1 Instructions @@ -3400,7 +3440,7 @@ let Constraints = "$src1 = $dst" in { (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, - (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize; + (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize; 
def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 91c0fbb..250634f 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -55,6 +55,11 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { if (!is64Bit) Data64bitsDirective = 0; // we can't emit a 64-bit unit + // Use ## as a comment string so that .s files generated by llvm can go + // through the GCC preprocessor without causing an error. This is needed + // because "clang foo.s" runs the C preprocessor, which is usually reserved + // for .S files on other systems. Perhaps this is because the file system + // wasn't always case preserving or something. CommentString = "##"; PCSymbol = "."; @@ -70,6 +75,8 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &Triple) { AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; + TextAlignFillValue = 0x90; + PrivateGlobalPrefix = ".L"; WeakRefDirective = "\t.weak\t"; PCSymbol = "."; @@ -94,4 +101,6 @@ MCSection *X86ELFMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const { X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; -}
\ No newline at end of file + + TextAlignFillValue = 0x90; +} diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 8524236..946d6b2 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -191,6 +191,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return &X86::GR16_NOREXRegClass; else if (A == &X86::GR16_ABCDRegClass) return &X86::GR16_ABCDRegClass; + } else if (B == &X86::FR32RegClass) { + return A; } break; case 2: @@ -207,6 +209,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || A == &X86::GR16_NOREXRegClass) return &X86::GR16_ABCDRegClass; + } else if (B == &X86::FR64RegClass) { + return A; } break; case 3: @@ -234,6 +238,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return &X86::GR32_NOREXRegClass; else if (A == &X86::GR32_ABCDRegClass) return &X86::GR64_ABCDRegClass; + } else if (B == &X86::VR128RegClass) { + return A; } break; case 4: @@ -446,8 +452,10 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); bool requiresRealignment = - RealignStack && (MFI->getMaxAlignment() > StackAlign); + RealignStack && ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttr(Attribute::StackAlignment)); // FIXME: Currently we don't support stack realignment for functions with // variable-sized allocas. diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 8fb5e92..e4bdb4e 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -35,7 +35,8 @@ namespace X86 { /// these indices must be kept in sync with the class indices in the /// X86RegisterInfo.td file. 
enum SubregIndex { - SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4 + SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4, + SUBREG_SS = 1, SUBREG_SD = 2, SUBREG_XMM = 3 }; } diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 1559bf7..ed2ce6c 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -158,22 +158,22 @@ let Namespace = "X86" in { def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; // YMM Registers, used by AVX instructions - def YMM0: Register<"ymm0">, DwarfRegNum<[17, 21, 21]>; - def YMM1: Register<"ymm1">, DwarfRegNum<[18, 22, 22]>; - def YMM2: Register<"ymm2">, DwarfRegNum<[19, 23, 23]>; - def YMM3: Register<"ymm3">, DwarfRegNum<[20, 24, 24]>; - def YMM4: Register<"ymm4">, DwarfRegNum<[21, 25, 25]>; - def YMM5: Register<"ymm5">, DwarfRegNum<[22, 26, 26]>; - def YMM6: Register<"ymm6">, DwarfRegNum<[23, 27, 27]>; - def YMM7: Register<"ymm7">, DwarfRegNum<[24, 28, 28]>; - def YMM8: Register<"ymm8">, DwarfRegNum<[25, -2, -2]>; - def YMM9: Register<"ymm9">, DwarfRegNum<[26, -2, -2]>; - def YMM10: Register<"ymm10">, DwarfRegNum<[27, -2, -2]>; - def YMM11: Register<"ymm11">, DwarfRegNum<[28, -2, -2]>; - def YMM12: Register<"ymm12">, DwarfRegNum<[29, -2, -2]>; - def YMM13: Register<"ymm13">, DwarfRegNum<[30, -2, -2]>; - def YMM14: Register<"ymm14">, DwarfRegNum<[31, -2, -2]>; - def YMM15: Register<"ymm15">, DwarfRegNum<[32, -2, -2]>; + def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>; + def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>; + def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>; + def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>; + def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>; + def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>; + def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, 
DwarfRegNum<[23, 27, 27]>; + def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>; + def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>; + def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>; + def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>; + def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>; + def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>; + def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>; + def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>; + def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>; // Floating point stack registers def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>; @@ -238,6 +238,10 @@ def x86_subreg_8bit_hi : PatLeaf<(i32 2)>; def x86_subreg_16bit : PatLeaf<(i32 3)>; def x86_subreg_32bit : PatLeaf<(i32 4)>; +def x86_subreg_ss : PatLeaf<(i32 1)>; +def x86_subreg_sd : PatLeaf<(i32 2)>; +def x86_subreg_xmm : PatLeaf<(i32 3)>; + def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI, R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W], [AL, CL, DL, BL, SPL, BPL, SIL, DIL, @@ -277,11 +281,31 @@ def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI, R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>; -def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, +def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15], + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>; + +def : SubRegSet<2, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15], + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>; + +def : SubRegSet<3, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, 
YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15], [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>; +def : SubRegSet<1, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15], + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>; + +def : SubRegSet<2, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15], + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>; + //===----------------------------------------------------------------------===// // Register Class Definitions... now that we have all of the pieces, define the // top-level register classes. The order specified in the register list is @@ -793,6 +817,7 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]> { + let SubRegClassList = [FR32, FR64]; let MethodProtos = [{ iterator allocation_order_end(const MachineFunction &MF) const; }]; @@ -811,7 +836,9 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, def VR256 : RegisterClass<"X86", [ v8i32, v4i64, v8f32, v4f64],256, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, - YMM12, YMM13, YMM14, YMM15]>; + YMM12, YMM13, YMM14, YMM15]> { + let SubRegClassList = [FR32, FR64, VR128]; +} // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 5e05c2f..594a470 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -20,9 +20,9 @@ namespace llvm { class GlobalValue; class TargetMachine; - + /// PICStyles - The X86 backend supports a number of different styles of PIC. 
-/// +/// namespace PICStyles { enum Style { StubPIC, // Used on i386-darwin in -fPIC mode. @@ -46,7 +46,7 @@ protected: /// PICStyle - Which PIC style to use /// PICStyles::Style PICStyle; - + /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or /// none supported. X86SSEEnum X86SSELevel; @@ -58,7 +58,7 @@ protected: /// HasCMov - True if this processor has conditional move instructions /// (generally pentium pro+). bool HasCMov; - + /// HasX86_64 - True if the processor supports X86-64 instructions. /// bool HasX86_64; @@ -78,8 +78,9 @@ protected: /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; - /// HasVectorUAMem - True if SIMD operations can have unaligned memory operands. - /// This may require setting a feature bit in the processor. + /// HasVectorUAMem - True if SIMD operations can have unaligned memory + /// operands. This may require setting a feature bit in the + /// processor. bool HasVectorUAMem; /// DarwinVers - Nonzero if this is a darwin platform: the numeric @@ -150,20 +151,20 @@ public: bool isTargetDarwin() const { return TargetType == isDarwin; } bool isTargetELF() const { return TargetType == isELF; } - + bool isTargetWindows() const { return TargetType == isWindows; } bool isTargetMingw() const { return TargetType == isMingw; } bool isTargetCygwin() const { return TargetType == isCygwin; } bool isTargetCygMing() const { return TargetType == isMingw || TargetType == isCygwin; } - + /// isTargetCOFF - Return true if this is any COFF/Windows target variant. 
bool isTargetCOFF() const { return TargetType == isMingw || TargetType == isCygwin || TargetType == isWindows; } - + bool isTargetWin64() const { return Is64Bit && (TargetType == isMingw || TargetType == isWindows); } @@ -196,11 +197,11 @@ public: bool isPICStyleStubAny() const { return PICStyle == PICStyles::StubDynamicNoPIC || PICStyle == PICStyles::StubPIC; } - + /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, /// 10 = Snow Leopard, etc. unsigned getDarwinVers() const { return DarwinVers; } - + /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 7802f98..56ddaf8 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -51,6 +51,12 @@ extern "C" void LLVMInitializeX86Target() { createX86_32MCCodeEmitter); TargetRegistry::RegisterCodeEmitter(TheX86_64Target, createX86_64MCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterAsmBackend(TheX86_32Target, + createX86_32AsmBackend); + TargetRegistry::RegisterAsmBackend(TheX86_64Target, + createX86_64AsmBackend); } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index d1ee3fc..29a0be5 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -18,38 +18,6 @@ using namespace llvm; using namespace dwarf; -const MCExpr *X8632_MachoTargetObjectFile:: -getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding) const { - // The mach-o version of this method defaults to returning a stub reference. 
- - if (Encoding & DW_EH_PE_indirect) { - MachineModuleInfoMachO &MachOMMI = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - - SmallString<128> Name; - Mang->getNameWithPrefix(Name, GV, true); - Name += "$non_lazy_ptr"; - - // Add information about the stub reference to MachOMMI so that the stub - // gets emitted by the asmprinter. - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name.str()); - MCSymbol *&StubSym = MachOMMI.getGVStubEntry(Sym); - if (StubSym == 0) { - Name.clear(); - Mang->getNameWithPrefix(Name, GV, false); - StubSym = getContext().GetOrCreateSymbol(Name.str()); - } - - return TargetLoweringObjectFile:: - getSymbolForDwarfReference(Sym, MMI, - Encoding & ~dwarf::DW_EH_PE_indirect); - } - - return TargetLoweringObjectFileMachO:: - getSymbolForDwarfGlobalReference(GV, Mang, MMI, Encoding); -} - const MCExpr *X8664_MachoTargetObjectFile:: getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding) const { @@ -148,35 +116,3 @@ unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const { return DW_EH_PE_absptr; } - -unsigned X8632_MachoTargetObjectFile::getPersonalityEncoding() const { - return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8632_MachoTargetObjectFile::getLSDAEncoding() const { - return DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8632_MachoTargetObjectFile::getFDEEncoding() const { - return DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8632_MachoTargetObjectFile::getTTypeEncoding() const { - return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8664_MachoTargetObjectFile::getPersonalityEncoding() const { - return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8664_MachoTargetObjectFile::getLSDAEncoding() const { - return DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned X8664_MachoTargetObjectFile::getFDEEncoding() const { - return DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} - -unsigned 
X8664_MachoTargetObjectFile::getTTypeEncoding() const { - return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; -} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 0fff194..0444417 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -17,20 +17,6 @@ namespace llvm { class X86TargetMachine; - /// X8632_MachoTargetObjectFile - This TLOF implementation is used for - /// Darwin/x86-32. - class X8632_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { - public: - - virtual const MCExpr * - getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding) const; - virtual unsigned getPersonalityEncoding() const; - virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; - virtual unsigned getTTypeEncoding() const; - }; - /// X8664_MachoTargetObjectFile - This TLOF implementation is used for /// Darwin/x86-64. class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { @@ -39,17 +25,13 @@ namespace llvm { virtual const MCExpr * getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding) const; - virtual unsigned getPersonalityEncoding() const; - virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; - virtual unsigned getTTypeEncoding() const; }; class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF { const X86TargetMachine &TM; public: X8632_ELFTargetObjectFile(const X86TargetMachine &tm) - :TM(tm) { }; + :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; virtual unsigned getFDEEncoding() const; @@ -60,7 +42,7 @@ namespace llvm { const X86TargetMachine &TM; public: X8664_ELFTargetObjectFile(const X86TargetMachine &tm) - :TM(tm) { }; + :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned 
getLSDAEncoding() const; virtual unsigned getFDEEncoding() const; diff --git a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp index d18f55d..82e23a1 100644 --- a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/TargetData.h" @@ -62,6 +63,11 @@ namespace { } void printMemOperand(const MachineInstr *MI, int opNum); + void printInlineJT(const MachineInstr *MI, int opNum, + const std::string &directive = ".jmptable"); + void printInlineJT32(const MachineInstr *MI, int opNum) { + printInlineJT(MI, opNum, ".jmptable32"); + } void printOperand(const MachineInstr *MI, int opNum); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode); @@ -257,6 +263,23 @@ void XCoreAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum) printOperand(MI, opNum+1); } +void XCoreAsmPrinter:: +printInlineJT(const MachineInstr *MI, int opNum, const std::string &directive) +{ + unsigned JTI = MI->getOperand(opNum).getIndex(); + const MachineFunction *MF = MI->getParent()->getParent(); + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + O << "\t" << directive << " "; + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + if (i > 0) + O << ","; + O << *MBB->getSymbol(OutContext); + } +} + void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum) { const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp 
b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 383fd91..b1ab132 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -65,8 +65,6 @@ namespace { bool SelectADDRcpii(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset); - virtual void InstructionSelect(); - virtual const char *getPassName() const { return "XCore DAG->DAG Pattern Instruction Selection"; } @@ -147,15 +145,6 @@ bool XCoreDAGToDAGISel::SelectADDRcpii(SDNode *Op, SDValue Addr, return false; } -/// InstructionSelect - This callback is invoked by -/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. -void XCoreDAGToDAGISel::InstructionSelect() { - // Select target instructions for the DAG. - SelectRoot(*CurDAG); - - CurDAG->RemoveDeadNodes(); -} - SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { DebugLoc dl = N->getDebugLoc(); EVT NVT = N->getValueType(0); @@ -164,7 +153,11 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { default: break; case ISD::Constant: { if (Predicate_immMskBitp(N)) { - SDValue MskSize = Transform_msksize_xform(N); + // Transformation function: get the size of a mask + int64_t MaskVal = cast<ConstantSDNode>(N)->getZExtValue(); + assert(isMask_32(MaskVal)); + // Look for the first non-zero bit + SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(MaskVal)); return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, MVT::i32, MskSize); } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 57fd43b..e6515d8 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" @@ -53,6 +54,8 @@ getTargetNodeName(unsigned Opcode) const 
case XCoreISD::RETSP : return "XCoreISD::RETSP"; case XCoreISD::LADD : return "XCoreISD::LADD"; case XCoreISD::LSUB : return "XCoreISD::LSUB"; + case XCoreISD::BR_JT : return "XCoreISD::BR_JT"; + case XCoreISD::BR_JT32 : return "XCoreISD::BR_JT32"; default : return NULL; } } @@ -106,9 +109,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) setOperationAction(ISD::TRAP, MVT::Other, Legal); - // Expand jump tables for now - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::JumpTable, MVT::i32, Custom); + // Jump tables. + setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32 , Custom); @@ -157,7 +159,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); - case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -315,14 +317,27 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) } SDValue XCoreTargetLowering:: -LowerJumpTable(SDValue Op, SelectionDAG &DAG) +LowerBR_JT(SDValue Op, SelectionDAG &DAG) { - // FIXME there isn't really debug info here + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - EVT PtrVT = Op.getValueType(); - JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); - return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, JTI); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + unsigned JTI = JT->getIndex(); + MachineFunction &MF = DAG.getMachineFunction(); + const 
MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); + + unsigned NumEntries = MJTI->getJumpTables()[JTI].MBBs.size(); + if (NumEntries <= 32) { + return DAG.getNode(XCoreISD::BR_JT, dl, MVT::Other, Chain, TargetJT, Index); + } + assert((NumEntries >> 31) == 0); + SDValue ScaledIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, + DAG.getConstant(1, MVT::i32)); + return DAG.getNode(XCoreISD::BR_JT32, dl, MVT::Other, Chain, TargetJT, + ScaledIndex); } static bool @@ -458,7 +473,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_load", getPointerTy()), - Args, DAG, dl, DAG.GetOrdering(Chain.getNode())); + Args, DAG, dl); SDValue Ops[] = { CallResult.first, CallResult.second }; @@ -521,7 +536,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_store", getPointerTy()), - Args, DAG, dl, DAG.GetOrdering(Chain.getNode())); + Args, DAG, dl); return CallResult.second; } @@ -1146,10 +1161,8 @@ static inline bool isImmUs4(int64_t val) bool XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { - // Be conservative with void - // FIXME: Can we be more aggressive? if (Ty->getTypeID() == Type::VoidTyID) - return false; + return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs); const TargetData *TD = TM.getTargetData(); unsigned Size = TD->getTypeAllocSize(Ty); diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 5095f36..0c638af 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -52,7 +52,13 @@ namespace llvm { LADD, // Corresponds to LSUB instruction - LSUB + LSUB, + + // Jumptable branch. + BR_JT, + + // Jumptable branch using long branches for each entry. 
+ BR_JT32 }; } @@ -122,7 +128,7 @@ namespace llvm { SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG); SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG); SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG); - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG); + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG); SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG); SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG); SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG); diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index 5a54844..722e747 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -145,6 +145,11 @@ static inline bool IsCondBranch(unsigned BrOpc) { return IsBRF(BrOpc) || IsBRT(BrOpc); } +static inline bool IsBR_JT(unsigned BrOpc) { + return BrOpc == XCore::BR_JT + || BrOpc == XCore::BR_JT32; +} + /// GetCondFromBranchOpc - Return the XCore CC that matches /// the correspondent Branch instruction opcode. static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) @@ -271,6 +276,14 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, return false; } + // Likewise if it ends with a branch table followed by an unconditional branch. + if (IsBR_JT(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + // Otherwise, can't handle this. 
return true; } diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 10dc18c..46805d5 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -34,6 +34,15 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink, def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTNone, [SDNPHasChain, SDNPOptInFlag]>; +def SDT_XCoreBR_JT : SDTypeProfile<0, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; + +def XCoreBR_JT : SDNode<"XCoreISD::BR_JT", SDT_XCoreBR_JT, + [SDNPHasChain]>; + +def XCoreBR_JT32 : SDNode<"XCoreISD::BR_JT32", SDT_XCoreBR_JT, + [SDNPHasChain]>; + def SDT_XCoreAddress : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -185,6 +194,15 @@ def MEMii : Operand<i32> { let MIOperandInfo = (ops i32imm, i32imm); } +// Jump tables. +def InlineJT : Operand<i32> { + let PrintMethod = "printInlineJT"; +} + +def InlineJT32 : Operand<i32> { + let PrintMethod = "printInlineJT32"; +} + //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// @@ -624,7 +642,7 @@ defm RETSP : FU6_LU6<"retsp", XCoreRetsp>; // TODO extdp, kentsp, krestsp, blat, setsr // clrsr, getsr, kalli -let isBranch = 1, isTerminator = 1 in { +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { def BRBU_u6 : _FU6< (outs), (ins brtarget:$target), @@ -756,24 +774,34 @@ def CLZ_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), // One operand short // TODO edu, eeu, waitet, waitef, freer, tstart, msync, mjoin, syncr, clrtp -// bru, setdp, setcp, setv, setev, kcall +// setdp, setcp, setv, setev, kcall // dgetreg -let isBranch=1, isIndirectBranch=1, isTerminator=1 in +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), "bau $addr", [(brind GRRegs:$addr)]>; +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in +def 
BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i), + "bru $i\n$t", + [(XCoreBR_JT tjumptable:$t, GRRegs:$i)]>; + +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in +def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i), + "bru $i\n$t", + [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>; + let Defs=[SP], neverHasSideEffects=1 in def SETSP_1r : _F1R<(outs), (ins GRRegs:$src), "set sp, $src", []>; -let isBarrier = 1, hasCtrlDep = 1 in +let hasCtrlDep = 1 in def ECALLT_1r : _F1R<(outs), (ins GRRegs:$src), "ecallt $src", []>; -let isBarrier = 1, hasCtrlDep = 1 in +let hasCtrlDep = 1 in def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src), "ecallf $src", []>; |