author     dim <dim@FreeBSD.org>  2011-02-26 22:03:50 +0000
committer  dim <dim@FreeBSD.org>  2011-02-26 22:03:50 +0000
commit     c80ac9d286b8fcc6d1ee5d76048134cf80aa9edc
tree       ddf53b8bd9235bcb0b8aae16c5e22310dcdad665 /lib/Target
parent     cbb70ce070d220642b038ea101d9c0f9fbf860d6
Vendor import of llvm trunk r126547:
http://llvm.org/svn/llvm-project/llvm/trunk@126547
Diffstat (limited to 'lib/Target')
50 files changed, 732 insertions, 405 deletions
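The most invasive ARM change in this import widens the instruction-domain field in TSFlags from two bits to three so that a Cortex-A8-only VFP/NEON overlap domain (DomainNEONA8) can be represented, and the hazard checks switch from equality tests to bitmask tests. A minimal standalone sketch of that encoding follows; the constants mirror the ARMBaseInstrInfo.h hunk below, but everything around them (the helper, main) is illustrative only and not part of the import:

#include <cassert>
#include <cstdint>

// Mirrors the widened encoding in the ARMBaseInstrInfo.h hunk below: the
// domain field grows from two bits to three so DomainNEONA8 fits.
namespace ARMII {
  enum {
    DomainShift   = 18,
    DomainMask    = 7 << DomainShift,   // was 3 << DomainShift
    DomainGeneral = 0 << DomainShift,
    DomainVFP     = 1 << DomainShift,
    DomainNEON    = 2 << DomainShift,
    DomainNEONA8  = 4 << DomainShift,   // new in this import
  };
}

// Domains now behave as a bitmask, not an enumeration: an instruction can
// belong to VFP and NEON at once, which is why the rewritten hazard checks
// below test with '&' instead of '=='.
static bool isVFPOrNEON(uint64_t TSFlags) {
  uint64_t Domain = TSFlags & ARMII::DomainMask;
  return (Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON);
}

int main() {
  uint64_t Flags = ARMII::DomainVFP | ARMII::DomainNEON; // == VFPNeonDomain
  assert(isVFPOrNEON(Flags));
  assert(!isVFPOrNEON(ARMII::DomainGeneral));
  return 0;
}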
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 1fb8872..7e2183d 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -155,10 +155,11 @@ namespace ARMII {
     //===------------------------------------------------------------------===//
     // Code domain.
     DomainShift   = 18,
-    DomainMask    = 3 << DomainShift,
+    DomainMask    = 7 << DomainShift,
     DomainGeneral = 0 << DomainShift,
     DomainVFP     = 1 << DomainShift,
     DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,

     //===------------------------------------------------------------------===//
     // Field shifts - such shifts are used to set field while generating
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 9f29530..26f48b3 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -172,6 +172,7 @@ class ARMFastISel : public FastISel {
     unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT);
     unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg);
     unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg);
+    unsigned ARMSelectCallOp(const GlobalValue *GV);

     // Call handling routines.
   private:
@@ -1633,6 +1634,25 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
   return true;
 }

+unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) {
+
+  // Depend our opcode for thumb on whether or not we're targeting an
+  // externally callable function. For libcalls we'll just pass a NULL GV
+  // in here.
+  bool isExternal = false;
+  if (!GV || GV->hasExternalLinkage()) isExternal = true;
+
+  // Darwin needs the r9 versions of the opcodes.
+  bool isDarwin = Subtarget->isTargetDarwin();
+  if (isThumb && isExternal) {
+    return isDarwin ? ARM::tBLXi_r9 : ARM::tBLXi;
+  } else if (isThumb) {
+    return isDarwin ? ARM::tBLr9 : ARM::tBL;
+  } else {
+    return isDarwin ? ARM::BLr9 : ARM::BL;
+  }
+}
+
 // A quick function that will emit a call for a named libcall in F with the
 // vector of passed arguments for the Instruction in I. We can assume that we
 // can emit a call for any libcall we can produce. This is an abridged version
@@ -1694,20 +1714,17 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
-  unsigned CallOpc;
-  if(isThumb) {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+  unsigned CallOpc = ARMSelectCallOp(NULL);
+  if(isThumb)
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc)))
           .addExternalSymbol(TLI.getLibcallName(Call));
-  } else {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+  else
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc))
           .addExternalSymbol(TLI.getLibcallName(Call)));
-  }

   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -1813,21 +1830,18 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
-  unsigned CallOpc;
+  unsigned CallOpc = ARMSelectCallOp(GV);
   // Explicitly adding the predicate here.
-  if(isThumb) {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+  if(isThumb)
+    // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc)))
           .addGlobalAddress(GV, 0, 0);
-  } else {
-    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+  else
     // Explicitly adding the predicate here.
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                  TII.get(CallOpc))
           .addGlobalAddress(GV, 0, 0));
-  }

   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index f42c6db..68c33f0 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -215,7 +215,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);

   // Move past area 3.
-  if (DPRCSSize > 0) MBBI++;
+  if (DPRCSSize > 0) {
+    MBBI++;
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
+      MBBI++;
+  }

   NumBytes = DPRCSOffset;
   if (NumBytes) {
@@ -370,7 +376,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);

     // Increment past our save areas.
-    if (AFI->getDPRCalleeSavedAreaSize()) MBBI++;
+    if (AFI->getDPRCalleeSavedAreaSize()) {
+      MBBI++;
+      // Since vpop register list cannot have gaps, there may be multiple vpop
+      // instructions in the epilogue.
+      while (MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+        MBBI++;
+    }
     if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
     if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
   }
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 676b01e..e97ce50 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -21,17 +21,14 @@ static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
   // FIXME: Detect integer instructions properly.
   const TargetInstrDesc &TID = MI->getDesc();
   unsigned Domain = TID.TSFlags & ARMII::DomainMask;
-  if (Domain == ARMII::DomainVFP) {
-    unsigned Opcode = MI->getOpcode();
-    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
-        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
-      return false;
-  } else if (Domain == ARMII::DomainNEON) {
-    if (MI->getDesc().mayStore() || MI->getDesc().mayLoad())
-      return false;
-  } else
+  if (TID.mayStore())
     return false;
-  return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  unsigned Opcode = TID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  return false;
 }

 ScheduleHazardRecognizer::HazardType
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a506cff..f0d5a7d 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -126,6 +126,7 @@ public:
   bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+  bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);

   bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
@@ -886,6 +887,20 @@ bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
   return true;
 }

+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset) {
+  LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+  ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+  if (AM != ISD::POST_INC)
+    return false;
+  Offset = N;
+  if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+    if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits())
+      Offset = CurDAG->getRegister(0, MVT::i32);
+  }
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
                                        SDValue &Offset, SDValue &Label) {
   if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 1835ec0..ab9f9e1 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2236,7 +2236,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
     RC = ARM::GPRRegisterClass;

   // Transform the arguments stored in physical registers into virtual ones.
-  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

   SDValue ArgValue2;
@@ -2250,7 +2250,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                             MachinePointerInfo::getFixedStack(FI),
                             false, false, 0);
   } else {
-    Reg = MF.addLiveIn(NextVA.getLocReg(), RC, dl);
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
   }

@@ -2331,7 +2331,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

       // Transform the arguments in physical registers into virtual ones.
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
     }

@@ -2408,7 +2408,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         else
           RC = ARM::GPRRegisterClass;

-        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC, dl);
+        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
         SDValue Store =
           DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2838,8 +2838,51 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
-  bool F2IisFast = Subtarget->isCortexA9() ||
-    Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+    Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+  if (UseNEON) {
+    // Use VBSL to copy the sign bit.
+    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+                               DAG.getTargetConstant(EncodedVal, MVT::i32));
+    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+    if (VT == MVT::f64)
+      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+                         DAG.getConstant(32, MVT::i32));
+    else /*if (VT == MVT::f32)*/
+      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+    if (SrcVT == MVT::f32) {
+      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+      if (VT == MVT::f64)
+        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+                           DAG.getConstant(32, MVT::i32));
+    }
+    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+                                            MVT::i32);
+    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+    if (SrcVT == MVT::f32) {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+                        DAG.getConstant(0, MVT::i32));
+    } else {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+    }
+
+    return Res;
+  }

   // Bitcast operand 1 to i32.
   if (SrcVT == MVT::f64)
@@ -2847,37 +2890,24 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
                        &Tmp1, 1).getValue(1);
   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

-  // If float to int conversion isn't going to be super expensive, then simply
-  // or in the signbit.
-  if (F2IisFast) {
-    SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
-    SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
-    Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
-    if (VT == MVT::f32) {
-      Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
-                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
-      return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
-                         DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
-    }
-
-    // f64: Or the high part with signbit and then combine two parts.
-    Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
-                       &Tmp0, 1);
-    SDValue Lo = Tmp0.getValue(0);
-    SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
-    Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
-    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+  // Or in the signbit with integer operations.
+  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+  if (VT == MVT::f32) {
+    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
   }

-  // Remove the signbit of operand 0.
-  Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-
-  // If operand 1 signbit is one, then negate operand 0.
-  SDValue ARMcc;
-  SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32),
-                          ISD::SETLT, ARMcc, DAG, dl);
-  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp);
+  // f64: Or the high part with signbit and then combine two parts.
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     &Tmp0, 1);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
 }

 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
@@ -2897,7 +2927,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
   }

   // Return LR, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32), dl);
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
 }

diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 765cba4..359ac45 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -127,13 +127,14 @@ def IndexModePost : IndexMode<2>;
 def IndexModeUpd  : IndexMode<3>;

 // Instruction execution domain.
-class Domain<bits<2> val> {
-  bits<2> Value = val;
+class Domain<bits<3> val> {
+  bits<3> Value = val;
 }
 def GenericDomain   : Domain<0>;
 def VFPDomain       : Domain<1>; // Instructions in VFP domain only
 def NeonDomain      : Domain<2>; // Instructions in Neon domain only
 def VFPNeonDomain   : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8

 //===----------------------------------------------------------------------===//
 // ARM special operands.
@@ -249,7 +250,7 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im,
   let TSFlags{15-10} = Form;
   let TSFlags{16}    = isUnaryDataProc;
   let TSFlags{17}    = canXformTo16Bit;
-  let TSFlags{19-18} = D.Value;
+  let TSFlags{20-18} = D.Value;

   let Constraints = cstr;
   let Itinerary = itin;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c827ce3d..6e3fe2e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -561,7 +561,9 @@ def addrmode6 : Operand<i32>,
   let EncoderMethod = "getAddrMode6AddressOpValue";
 }

-def am6offset : Operand<i32> {
+def am6offset : Operand<i32>,
+                ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+                               [], [SDNPWantRoot]> {
   let PrintMethod = "printAddrMode6OffsetOperand";
   let MIOperandInfo = (ops GPR);
   let EncoderMethod = "getAddrMode6OffsetOpValue";
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 1e2e550..dc3d63e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1402,31 +1402,42 @@ def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
 def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
           (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;

-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-
 // ...with address register writeback:
-class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag StoreOp, SDNode ExtractOp>
   : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
             (ins addrmode6:$Rn, am6offset:$Rm,
              DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
             "\\{$Vd[$lane]\\}, $Rn$Rm",
-            "$Rn.addr = $wb", []>;
+            "$Rn.addr = $wb",
+            [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+                                    addrmode6:$Rn, am6offset:$Rm))]>;
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNWBPseudo<IIC_VST1lnu> {
+  let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                                        addrmode6:$addr, am6offset:$offset))];
+}

-def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8"> {
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+                             NEONvgetlaneu> {
   let Inst{7-5} = lane{2-0};
 }
-def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16"> {
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+                             NEONvgetlaneu> {
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{5};
 }
-def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32"> {
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+                             extractelt> {
   let Inst{7}   = lane{0};
   let Inst{5-4} = Rn{5-4};
 }
-def VST1LNq8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST1lnu>;
-def VST1LNq16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
-def VST1LNq32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
+def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {

 // VST2LN : Vector Store (single 2-element structure from one lane)
 class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 920c5c9..2990283 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -197,9 +197,9 @@ def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    IIC_fpALU32,
"vadd", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } def VSUBD : ADbI<0b11100, 0b11, 1, 0, @@ -211,9 +211,9 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } def VDIVD : ADbI<0b11101, 0b00, 0, 0, @@ -235,9 +235,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } def VNMULD : ADbI<0b11100, 0b10, 1, 0, @@ -249,9 +249,9 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } // Match reassociated forms only if not sign dependent rounding. @@ -271,9 +271,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } // FIXME: Verify encoding after integrated assembler is working. @@ -286,9 +286,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", [/* For disassembly only; pattern left blank */]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } } // Defs = [FPSCR] @@ -305,9 +305,9 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm", [(set SPR:$Sd, (fabs SPR:$Sm))]> { - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. - let D = VFPNeonDomain; + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; } let Defs = [FPSCR] in { @@ -326,9 +326,9 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, let Inst{3-0} = 0b0000; let Inst{5} = 0; - // Some single precision VFP instructions may be executed on both NEON and VFP - // pipelines. 
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 // FIXME: Verify encoding after integrated assembler is working.
@@ -347,9 +347,9 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;

-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 } // Defs = [FPSCR]

@@ -423,9 +423,9 @@ def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
                    (outs SPR:$Sd), (ins SPR:$Sm),
                    IIC_fpUNA32,
                    "vneg", ".f32\t$Sd, $Sm",
                    [(set SPR:$Sd, (fneg SPR:$Sm))]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
@@ -598,9 +598,9 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
                                 [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> {
   let Inst{7} = 1; // s32

-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
@@ -616,9 +616,9 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
                                 [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> {
   let Inst{7} = 0; // u32

-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 // FP -> Int:
@@ -671,9 +671,9 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
                                  [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit

-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
@@ -689,9 +689,9 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
                                 [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit

-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
@@ -743,36 +743,36 @@ def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0,
@@ -801,36 +801,36 @@ def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                        IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits",
                        [/* For disassembly only; pattern left blank */]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0,
@@ -874,9 +874,9 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                                    SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
               Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -901,9 +901,9 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                                              SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
               Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -928,9 +928,9 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                                                  SPR:$Sdin))]>,
                RegConstraint<"$Sdin = $Sd">,
                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
@@ -954,9 +954,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm),
                                         SPR:$Sdin))]>,
                RegConstraint<"$Sdin = $Sd">,
                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }

 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
@@ -995,9 +995,9 @@ def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0,
                    IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
                    [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
                RegConstraint<"$Sn = $Sd"> {
-  // Some single precision VFP instructions may be executed on both NEON and VFP
-  // pipelines.
-  let D = VFPNeonDomain;
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
 }
 } // neverHasSideEffects

diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 0bd740c..1465984 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -171,7 +171,9 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,

   // Materializable GVs (in JIT lazy compilation mode) do not require an extra
   // load from stub.
-  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;

   if (!isTargetDarwin()) {
     // Extra load is needed for all externally visible.
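The ARMHazardRecognizer.cpp hunk above and the MLxExpansionPass.cpp hunk just below apply the same rewrite to two copies of the RAW-hazard predicate. A condensed, self-contained model of the new shape follows; InstrModel and its fields are hypothetical stand-ins (the real code uses MachineInstr/TargetInstrDesc), and only the control flow is taken from the diff:

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the instruction properties the predicate needs.
struct InstrModel {
  bool mayStore;
  unsigned opcode;
  uint64_t domainBits;          // TSFlags & ARMII::DomainMask
  std::vector<unsigned> reads;  // registers this instruction reads
};

enum { DomainVFP = 1, DomainNEON = 2, VMOVRS = 100, VMOVRRD = 101 };

// Condensed form of the rewritten hasRAWHazard: stores never raise the
// hazard, the two VFP-to-core moves are exempted, and only VFP/NEON-domain
// instructions that actually read the defined register count.
static bool hasRAWHazard(unsigned DefReg, const InstrModel &MI) {
  if (MI.mayStore)
    return false;
  if (MI.opcode == VMOVRS || MI.opcode == VMOVRRD)
    return false;
  if ((MI.domainBits & DomainVFP) || (MI.domainBits & DomainNEON))
    return std::find(MI.reads.begin(), MI.reads.end(), DefReg) !=
           MI.reads.end();
  return false;
}

int main() {
  InstrModel Use{false, 42, DomainNEON, {7}};
  return hasRAWHazard(7, Use) ? 0 : 1;  // reads reg 7 -> hazard detected
}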
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index f9e86eb..9a27e2f 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -132,22 +132,16 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
 }

 bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
-  const TargetInstrDesc &TID = MI->getDesc();
   // FIXME: Detect integer instructions properly.
+  const TargetInstrDesc &TID = MI->getDesc();
   unsigned Domain = TID.TSFlags & ARMII::DomainMask;
-  if (Domain == ARMII::DomainVFP) {
-    unsigned Opcode = TID.getOpcode();
-    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
-        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
-      return false;
-  } else if (Domain == ARMII::DomainNEON) {
-    if (TID.mayStore() || TID.mayLoad())
-      return false;
-  } else {
+  if (TID.mayStore())
     return false;
-  }
-
-  return MI->readsRegister(Reg, TRI);
+  unsigned Opcode = TID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
 }
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
index 97e54bf..965665c 100644
--- a/lib/Target/ARM/NEONMoveFix.cpp
+++ b/lib/Target/ARM/NEONMoveFix.cpp
@@ -35,6 +35,7 @@ namespace {
   private:
     const TargetRegisterInfo *TRI;
    const ARMBaseInstrInfo *TII;
+    bool isA8;

     typedef DenseMap<unsigned, const MachineInstr*> RegMap;

@@ -43,6 +44,11 @@ namespace {
   char NEONMoveFixPass::ID = 0;
 }

+static bool inNEONDomain(unsigned Domain, bool isA8) {
+  return (Domain & ARMII::DomainNEON) ||
+         (isA8 && (Domain & ARMII::DomainNEONA8));
+}
+
 bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
   RegMap Defs;
   bool Modified = false;
@@ -70,7 +76,7 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
       Domain = ARMII::DomainNEON;
     }

-    if (Domain & ARMII::DomainNEON) {
+    if (inNEONDomain(Domain, isA8)) {
       // Convert VMOVD to VMOVDneon
       unsigned DestReg = MI->getOperand(0).getReg();

@@ -123,6 +129,7 @@ bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) {

   TRI = TM.getRegisterInfo();
   TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  isA8 = TM.getSubtarget<ARMSubtarget>().isCortexA8();

   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 2f67257..9b1073b 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -95,6 +95,12 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
 bool
 Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI) const {
+  while (MBBI->isDebugValue()) {
+    ++MBBI;
+    if (MBBI == MBB.end())
+      return false;
+  }
+
   unsigned PredReg = 0;
   return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
 }
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index 9137d65..c4f43ab 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -48,7 +48,6 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
   : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
   // Set up the TargetLowering object.
   //I am having problems with shr n i8 1
-  setShiftAmountType(MVT::i64);

   setBooleanContents(ZeroOrOneBooleanContent);

   addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index b429e9f..cb98f92 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -31,25 +31,25 @@ namespace llvm {

       /// GPRelHi/GPRelLo - These represent the high and low 16-bit
       /// parts of a global address respectively.
-      GPRelHi, GPRelLo,
+      GPRelHi, GPRelLo,

       /// RetLit - Literal Relocation of a Global
       RelLit,

       /// GlobalRetAddr - used to restore the return address
       GlobalRetAddr,
-
+
       /// CALL - Normal call.
       CALL,

       /// DIVCALL - used for special library calls for div and rem
       DivCall,
-
+
       /// return flag operand
       RET_FLAG,

       /// CHAIN = COND_BRANCH CHAIN, OPC, (G|F)PRC, DESTBB [, INFLAG] - This
-      /// corresponds to the COND_BRANCH pseudo instruction.
+      /// corresponds to the COND_BRANCH pseudo instruction.
       /// *PRC is the input register to compare to zero,
       /// OPC is the branch opcode to use (e.g. Alpha::BEQ),
       /// DESTBB is the destination block to branch to, and INFLAG is
@@ -62,7 +62,9 @@ namespace llvm {

   class AlphaTargetLowering : public TargetLowering {
   public:
     explicit AlphaTargetLowering(TargetMachine &TM);
-
+
+    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i64; }
+
     /// getSetCCResultType - Get the SETCC result ValueType
     virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;

@@ -92,7 +94,7 @@ namespace llvm {
       ConstraintWeight getSingleConstraintMatchWeight(
         AsmOperandInfo &info, const char *constraint) const;

-      std::vector<unsigned>
+      std::vector<unsigned>
         getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                           EVT VT) const;
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index dd27d0a..7c80eec 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -41,7 +41,6 @@ using namespace llvm;

 BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
   : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
-  setShiftAmountType(MVT::i16);
   setBooleanContents(ZeroOrOneBooleanContent);
   setStackPointerRegisterToSaveRestore(BF::SP);
   setIntDivIsCheap(false);
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
index 15a745f..102c830 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.h
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -32,6 +32,7 @@ namespace llvm {
   class BlackfinTargetLowering : public TargetLowering {
   public:
     BlackfinTargetLowering(TargetMachine &TM);
+    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i16; }
     virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
     virtual void ReplaceNodeResults(SDNode *N,
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index e6511d0..743a4d7 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -435,7 +435,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)

   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

-  setShiftAmountType(MVT::i32);
   setBooleanContents(ZeroOrNegativeOneBooleanContent);

   setStackPointerRegisterToSaveRestore(SPU::R1);
@@ -1219,7 +1218,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(
         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
-      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass, dl);
+      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass);
       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
                                    false, false, 0);
@@ -2190,7 +2189,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
 {
   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
   DebugLoc dl = Op.getDebugLoc();

-  EVT ShiftVT = TLI.getShiftAmountTy();
+  EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());

   assert(Op.getValueType() == MVT::i8);
   switch (Opc) {
@@ -3112,7 +3111,7 @@ SPUTargetLowering::getSingleConstraintMatchWeight(
   switch (*constraint) {
   default:
     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
-    break;
+    break;
   //FIXME: Seems like the supported constraint letters were just copied
   // from PPC, as the following doesn't correspond to the GCC docs.
   // I'm leaving it so until someone adds the corresponding lowering support.
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index 95d44af..dd48d7b 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -109,6 +109,8 @@ namespace llvm {
     /// getSetCCResultType - Return the ValueType for ISD::SETCC
     virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;

+    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
     //! Custom lowering hooks
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;

@@ -179,9 +181,9 @@ namespace llvm {

     virtual bool isLegalICmpImmediate(int64_t Imm) const;

-    virtual bool isLegalAddressingMode(const AddrMode &AM,
+    virtual bool isLegalAddressingMode(const AddrMode &AM,
                                        const Type *Ty) const;
-
+
     /// After allocating this many registers, the allocator should feel
     /// register pressure. The value is a somewhat random guess, based on the
     /// number of non callee saved registers in the C calling convention.
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 2f40bfc..f39826b 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -907,7 +907,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,

       // Transform the arguments stored on
       // physical registers into virtual ones
-      unsigned Reg = MF.addLiveIn(ArgRegEnd, RC, dl);
+      unsigned Reg = MF.addLiveIn(ArgRegEnd, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

       // If this is an 8 or 16-bit value, it has been passed promoted
@@ -973,7 +973,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
       for (; Start <= End; ++Start, ++StackLoc) {
         unsigned Reg = MBlazeRegisterInfo::getRegisterFromNumbering(Start);
-        unsigned LiveReg = MF.addLiveIn(Reg, RC, dl);
+        unsigned LiveReg = MF.addLiveIn(Reg, RC);
         SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32);

         int FI = MFI->CreateFixedObject(4, 0, true);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 30ef4f5..a95d59c 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -77,10 +77,6 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
   // Division is expensive
   setIntDivIsCheap(false);

-  // Even if we have only 1 bit shift here, we can perform
-  // shifts of the whole bitwidth 1 bit per step.
-  setShiftAmountType(MVT::i8);
-
   setStackPointerRegisterToSaveRestore(MSP430::SPW);
   setBooleanContents(ZeroOrOneBooleanContent);
   setSchedulingPreference(Sched::Latency);
@@ -330,7 +326,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
       // Arguments passed in registers
       EVT RegVT = VA.getLocVT();
       switch (RegVT.getSimpleVT().SimpleTy) {
-      default:
+      default:
         {
 #ifndef NDEBUG
           errs() << "LowerFormalArguments Unhandled argument type: "
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 673c543..19c9eac 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -73,6 +73,8 @@ namespace llvm {
   public:
     explicit MSP430TargetLowering(MSP430TargetMachine &TM);

+    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+
     /// LowerOperation - Provide custom lowering hooks for some operations.
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 8f623b8..70d00e4 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -362,7 +362,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
   }

-  setShiftAmountType(MVT::i32);
   setBooleanContents(ZeroOrOneBooleanContent);

   if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
@@ -1597,7 +1596,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
     }

     // Transform the arguments stored in physical registers into virtual ones.
-    unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
+    unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
     SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT);

     InVals.push_back(ArgValue);
@@ -1689,7 +1688,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
       // Get an existing live-in vreg, or add a new one.
       unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
       if (!VReg)
-        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass, dl);
+        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
       SDValue Store =
         DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -1708,7 +1707,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
       // Get an existing live-in vreg, or add a new one.
       unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
       if (!VReg)
-        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass, dl);
+        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
       SDValue Store =
         DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -1872,7 +1871,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
         InVals.push_back(FIN);
         if (ObjSize==1 || ObjSize==2) {
           if (GPR_idx != Num_GPR_Regs) {
-            unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass, dl);
+            unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
             SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
             SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
                                               MachinePointerInfo(),
@@ -1891,7 +1890,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
           // to memory.  ArgVal will be address of the beginning of
           // the object.
           if (GPR_idx != Num_GPR_Regs) {
-            unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass, dl);
+            unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
             int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
             SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
@@ -1914,7 +1913,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
     case MVT::i32:
       if (!isPPC64) {
         if (GPR_idx != Num_GPR_Regs) {
-          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass, dl);
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
           ++GPR_idx;
         } else {
@@ -1928,7 +1927,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
       // FALLTHROUGH
     case MVT::i64:  // PPC64
       if (GPR_idx != Num_GPR_Regs) {
-        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass, dl);
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

         if (ObjectVT == MVT::i32) {
@@ -1966,9 +1965,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
         unsigned VReg;
         if (ObjectVT == MVT::f32)
-          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass, dl);
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
         else
-          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass, dl);
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);

         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++FPR_idx;
@@ -1986,7 +1985,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
       // Note that vector arguments in registers don't reserve stack space,
       // except in varargs functions.
       if (VR_idx != Num_VR_Regs) {
-        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass, dl);
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         if (isVarArg) {
           while ((ArgOffset % 16) != 0) {
@@ -2064,9 +2063,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
         unsigned VReg;
         if (isPPC64)
-          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass, dl);
+          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
         else
-          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass, dl);
+          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
         SDValue Store =
           DAG.getStore(Val.getValue(1), dl, Val, FIN,
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 80cab75..33daae9 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -29,36 +29,36 @@ namespace llvm {
       /// FSEL - Traditional three-operand fsel node.
       ///
       FSEL,
-
+
      /// FCFID - The FCFID instruction, taking an f64 operand and producing
       /// and f64 value containing the FP representation of the integer that
       /// was temporarily in the f64 operand.
       FCFID,
-
-      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
       /// operand, producing an f64 value containing the integer representation
       /// of that FP value.
       FCTIDZ, FCTIWZ,
-
+
       /// STFIWX - The STFIWX instruction.  The first operand is an input token
       /// chain, then an f64 value to store, then an address to store it to.
       STFIWX,
-
+
       // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
       // three v4f32 operands and producing a v4f32 result.
       VMADDFP, VNMSUBFP,
-
+
       /// VPERM - The PPC VPERM Instruction.
       ///
       VPERM,
-
+
       /// Hi/Lo - These represent the high and low 16-bit parts of a global
       /// address respectively.  These nodes have two operands, the first of
       /// which must be a TargetGlobalAddress, and the second of which must be a
       /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
       /// though these are usually folded into other nodes.
       Hi, Lo,
-
+
       TOC_ENTRY,

       /// The following three target-specific nodes are used for calls through
@@ -80,37 +80,37 @@ namespace llvm {
       /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
       /// compute an allocation on the stack.
       DYNALLOC,
-
+
       /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
       /// at function entry, used for PIC code.
       GlobalBaseReg,
-
+
       /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
       /// shift amounts.  These nodes are generated by the multi-precision shift
       /// code.
       SRL, SRA, SHL,
-
+
       /// EXTSW_32 - This is the EXTSW instruction for use with "32-bit"
       /// registers.
       EXTSW_32,

       /// CALL - A direct function call.
       CALL_Darwin, CALL_SVR4,
-
+
       /// NOP - Special NOP which follows 64-bit SVR4 calls.
       NOP,

       /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
       /// MTCTR instruction.
       MTCTR,
-
+
       /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
       /// BCTRL instruction.
       BCTRL_Darwin, BCTRL_SVR4,
-
+
       /// Return with a flag operand, matched by 'blr'
       RET_FLAG,
-
+
       /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCRpseud/MFOCRF
       /// instructions.  This copies the bits corresponding to the specified
       /// CRREG into the resultant GPR.  Bits corresponding to other CR regs
@@ -122,20 +122,20 @@ namespace llvm {
       /// encoding for the OPC field to identify the compare.  For example, 838
       /// is VCMPGTSH.
       VCMP,
-
+
       /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
-      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
       /// opcode number encoding for the OPC field to identify the compare.  For
       /// example, 838 is VCMPGTSH.
       VCMPo,
-
+
       /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
       /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
       /// condition register to branch on, OPC is the branch opcode to use (e.g.
       /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
       /// an optional input flag argument.
       COND_BRANCH,
-
+
       // The following 5 instructions are used only as part of the
       // long double-to-int conversion sequence.

@@ -150,7 +150,7 @@ namespace llvm {
       MTFSB1,

       /// F8RC, OUTFLAG = FADDRTZ F8RC, F8RC, INFLAG - This is an FADD done with
-      /// rounding towards zero.  It has flags added so it won't move past the
+      /// rounding towards zero.  It has flags added so it won't move past the
       /// FPSCR-setting instructions.
       FADDRTZ,

@@ -174,14 +174,14 @@ namespace llvm {

       /// STD_32 - This is the STD instruction for use with "32-bit" registers.
       STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
-
-      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
       /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
       /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
       /// i32.
-      STBRX,
-
-      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      STBRX,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
       /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
       /// or i32.
@@ -194,7 +194,7 @@ namespace llvm {
     /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
     /// VPKUHUM instruction.
     bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
-
+
     /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
     /// VPKUWUM instruction.
     bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
@@ -208,16 +208,16 @@ namespace llvm {
     /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
     bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             bool isUnary);
-
+
     /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
     /// amount, otherwise return -1.
     int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
-
+
     /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
     /// specifies a splat of a single element that is suitable for input to
     /// VSPLTB/VSPLTH/VSPLTW.
     bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
-
+
     /// isAllNegativeZeroVector - Returns true if all elements of build_vector
     /// are -0.0.
     bool isAllNegativeZeroVector(SDNode *N);
@@ -225,24 +225,26 @@ namespace llvm {
     /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
     /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
     unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
-
+
     /// get_VSPLTI_elt - If this is a build_vector of constants which can be
     /// formed by using a vspltis[bhw] instruction of the specified element
     /// size, return the constant being splatted.  The ByteSize field indicates
     /// the number of bytes of each element [124] -> [bhw].
     SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
   }
-
+
   class PPCTargetLowering : public TargetLowering {
     const PPCSubtarget &PPCSubTarget;
   public:
     explicit PPCTargetLowering(PPCTargetMachine &TM);
-
+
     /// getTargetNodeName() - This method returns the name of a target specific
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;

+    virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;

@@ -253,19 +255,19 @@ namespace llvm {
                                      SDValue &Offset,
                                      ISD::MemIndexedMode &AM,
                                      SelectionDAG &DAG) const;
-
+
     /// SelectAddressRegReg - Given the specified addressed, check to see if it
     /// can be represented as an indexed [r+r] operation.  Returns false if it
     /// can be more efficiently represented with [r+imm].
     bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
                              SelectionDAG &DAG) const;
-
+
     /// SelectAddressRegImm - Returns true if the address N can be represented
     /// by a base register plus a signed 16-bit displacement [r+imm], and if it
     /// is not better represented as reg+reg.
     bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
                              SelectionDAG &DAG) const;
-
+
     /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
     bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
@@ -277,7 +279,7 @@ namespace llvm {
     bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
                                   SelectionDAG &DAG) const;

-
+
     /// LowerOperation - Provide custom lowering hooks for some operations.
     ///
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
@@ -289,10 +291,10 @@ namespace llvm {
                             SelectionDAG &DAG) const;

     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-
+
     virtual void computeMaskedBitsForTargetNode(const SDValue Op,
                                                 const APInt &Mask,
-                                                APInt &KnownZero,
+                                                APInt &KnownZero,
                                                 APInt &KnownOne,
                                                 const SelectionDAG &DAG,
                                                 unsigned Depth = 0) const;
@@ -300,13 +302,13 @@ namespace llvm {
     virtual MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const;
-    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
                                         MachineBasicBlock *MBB, bool is64Bit,
                                         unsigned BinOpcode) const;
-    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
-                                                MachineBasicBlock *MBB,
+    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+                                                MachineBasicBlock *MBB,
                                                 bool is8bit, unsigned Opcode) const;
-
+
     ConstraintType getConstraintType(const std::string &Constraint) const;

     /// Examine constraint string and operand type and determine a weight value.
@@ -314,7 +316,7 @@ namespace llvm {
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const;

-    std::pair<unsigned, const TargetRegisterClass*>
+    std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    EVT VT) const;
@@ -329,11 +331,11 @@ namespace llvm {
                                       char ConstraintLetter,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const;
-
+
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
-
+
     /// isLegalAddressImmediate - Return true if the integer value can be used
     /// as the offset of the target addressing mode for load / store of the
     /// given type.
@@ -344,7 +346,7 @@ namespace llvm {
     virtual bool isLegalAddressImmediate(GlobalValue *GV) const;

     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
-
+
     /// getOptimalMemOpType - Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. If DstAlign is zero that means it's safe to destination
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 4e14fbb..f85914b 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -254,6 +254,20 @@ unsigned long reverse(unsigned v) {

 //===---------------------------------------------------------------------===//

+[LOOP DELETION]
+
+We don't delete this output free loop, because trip count analysis doesn't
+realize that it is finite (if it were infinite, it would be undefined).  Not
+having this blocks Loop Idiom from matching strlen and friends.
+
+void foo(char *C) {
+  int x = 0;
+  while (*C)
+    ++x,++C;
+}
+
+//===---------------------------------------------------------------------===//
+
 [LOOP RECOGNITION]

 These idioms should be recognized as popcount (see PR1488):
@@ -287,6 +301,16 @@ unsigned int popcount(unsigned int input) {
   return count;
 }

+This should be recognized as CLZ:  rdar://8459039
+
+unsigned clz_a(unsigned a) {
+  int i;
+  for (i=0;i<32;i++)
+    if (a & (1<<(31-i)))
+      return i;
+  return 32;
+}
+
 This sort of thing should be added to the loop idiom pass.
//===---------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index ee29275..4b12852 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -79,6 +79,7 @@ namespace { MachineBasicBlock::iterator findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot); + bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize); }; char Filler::ID = 0; @@ -91,6 +92,7 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) { return new Filler(tm); } + /// runOnMachineBasicBlock - Fill in delay slots for the given basic block. /// We assume there is only one delay slot per delayed instruction. /// @@ -112,6 +114,13 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(SP::NOP)); else MBB.splice(++J, &MBB, D); + unsigned structSize = 0; + if (needsUnimp(I, structSize)) { + MachineBasicBlock::iterator J = I; + ++J; //skip the delay filler. + BuildMI(MBB, ++J, I->getDebugLoc(), + TII->get(SP::UNIMP)).addImm(structSize); + } } return Changed; } @@ -287,6 +296,28 @@ bool Filler::isDelayFiller(MachineBasicBlock &MBB, { if (candidate == MBB.begin()) return false; + if (candidate->getOpcode() == SP::UNIMP) + return true; const TargetInstrDesc &prevdesc = (--candidate)->getDesc(); return prevdesc.hasDelaySlot(); } + +bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize) +{ + if (!I->getDesc().isCall()) + return false; + + unsigned structSizeOpNum = 0; + switch (I->getOpcode()) { + default: llvm_unreachable("Unknown call opcode."); + case SP::CALL: structSizeOpNum = 1; break; + case SP::JMPLrr: + case SP::JMPLri: structSizeOpNum = 2; break; + } + + const MachineOperand &MO = I->getOperand(structSizeOpNum); + if (!MO.isImm()) + return false; + StructSize = MO.getImm(); + return true; +} diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 196b87d..70574c3 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -16,7 +16,9 @@ #include "SparcISelLowering.h" #include "SparcTargetMachine.h" #include "SparcMachineFunctionInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/Module.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -116,6 +118,8 @@ SparcTargetLowering::LowerReturn(SDValue Chain, // Guarantee that all emitted copies are stuck together with flags. 
Flag = Chain.getValue(1); } + + unsigned RetAddrOffset = 8; //Call Inst + Delay Slot // If the function returns a struct, copy the SRetReturnReg to I0 if (MF.getFunction()->hasStructRetAttr()) { SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>(); @@ -127,11 +131,16 @@ SparcTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); if (MF.getRegInfo().liveout_empty()) MF.getRegInfo().addLiveOut(SP::I0); + RetAddrOffset = 12; // CallInst + Delay Slot + Unimp } + SDValue RetAddrOffsetNode = DAG.getConstant(RetAddrOffset, MVT::i32); + if (Flag.getNode()) - return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, Flag); - return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain); + return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, + RetAddrOffsetNode, Flag); + return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, + RetAddrOffsetNode); } /// LowerFormalArguments - V8 uses a very simple ABI, where all values are @@ -194,7 +203,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, false, false, 0); } else { unsigned loReg = MF.addLiveIn(NextVA.getLocReg(), - &SP::IntRegsRegClass, dl); + &SP::IntRegsRegClass); LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32); } SDValue WholeValue = @@ -393,6 +402,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains; const unsigned StackOffset = 92; + bool hasStructRetAttr = false; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size(); i != e; @@ -433,6 +443,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), false, false, 0)); + hasStructRetAttr = true; continue; } @@ -546,6 +557,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); } + unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0; + // If the callee is a GlobalAddress node (quite common, every direct call is) // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. 
@@ -559,6 +572,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> Ops; Ops.push_back(Chain); Ops.push_back(Callee); + if (hasStructRetAttr) + Ops.push_back(DAG.getTargetConstant(SRetArgSize, MVT::i32)); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { unsigned Reg = RegsToPass[i].first; if (Reg >= SP::I0 && Reg <= SP::I7) @@ -600,7 +615,29 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, return Chain; } +unsigned +SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const +{ + const Function *CalleeFn = 0; + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + CalleeFn = dyn_cast<Function>(G->getGlobal()); + } else if (ExternalSymbolSDNode *E = + dyn_cast<ExternalSymbolSDNode>(Callee)) { + const Function *Fn = DAG.getMachineFunction().getFunction(); + const Module *M = Fn->getParent(); + CalleeFn = M->getFunction(E->getSymbol()); + } + + if (!CalleeFn) + return 0; + assert(CalleeFn->hasStructRetAttr() && + "Callee does not have the StructRet attribute."); + + const PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); + const Type *ElementTy = Ty->getElementType(); + return getTargetData()->getTypeAllocSize(ElementTy); +} //===----------------------------------------------------------------------===// // TargetLowering Implementation diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 849e401..7d02df8 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -101,6 +101,8 @@ namespace llvm { SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + + unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const; }; } // end namespace llvm diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 1072323..cf5c48f 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -124,7 +124,8 @@ def call : SDNode<"SPISD::CALL", SDT_SPCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; -def retflag : SDNode<"SPISD::RET_FLAG", SDTNone, +def SDT_SPRet : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet, [SDNPHasChain, SDNPOptInGlue]>; def flushw : SDNode<"SPISD::FLUSHW", SDTNone, @@ -132,7 +133,7 @@ def flushw : SDNode<"SPISD::FLUSHW", SDTNone, def getPCX : Operand<i32> { let PrintMethod = "printGetPCX"; -} +} //===----------------------------------------------------------------------===// // SPARC Flag Conditions @@ -232,6 +233,9 @@ let hasSideEffects = 1, mayStore = 1 in { [(flushw)]>; } +def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val), + "unimp $val", []>; + // FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the // fpmover pass. let Predicates = [HasNoV9] in { // Only emit these in V8 mode. @@ -292,11 +296,13 @@ let usesCustomInserter = 1, Uses = [FCC] in { // Section A.3 - Synthetic Instructions, p. 
85 // special cases of JMPL: let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in { - let rd = O7.Num, rs1 = G0.Num, simm13 = 8 in - def RETL: F3_2<2, 0b111000, (outs), (ins), "retl", [(retflag)]>; + let rd = O7.Num, rs1 = G0.Num in + def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val), + "jmp %o7+$val", [(retflag simm13:$val)]>; - let rd = I7.Num, rs1 = G0.Num, simm13 = 8 in - def RET: F3_2<2, 0b111000, (outs), (ins), "ret", []>; + let rd = I7.Num, rs1 = G0.Num in + def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val), + "jmp %i7+$val", []>; } // Section B.1 - Load Integer Instructions, p. 90 diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index d694f2e..90939c3 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -59,9 +59,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) : // Compute derived properties from the register classes computeRegisterProperties(); - // Set shifts properties - setShiftAmountType(MVT::i64); - // Provide all sorts of operation actions setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 51d2df3..3019242 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -57,6 +57,8 @@ namespace llvm { public: explicit SystemZTargetLowering(SystemZTargetMachine &TM); + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i64; } + /// LowerOperation - Provide custom lowering hooks for some operations. virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 1cac07a..8fe549b 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -775,6 +775,19 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, delete &Op; } } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". + if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } // FIXME: Hack to recognize s{hr,ar,hl} $1, <op>. Canonicalize to // "shift <op>".
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 691e2d7..f777756 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -168,16 +168,16 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, switch (insn.displacementSize) { default: break; - case 8: + case 1: type = TYPE_MOFFS8; break; - case 16: + case 2: type = TYPE_MOFFS16; break; - case 32: + case 4: type = TYPE_MOFFS32; break; - case 64: + case 8: type = TYPE_MOFFS64; break; } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 4f4fbcd..d0dc8b5 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -399,7 +399,7 @@ struct InternalInstruction { /* The segment override type */ SegmentOverride segmentOverride; - /* Sizes of various critical pieces of data */ + /* Sizes of various critical pieces of data, in bytes */ uint8_t registerSize; uint8_t addressSize; uint8_t displacementSize; diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index c10e170..abd1515 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1879,39 +1879,71 @@ _add32carry: //===---------------------------------------------------------------------===// -This: -char t(char c) { - return c/3; +The hot loop of 256.bzip2 contains code that looks a bit like this: + +int foo(char *P, char *Q, int x, int y) { + if (P[0] != Q[0]) + return P[0] < Q[0]; + if (P[1] != Q[1]) + return P[1] < Q[1]; + if (P[2] != Q[2]) + return P[2] < Q[2]; + return P[3] < Q[3]; } -Compiles to: $clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer +In the real code, we get a lot more wrong than this. However, even in this +code we generate: -_t: ## @t - movslq %edi, %rax - imulq $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB - shrq $32, %rcx - addl %ecx, %eax - movl %eax, %ecx - shrl $31, %ecx - shrl %eax - addl %ecx, %eax - movsbl %al, %eax +_foo: ## @foo +## BB#0: ## %entry + movb (%rsi), %al + movb (%rdi), %cl + cmpb %al, %cl + je LBB0_2 +LBB0_1: ## %if.then + cmpb %al, %cl + jmp LBB0_5 +LBB0_2: ## %if.end + movb 1(%rsi), %al + movb 1(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#3: ## %if.end38 + movb 2(%rsi), %al + movb 2(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#4: ## %if.end60 + movb 3(%rdi), %al + cmpb 3(%rsi), %al +LBB0_5: ## %if.end60 + setl %al + movzbl %al, %eax ret -GCC gets: +Note that we generate jumps to LBB0_1 which does a redundant compare. The +redundant compare also forces the register values to be live, which prevents +folding one of the loads into the compare. In contrast, GCC 4.2 produces: -_t: - movl $86, %eax - imulb %dil - shrw $8, %ax - sarb $7, %dil - subb %dil, %al - movsbl %al,%eax +_foo: + movzbl (%rsi), %eax + cmpb %al, (%rdi) + jne L10 +L12: + movzbl 1(%rsi), %eax + cmpb %al, 1(%rdi) + jne L10 + movzbl 2(%rsi), %eax + cmpb %al, 2(%rdi) + jne L10 + movzbl 3(%rdi), %eax + cmpb 3(%rsi), %al +L10: + setl %al + movzbl %al, %eax ret -which is nicer. This also happens for int, not just char. +which is "perfect". 
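As a reading aid (our rewrite, not part of the source, dropping the unused
x and y parameters), the shape GCC produces corresponds to funneling every
mismatch into one shared signed compare, which in C looks like:

int foo_tail(char *P, char *Q) {
  int i;
  // Find the first mismatching byte, or stop at index 3; every path
  // falls into the single P[i] < Q[i] compare below, so no byte is
  // compared at two separate sites.
  for (i = 0; i != 3; ++i)
    if (P[i] != Q[i])
      break;
  return P[i] < Q[i];
}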
//===---------------------------------------------------------------------===// - - diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 9d42ac2..6fa9284 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -597,9 +597,13 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle TLS or DLLImport. + // Can't handle DLLImport. + if (GV->hasDLLImportLinkage()) + return false; + + // Can't handle TLS. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->isThreadLocal() || GVar->hasDLLImportLinkage()) + if (GVar->isThreadLocal()) return false; // Okay, we've committed to selecting this global. Set up the basic address. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 27024b4..2f49dbc 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -45,7 +45,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/VectorExtras.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -56,10 +55,6 @@ using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt<bool> -Disable256Bit("disable-256bit", cl::Hidden, - cl::desc("Disable use of 256-bit vectors")); - // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -225,7 +220,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. 
- setShiftAmountType(MVT::i8); setBooleanContents(ZeroOrOneBooleanContent); setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -1713,7 +1707,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, else llvm_unreachable("Unknown argument type!"); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this is an 8 or 16-bit value, it is really passed promoted to 32 @@ -1845,7 +1839,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - X86::GR64RegisterClass, dl); + X86::GR64RegisterClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -1861,7 +1855,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<SDValue, 11> SaveXMMOps; SaveXMMOps.push_back(Chain); - unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass, dl); + unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); SaveXMMOps.push_back(ALVal); @@ -1872,7 +1866,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - X86::VR128RegisterClass, dl); + X86::VR128RegisterClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); SaveXMMOps.push_back(Val); } @@ -2693,6 +2687,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: @@ -2760,6 +2758,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: @@ -4178,7 +4180,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, - DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); + DAG.getConstant(NumBits, + TLI.getShiftAmountTy(SrcOp.getValueType())))); } SDValue @@ -4327,16 +4330,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and // use shuffles to put them in place. - if (VT.getSizeInBits() > 256 && - Subtarget->hasAVX() && - !Disable256Bit && + if (VT.getSizeInBits() > 256 && + Subtarget->hasAVX() && !ISD::isBuildVectorAllZeros(Op.getNode())) { SmallVector<SDValue, 8> V; V.resize(NumElems); for (unsigned i = 0; i < NumElems; ++i) { V[i] = Op.getOperand(i); } - + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build the lower subvector. 
@@ -5044,7 +5046,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DAG.getIntPtrConstant(Elt1 / 2)); if ((Elt1 & 1) == 0) InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, - DAG.getConstant(8, TLI.getShiftAmountTy())); + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt.getValueType()))); else if (Elt0 >= 0) InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, DAG.getConstant(0xFF00, MVT::i16)); @@ -5058,7 +5061,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); if ((Elt0 & 1) != 0) InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, - DAG.getConstant(8, TLI.getShiftAmountTy())); + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt0.getValueType()))); else if (Elt1 >= 0) InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, DAG.getConstant(0x00FF, MVT::i16)); @@ -5475,7 +5479,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { // Both of them can't be memory operations though. if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) CanFoldLoad = false; - + if (CanFoldLoad) { if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); @@ -6088,7 +6092,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue ScaledN2 = N2; if (Upper) ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2, - DAG.getConstant(NumElems / + DAG.getConstant(NumElems / (VT.getSizeInBits() / 128), N2.getValueType())); Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0, @@ -9327,6 +9331,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; + case X86ISD::VUNPCKLPS: return "X86ISD::VUNPCKLPS"; + case X86ISD::VUNPCKLPD: return "X86ISD::VUNPCKLPD"; + case X86ISD::VUNPCKLPSY: return "X86ISD::VUNPCKLPSY"; + case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; @@ -11984,6 +11992,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PUNPCKLQDQ: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 419da37..6ec4a7d 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -159,16 +159,16 @@ namespace llvm { /// PSHUFB - Shuffle 16 8-bit values within a vector. PSHUFB, - + /// PANDN - and with not'd value. PANDN, - + /// PSIGNB/W/D - Copy integer sign. - PSIGNB, PSIGNW, PSIGND, - + PSIGNB, PSIGNW, PSIGND, + /// PBLENDVB - Variable blend PBLENDVB, - + /// FMAX, FMIN - Floating point max and min. /// FMAX, FMIN, @@ -212,7 +212,7 @@ namespace llvm { // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - + UMUL, // LOW, HI, FLAGS = umul LHS, RHS // MUL_IMM - X86 specific multiply by immediate. 
@@ -248,6 +248,10 @@ namespace llvm { MOVSS, UNPCKLPS, UNPCKLPD, + VUNPCKLPS, + VUNPCKLPD, + VUNPCKLPSY, + VUNPCKLPDY, UNPCKHPS, UNPCKHPD, PUNPCKLBW, @@ -463,6 +467,8 @@ namespace llvm { virtual unsigned getJumpTableEncoding() const; + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; } + virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid, diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 344c14c..0660072 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -41,6 +41,8 @@ def MRM_F8 : Format<41>; def MRM_F9 : Format<42>; def RawFrmImm8 : Format<43>; def RawFrmImm16 : Format<44>; +def MRM_D0 : Format<45>; +def MRM_D1 : Format<46>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index ceb1b65..76a9b12 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -369,8 +369,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, 16 }, @@ -568,6 +566,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, { X86::MAXPDrr, X86::MAXPDrm, 16 }, { X86::MAXPDrr_Int, X86::MAXPDrm_Int, 16 }, { X86::MAXPSrr, X86::MAXPSrm, 16 }, diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 1d44207..fcb5a25 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -311,6 +311,8 @@ namespace X86II { MRM_F0 = 40, MRM_F8 = 41, MRM_F9 = 42, + MRM_D0 = 45, + MRM_D1 = 46, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by @@ -577,6 +579,8 @@ namespace X86II { case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_D0: + case X86II::MRM_D1: return -1; } } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 87dc4be..f832a7c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1296,6 +1296,9 @@ def : MnemonicAlias<"lret", "lretl">; def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>; def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>; def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>; def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 1a58ba0..6a24d14 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -388,3 +388,8 @@ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; def WBINVD : I<0x09, RawFrm, (outs), (ins), 
"wbinvd", []>, TB; +let Defs = [RDX, RAX], Uses = [RCX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [RDX, RAX, RCX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index e6dc74e..0e3b571 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -979,6 +979,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); EmitByte(0xF9, CurByte, OS); break; + case X86II::MRM_D0: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD0, CurByte, OS); + break; + case X86II::MRM_D1: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD1, CurByte, OS); + break; } // If there is a remaining operand, it must be a trailing immediate. Emit it diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index de76856..1ee7312 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -342,9 +342,10 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, assert((!Is64Bit || HasX86_64) && "64-bit code requested on a subtarget that doesn't support it!"); - // Stack alignment is 16 bytes on Darwin and Linux (both 32 and 64 bit) and - // for all 64-bit targets. - if (isTargetDarwin() || isTargetLinux() || Is64Bit) + // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. + if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() || + isTargetSolaris() || Is64Bit) stackAlignment = 16; if (StackAlignment) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 8a119b4..0a62a02 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -166,6 +166,8 @@ public: bool hasVectorUAMem() const { return HasVectorUAMem; } bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } + bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". 
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 828d6f9..4817787 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -42,9 +42,9 @@ using namespace llvm; const char *XCoreTargetLowering:: -getTargetNodeName(unsigned Opcode) const +getTargetNodeName(unsigned Opcode) const { - switch (Opcode) + switch (Opcode) { case XCoreISD::BL : return "XCoreISD::BL"; case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper"; @@ -77,7 +77,6 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // Division is expensive setIntDivIsCheap(false); - setShiftAmountType(MVT::i32); setStackPointerRegisterToSaveRestore(XCore::SP); setSchedulingPreference(Sched::RegPressure); @@ -95,7 +94,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // Stop the combiner recombining select and set_cc setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - + // 64bit setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SUB, MVT::i64, Custom); @@ -106,14 +105,14 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); - + // Bit Manipulation setOperationAction(ISD::CTPOP, MVT::i32, Expand); setOperationAction(ISD::ROTL , MVT::i32, Expand); setOperationAction(ISD::ROTR , MVT::i32, Expand); - + setOperationAction(ISD::TRAP, MVT::Other, Legal); - + // Jump tables. setOperationAction(ISD::BR_JT, MVT::Other, Custom); @@ -122,7 +121,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // Thread Local Storage setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); - + // Conversion of i64 -> double produces constantpool nodes setOperationAction(ISD::ConstantPool, MVT::i32, Custom); @@ -143,7 +142,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VASTART, MVT::Other, Custom); - + // Dynamic stack setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); @@ -163,7 +162,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) SDValue XCoreTargetLowering:: LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) + switch (Op.getOpcode()) { case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); @@ -414,7 +413,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); DebugLoc DL = Op.getDebugLoc(); - + SDValue Base; int64_t Offset; if (!LD->isVolatile() && @@ -437,10 +436,10 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32); SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32); SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32); - + SDValue LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, LowOffset); SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, HighOffset); - + SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain, LowAddr, MachinePointerInfo(), false, false, 0); SDValue High = DAG.getLoad(getPointerTy(), DL, Chain, @@ -453,7 +452,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue 
Ops[] = { Result, Chain }; return DAG.getMergeValues(Ops, 2, DL); } - + if (LD->getAlignment() == 2) { SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr, LD->getPointerInfo(), MVT::i16, @@ -473,16 +472,16 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Ops[] = { Result, Chain }; return DAG.getMergeValues(Ops, 2, DL); } - + // Lower to a call to __misaligned_load(BasePtr). const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - + Entry.Ty = IntPtrTy; Entry.Node = BasePtr; Args.push_back(Entry); - + std::pair<SDValue, SDValue> CallResult = LowerCallTo(Chain, IntPtrTy, false, false, false, false, 0, CallingConv::C, false, @@ -515,7 +514,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const SDValue BasePtr = ST->getBasePtr(); SDValue Value = ST->getValue(); DebugLoc dl = Op.getDebugLoc(); - + if (ST->getAlignment() == 2) { SDValue Low = Value; SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value, @@ -532,19 +531,19 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const ST->isNonTemporal(), 2); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh); } - + // Lower to a call to __misaligned_store(BasePtr, Value). const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - + Entry.Ty = IntPtrTy; Entry.Node = BasePtr; Args.push_back(Entry); - + Entry.Node = Value; Args.push_back(Entry); - + std::pair<SDValue, SDValue> CallResult = LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, false, @@ -722,7 +721,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const } DebugLoc dl = N->getDebugLoc(); - + // Extract components SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, MVT::i32)); @@ -732,7 +731,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const N->getOperand(1), DAG.getConstant(0, MVT::i32)); SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(1), DAG.getConstant(1, MVT::i32)); - + // Expand unsigned Opcode = (N->getOpcode() == ISD::ADD) ? 
XCoreISD::LADD : XCoreISD::LSUB; @@ -740,7 +739,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const SDValue Carry = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), LHSL, RHSL, Zero); SDValue Lo(Carry.getNode(), 1); - + SDValue Ignored = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), LHSH, RHSH, Carry); SDValue Hi(Ignored.getNode(), 1); @@ -761,8 +760,8 @@ LowerVAARG(SDValue Op, SelectionDAG &DAG) const Node->getOperand(1), MachinePointerInfo(V), false, false, 0); // Increment the pointer, VAList, to the next vararg - SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList, - DAG.getConstant(VT.getSizeInBits(), + SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList, + DAG.getConstant(VT.getSizeInBits(), getPointerTy())); // Store the incremented VAList to the legalized pointer Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Node->getOperand(1), @@ -781,20 +780,20 @@ LowerVASTART(SDValue Op, SelectionDAG &DAG) const MachineFunction &MF = DAG.getMachineFunction(); XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32); - return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1), + return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1), MachinePointerInfo(), false, false, 0); } SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - // Depths > 0 not supported yet! + // Depths > 0 not supported yet! if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0) return SDValue(); - + MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo(); - return DAG.getCopyFromReg(DAG.getEntryNode(), dl, + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, RegInfo->getFrameRegister(MF), MVT::i32); } @@ -919,7 +918,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, + Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy(), true)); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; @@ -944,8 +943,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; } - - // Arguments that can be passed on register must be kept at + + // Arguments that can be passed on register must be kept at // RegsToPass vector if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); @@ -954,7 +953,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, int Offset = VA.getLocMemOffset(); - MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other, + MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other, Chain, Arg, DAG.getConstant(Offset/4, MVT::i32))); } @@ -963,16 +962,16 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Transform all store nodes into one single node because // all store nodes are independent of each other. 
if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Build a sequence of copy-to-reg nodes chained together with token + // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } @@ -986,7 +985,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32); // XCoreBranchLink = #chain, #target_address, #opt_in_flags... - // = Chain, Callee, Reg#1, Reg#2, ... + // = Chain, Callee, Reg#1, Reg#2, ... // // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -994,7 +993,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, Ops.push_back(Chain); Ops.push_back(Callee); - // Add argument registers to the end of the list so that they are + // Add argument registers to the end of the list so that they are // known live into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, @@ -1098,11 +1097,11 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, unsigned StackSlotSize = XCoreFrameLowering::stackSlotSize(); unsigned LRSaveSize = StackSlotSize; - + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - + if (VA.isRegLoc()) { // Arguments passed in registers EVT RegVT = VA.getLocVT(); @@ -1139,12 +1138,12 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // Create the SelectionDAG nodes corresponding to a load // from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); - InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, + InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, 0)); } } - + if (isVarArg) { /* Argument registers */ static const unsigned ArgRegs[] = { @@ -1186,7 +1185,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, true)); } } - + return Chain; } @@ -1222,7 +1221,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_XCore); - // If this is the first return lowered for this function, add + // If this is the first return lowered for this function, add // the regs to the liveout set for the function. if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { for (unsigned i = 0; i != RVLocs.size(); ++i) @@ -1237,7 +1236,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag); // guarantee that all emitted copies are @@ -1265,7 +1264,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DebugLoc dl = MI->getDebugLoc(); assert((MI->getOpcode() == XCore::SELECT_CC) && "Unexpected instr type to insert"); - + // To "insert" a SELECT_CC instruction, we actually have to insert the diamond // control-flow pattern.
The incoming instruction knows the destination vreg // to set, the condition code register to branch on, the true/false values to @@ -1273,7 +1272,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = BB; ++It; - + // thisMBB: // ... // TrueVal = ... @@ -1296,7 +1295,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); - + BuildMI(BB, dl, TII.get(XCore::BRFT_lru6)) .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); @@ -1304,10 +1303,10 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %FalseValue = ... // # fallthrough to sinkMBB BB = copy0MBB; - + // Update machine-CFG edges BB->addSuccessor(sinkMBB); - + // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... @@ -1316,7 +1315,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, TII.get(XCore::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -1354,7 +1353,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the // low bit set - if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { + if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); @@ -1377,7 +1376,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N0.getValueType(); // fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set - if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) { + if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) { APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); @@ -1393,7 +1392,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the // low bit set - if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { + if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); @@ -1557,7 +1556,7 @@ static inline bool isImmUs4(int64_t val) /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. 
bool -XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, +XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { if (Ty->getTypeID() == Type::VoidTyID) return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs); @@ -1568,7 +1567,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs%4 == 0; } - + switch (Size) { case 1: // reg + imm @@ -1593,7 +1592,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, // reg + reg<<2 return AM.Scale == 4 && AM.BaseOffs == 0; } - + return false; } @@ -1603,7 +1602,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, std::vector<unsigned> XCoreTargetLowering:: getRegClassForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const + EVT VT) const { if (Constraint.size() != 1) return std::vector<unsigned>(); @@ -1611,9 +1610,9 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, switch (Constraint[0]) { default : break; case 'r': - return make_vector<unsigned>(XCore::R0, XCore::R1, XCore::R2, - XCore::R3, XCore::R4, XCore::R5, - XCore::R6, XCore::R7, XCore::R8, + return make_vector<unsigned>(XCore::R0, XCore::R1, XCore::R2, + XCore::R3, XCore::R4, XCore::R5, + XCore::R6, XCore::R7, XCore::R8, XCore::R9, XCore::R10, XCore::R11, 0); break; } diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 7e5dd2e..bb3f2cc 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -20,11 +20,11 @@ #include "XCore.h" namespace llvm { - + // Forward declarations class XCoreSubtarget; class XCoreTargetMachine; - + namespace XCoreISD { enum NodeType { // Start the numbering where the builtin ops and target ops leave off. @@ -38,16 +38,16 @@ namespace llvm { // dp relative address DPRelativeWrapper, - + // cp relative address CPRelativeWrapper, - + // Store word to stack STWSP, // Corresponds to retsp instruction RETSP, - + // Corresponds to LADD instruction LADD, @@ -74,13 +74,14 @@ namespace llvm { //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// - class XCoreTargetLowering : public TargetLowering + class XCoreTargetLowering : public TargetLowering { public: explicit XCoreTargetLowering(XCoreTargetMachine &TM); virtual unsigned getJumpTableEncoding() const; + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; } /// LowerOperation - Provide custom lowering hooks for some operations. virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; @@ -91,10 +92,10 @@ namespace llvm { virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const; - /// getTargetNodeName - This method returns the name of a target specific + /// getTargetNodeName - This method returns the name of a target specific // DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const; - + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; @@ -108,7 +109,7 @@ namespace llvm { private: const XCoreTargetMachine &TM; const XCoreSubtarget &Subtarget; - + // Lower Operand helpers SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv, @@ -148,12 +149,12 @@ namespace llvm { SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; - + // Inline asm support std::vector<unsigned> getRegClassForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; - + // Expand specifics SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const; SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 38cc734..ecdd4cb 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -727,7 +727,7 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>; -// TODO setd, eet, eef, getts, setpt, outshr, inshr, testwct, tinitpc, tinitdp, +// TODO setd, eet, eef, testwct, tinitpc, tinitdp, // tinitsp, tinitcp, tsetmr, sext (reg), zext (reg) let Constraints = "$src1 = $dst" in { let neverHasSideEffects = 1 in @@ -758,6 +758,14 @@ def GETR_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$type), "getr $dst, $type", [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>; +def GETTS_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), + "getts $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>; + +def SETPT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), + "setpt res[$r], $val", + [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>; + def OUTCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), "outct res[$r], $val", [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>; @@ -774,6 +782,11 @@ def OUT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), "out res[$r], $val", [(int_xcore_out GRRegs:$r, GRRegs:$val)]>; +let Constraints = "$src = $dst" in +def OUTSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src), + "outshr res[$r], $src", + [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>; + def INCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), "inct $dst, res[$r]", [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>; @@ -786,6 +799,11 @@ def IN_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), "in $dst, res[$r]", [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>; +let Constraints = "$src = $dst" in +def INSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src), + "inshr $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>; + def CHKCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), "chkct res[$r], $val", [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>; @@ -799,7 +817,7 @@ def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; // Two operand long -// TODO settw, setclk, setrdy, setpsc, endin, peek, +// TODO setclk, setrdy, setpsc, endin, peek, // getd, testlcl, tinitlr, getps, setps def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), "bitrev $dst, $src", @@ -813,13 +831,17 @@ def CLZ_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), "clz $dst, $src", [(set GRRegs:$dst, (ctlz GRRegs:$src))]>; -def SETC_l2r : _FRU6<(outs), (ins GRRegs:$r, GRRegs:$val), +def SETC_l2r : _FL2R<(outs), (ins 
GRRegs:$r, GRRegs:$val), "setc res[$r], $val", [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>; +def SETTW_l2r : _FL2R<(outs), (ins GRRegs:$r, GRRegs:$val), + "settw res[$r], $val", + [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>; + // One operand short -// TODO edu, eeu, waitet, waitef, tstart, msync, mjoin, syncr, clrtp -// setdp, setcp, setv, setev, kcall +// TODO edu, eeu, waitet, waitef, tstart, msync, mjoin, clrtp +// setdp, setcp, setev, kcall // dgetreg let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), @@ -859,20 +881,41 @@ def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops), [(XCoreBranchLink GRRegs:$addr)]>; } +def SYNCR_1r : _F1R<(outs), (ins GRRegs:$r), + "syncr res[$r]", + [(int_xcore_syncr GRRegs:$r)]>; + def FREER_1r : _F1R<(outs), (ins GRRegs:$r), "freer res[$r]", [(int_xcore_freer GRRegs:$r)]>; +let Uses=[R11] in +def SETV_1r : _F1R<(outs), (ins GRRegs:$r), + "setv res[$r], r11", + [(int_xcore_setv GRRegs:$r, R11)]>; + +def EEU_1r : _F1R<(outs), (ins GRRegs:$r), + "eeu res[$r]", + [(int_xcore_eeu GRRegs:$r)]>; + // Zero operand short -// TODO waiteu, clre, ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed, +// TODO ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed, // stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret, // dentsp, drestsp +def CLRE_0R : _F0R<(outs), (ins), "clre", [(int_xcore_clre)]>; + let Defs = [R11] in def GETID_0R : _F0R<(outs), (ins), "get r11, id", [(set R11, (int_xcore_getid))]>; +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, + hasSideEffects = 1 in +def WAITEU_0R : _F0R<(outs), (ins), + "waiteu", + [(brind (int_xcore_waitevent))]>; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// |