Diffstat (limited to 'lib')
443 files changed, 23478 insertions, 20095 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 371dcaf..503fbbd 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -233,10 +233,12 @@ bool llvm::isNoAliasCall(const Value *V) {
 /// NoAlias returns
 ///
 bool llvm::isIdentifiedObject(const Value *V) {
-  if (isa<AllocaInst>(V) || isNoAliasCall(V))
+  if (isa<AllocaInst>(V))
     return true;
   if (isa<GlobalValue>(V) && !isa<GlobalAlias>(V))
     return true;
+  if (isNoAliasCall(V))
+    return true;
   if (const Argument *A = dyn_cast<Argument>(V))
     return A->hasNoAliasAttr() || A->hasByValAttr();
   return false;
diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
index bfa3ff1..37ee9fc 100644
--- a/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/lib/Analysis/AliasDebugger.cpp b/lib/Analysis/AliasDebugger.cpp
index 88c2875..bc2d9c55 100644
--- a/lib/Analysis/AliasDebugger.cpp
+++ b/lib/Analysis/AliasDebugger.cpp
@@ -45,8 +45,12 @@ namespace {
       InitializeAliasAnalysis(this);  // set up super class
 
       for(Module::global_iterator I = M.global_begin(),
-          E = M.global_end(); I != E; ++I)
+          E = M.global_end(); I != E; ++I) {
         Vals.insert(&*I);
+        for (User::const_op_iterator OI = I->op_begin(),
+             OE = I->op_end(); OI != OE; ++OI)
+          Vals.insert(*OI);
+      }
 
       for(Module::iterator I = M.begin(),
           E = M.end(); I != E; ++I){
@@ -58,8 +62,12 @@ namespace {
         for (Function::const_iterator FI = I->begin(), FE = I->end();
              FI != FE; ++FI)
           for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end();
-               BI != BE; ++BI)
+               BI != BE; ++BI) {
             Vals.insert(&*BI);
+            for (User::const_op_iterator OI = BI->op_begin(),
+                 OE = BI->op_end(); OI != OE; ++OI)
+              Vals.insert(*OI);
+          }
       }
     }
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index cfe7a1c..4f53a6d 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -78,6 +78,20 @@ static bool isNonEscapingLocalObject(const Value *V) {
   return false;
 }
 
+/// isEscapeSource - Return true if the pointer is one which would have
+/// been considered an escape by isNonEscapingLocalObject.
+static bool isEscapeSource(const Value *V) {
+  if (isa<CallInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V))
+    return true;
+
+  // The load case works because isNonEscapingLocalObject considers all
+  // stores to be escapes (it passes true for the StoreCaptures argument
+  // to PointerMayBeCaptured).
+  if (isa<LoadInst>(V))
+    return true;
+
+  return false;
+}
 
 /// isObjectSmallerThan - Return true if we can prove that the object specified
 /// by V is smaller than Size.
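[Editor's note] The isEscapeSource/isNonEscapingLocalObject pairing introduced above drives a new NoAlias rule later in this file: a pointer that arrives via a call, invoke, load, or argument cannot alias a local object whose address never escapes the function. Below is a minimal standalone C++ sketch of how the two predicates combine; all types and names here are toy stand-ins, not the LLVM classes.

    #include <cassert>

    enum class Kind { Alloca, CallResult, LoadResult, Argument, Global };

    struct Value {
      Kind kind;
      bool addressEscapes; // conservatively computed elsewhere
    };

    static bool isEscapeSource(const Value &v) {
      // Calls, loads, and arguments can only yield pointers that were
      // visible outside the current function at some point.
      return v.kind == Kind::CallResult || v.kind == Kind::LoadResult ||
             v.kind == Kind::Argument;
    }

    static bool isNonEscapingLocalObject(const Value &v) {
      return v.kind == Kind::Alloca && !v.addressEscapes;
    }

    enum class AliasResult { NoAlias, MayAlias };

    static AliasResult aliasCheck(const Value &a, const Value &b) {
      // The rule added by this commit: an escape source cannot point at a
      // local object that never escaped.
      if ((isEscapeSource(a) && isNonEscapingLocalObject(b)) ||
          (isEscapeSource(b) && isNonEscapingLocalObject(a)))
        return AliasResult::NoAlias;
      return AliasResult::MayAlias;
    }

    int main() {
      Value local{Kind::Alloca, /*addressEscapes=*/false};
      Value ret{Kind::CallResult, true};
      assert(aliasCheck(local, ret) == AliasResult::NoAlias);
    }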
@@ -94,7 +108,7 @@ static bool isObjectSmallerThan(const Value *V, unsigned Size,
   } else if (const CallInst* CI = extractMallocCall(V)) {
     if (!isArrayMalloc(V, &TD))
       // The size is the argument to the malloc call.
-      if (const ConstantInt* C = dyn_cast<ConstantInt>(CI->getOperand(1)))
+      if (const ConstantInt* C = dyn_cast<ConstantInt>(CI->getArgOperand(0)))
         return (C->getZExtValue() < Size);
     return false;
   } else if (const Argument *A = dyn_cast<Argument>(V)) {
@@ -177,9 +191,29 @@ static RegisterAnalysisGroup<AliasAnalysis> V(U);
 ImmutablePass *llvm::createNoAAPass() { return new NoAA(); }
 
 //===----------------------------------------------------------------------===//
-// BasicAA Pass
+// BasicAliasAnalysis Pass
 //===----------------------------------------------------------------------===//
 
+#ifndef NDEBUG
+static const Function *getParent(const Value *V) {
+  if (const Instruction *inst = dyn_cast<Instruction>(V))
+    return inst->getParent()->getParent();
+
+  if (const Argument *arg = dyn_cast<Argument>(V))
+    return arg->getParent();
+
+  return NULL;
+}
+
+static bool notDifferentParent(const Value *O1, const Value *O2) {
+
+  const Function *F1 = getParent(O1);
+  const Function *F2 = getParent(O2);
+
+  return !F1 || !F2 || F1 == F2;
+}
+#endif
+
 namespace {
   /// BasicAliasAnalysis - This is the default alias analysis implementation.
   /// Because it doesn't chain to a previous alias analysis (like -no-aa), it
@@ -187,11 +221,14 @@ namespace {
   struct BasicAliasAnalysis : public NoAA {
     static char ID; // Class identification, replacement for typeinfo
     BasicAliasAnalysis() : NoAA(&ID) {}
+
     AliasResult alias(const Value *V1, unsigned V1Size,
                       const Value *V2, unsigned V2Size) {
-      assert(VisitedPHIs.empty() && "VisitedPHIs must be cleared after use!");
+      assert(Visited.empty() && "Visited must be cleared after use!");
+      assert(notDifferentParent(V1, V2) &&
+             "BasicAliasAnalysis doesn't support interprocedural queries.");
       AliasResult Alias = aliasCheck(V1, V1Size, V2, V2Size);
-      VisitedPHIs.clear();
+      Visited.clear();
       return Alias;
     }
 
@@ -213,8 +250,8 @@ namespace {
     }
 
   private:
-    // VisitedPHIs - Track PHI nodes visited by a aliasCheck() call.
-    SmallPtrSet<const Value*, 16> VisitedPHIs;
+    // Visited - Track instructions visited by a aliasPHI, aliasSelect(), and aliasGEP().
+    SmallPtrSet<const Value*, 16> Visited;
 
     // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
     // instruction against another.
@@ -268,6 +305,9 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const Value *P) {
 /// simple "address taken" analysis on local objects.
 AliasAnalysis::ModRefResult
 BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
+  assert(notDifferentParent(CS.getInstruction(), P) &&
+         "AliasAnalysis query involving multiple functions!");
+
   const Value *Object = P->getUnderlyingObject();
 
   // If this is a tail call and P points to a stack location, we know that
@@ -318,10 +358,10 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
       case Intrinsic::memcpy:
       case Intrinsic::memmove: {
         unsigned Len = ~0U;
-        if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getOperand(3)))
+        if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
           Len = LenCI->getZExtValue();
-        Value *Dest = II->getOperand(1);
-        Value *Src = II->getOperand(2);
+        Value *Dest = II->getArgOperand(0);
+        Value *Src = II->getArgOperand(1);
         if (isNoAlias(Dest, Len, P, Size)) {
           if (isNoAlias(Src, Len, P, Size))
             return NoModRef;
@@ -332,9 +372,9 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
       case Intrinsic::memset:
         // Since memset is 'accesses arguments' only, the AliasAnalysis base class
        // will handle it for the variable length case.
-        if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getOperand(3))) {
+        if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
           unsigned Len = LenCI->getZExtValue();
-          Value *Dest = II->getOperand(1);
+          Value *Dest = II->getArgOperand(0);
           if (isNoAlias(Dest, Len, P, Size))
             return NoModRef;
         }
@@ -352,7 +392,7 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
       case Intrinsic::atomic_load_umax:
       case Intrinsic::atomic_load_umin:
         if (TD) {
-          Value *Op1 = II->getOperand(1);
+          Value *Op1 = II->getArgOperand(0);
           unsigned Op1Size = TD->getTypeStoreSize(Op1->getType());
           if (isNoAlias(Op1, Op1Size, P, Size))
             return NoModRef;
@@ -361,14 +401,14 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
       case Intrinsic::lifetime_start:
       case Intrinsic::lifetime_end:
       case Intrinsic::invariant_start: {
-        unsigned PtrSize = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
-        if (isNoAlias(II->getOperand(2), PtrSize, P, Size))
+        unsigned PtrSize = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+        if (isNoAlias(II->getArgOperand(1), PtrSize, P, Size))
           return NoModRef;
         break;
       }
       case Intrinsic::invariant_end: {
-        unsigned PtrSize = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
-        if (isNoAlias(II->getOperand(3), PtrSize, P, Size))
+        unsigned PtrSize = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
+        if (isNoAlias(II->getArgOperand(2), PtrSize, P, Size))
           return NoModRef;
         break;
       }
@@ -440,6 +480,13 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size,
                              const Value *V2, unsigned V2Size,
                              const Value *UnderlyingV1,
                              const Value *UnderlyingV2) {
+  // If this GEP has been visited before, we're on a use-def cycle.
+  // Such cycles are only valid when PHI nodes are involved or in unreachable
+  // code. The visitPHI function catches cycles containing PHIs, but there
+  // could still be a cycle without PHIs in unreachable code.
+  if (!Visited.insert(GEP1))
+    return MayAlias;
+
   int64_t GEP1BaseOffset;
   SmallVector<std::pair<const Value*, int64_t>, 4> GEP1VariableIndices;
 
@@ -550,6 +597,13 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size,
 AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasSelect(const SelectInst *SI, unsigned SISize,
                                 const Value *V2, unsigned V2Size) {
+  // If this select has been visited before, we're on a use-def cycle.
+  // Such cycles are only valid when PHI nodes are involved or in unreachable
+  // code. The visitPHI function catches cycles containing PHIs, but there
+  // could still be a cycle without PHIs in unreachable code.
+  if (!Visited.insert(SI))
+    return MayAlias;
+
   // If the values are Selects with the same condition, we can do a more precise
   // check: just check for aliases between the values on corresponding arms.
   if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
@@ -570,11 +624,17 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, unsigned SISize,
   // If both arms of the Select node NoAlias or MustAlias V2, then returns
   // NoAlias / MustAlias. Otherwise, returns MayAlias.
   AliasResult Alias =
-    aliasCheck(SI->getTrueValue(), SISize, V2, V2Size);
+    aliasCheck(V2, V2Size, SI->getTrueValue(), SISize);
   if (Alias == MayAlias)
     return MayAlias;
+
+  // If V2 is visited, the recursive case will have been caught in the
+  // above aliasCheck call, so these subsequent calls to aliasCheck
+  // don't need to assume that V2 is being visited recursively.
+  Visited.erase(V2);
+
   AliasResult ThisAlias =
-    aliasCheck(SI->getFalseValue(), SISize, V2, V2Size);
+    aliasCheck(V2, V2Size, SI->getFalseValue(), SISize);
   if (ThisAlias != Alias)
     return MayAlias;
   return Alias;
@@ -586,7 +646,7 @@ AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasPHI(const PHINode *PN, unsigned PNSize,
                              const Value *V2, unsigned V2Size) {
   // The PHI node has already been visited, avoid recursion any further.
-  if (!VisitedPHIs.insert(PN))
+  if (!Visited.insert(PN))
     return MayAlias;
 
   // If the values are PHIs in the same block, we can do a more precise
@@ -636,10 +696,10 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, unsigned PNSize,
   for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
     Value *V = V1Srcs[i];
 
-    // If V2 is a PHI, the recursive case will have been caught in the
+    // If V2 is visited, the recursive case will have been caught in the
     // above aliasCheck call, so these subsequent calls to aliasCheck
     // don't need to assume that V2 is being visited recursively.
-    VisitedPHIs.erase(V2);
+    Visited.erase(V2);
 
     AliasResult ThisAlias = aliasCheck(V2, V2Size, V, PNSize);
     if (ThisAlias != Alias || ThisAlias == MayAlias)
@@ -693,17 +753,32 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, unsigned V1Size,
       (isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
     return NoAlias;
 
-  // Arguments can't alias with local allocations or noalias calls.
-  if ((isa<Argument>(O1) && (isa<AllocaInst>(O2) || isNoAliasCall(O2))) ||
-      (isa<Argument>(O2) && (isa<AllocaInst>(O1) || isNoAliasCall(O1))))
+  // Arguments can't alias with local allocations or noalias calls
+  // in the same function.
+  if (((isa<Argument>(O1) && (isa<AllocaInst>(O2) || isNoAliasCall(O2))) ||
+       (isa<Argument>(O2) && (isa<AllocaInst>(O1) || isNoAliasCall(O1)))))
     return NoAlias;
 
   // Most objects can't alias null.
-  if ((isa<ConstantPointerNull>(V2) && isKnownNonNull(O1)) ||
-      (isa<ConstantPointerNull>(V1) && isKnownNonNull(O2)))
+  if ((isa<ConstantPointerNull>(O2) && isKnownNonNull(O1)) ||
+      (isa<ConstantPointerNull>(O1) && isKnownNonNull(O2)))
     return NoAlias;
-  }
 
+  // If one pointer is the result of a call/invoke or load and the other is a
+  // non-escaping local object within the same function, then we know the
+  // object couldn't escape to a point where the call could return it.
+  //
+  // Note that if the pointers are in different functions, there are a
+  // variety of complications. A call with a nocapture argument may still
+  // temporary store the nocapture argument's value in a temporary memory
+  // location if that memory location doesn't escape. Or it may pass a
+  // nocapture value to other functions as long as they don't capture it.
+  if (isEscapeSource(O1) && isNonEscapingLocalObject(O2))
+    return NoAlias;
+  if (isEscapeSource(O2) && isNonEscapingLocalObject(O1))
+    return NoAlias;
+  }
 
   // If the size of one access is larger than the entire object on the other
   // side, then we know such behavior is undefined and can assume no alias.
   if (TD)
@@ -711,22 +786,6 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, unsigned V1Size,
         (V2Size != ~0U && isObjectSmallerThan(O1, V2Size, *TD)))
       return NoAlias;
 
-  // If one pointer is the result of a call/invoke or load and the other is a
-  // non-escaping local object, then we know the object couldn't escape to a
-  // point where the call could return it. The load case works because
-  // isNonEscapingLocalObject considers all stores to be escapes (it
-  // passes true for the StoreCaptures argument to PointerMayBeCaptured).
-  if (O1 != O2) {
-    if ((isa<CallInst>(O1) || isa<InvokeInst>(O1) || isa<LoadInst>(O1) ||
-         isa<Argument>(O1)) &&
-        isNonEscapingLocalObject(O2))
-      return NoAlias;
-    if ((isa<CallInst>(O2) || isa<InvokeInst>(O2) || isa<LoadInst>(O2) ||
-         isa<Argument>(O2)) &&
-        isNonEscapingLocalObject(O1))
-      return NoAlias;
-  }
-
   // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
   // GEP can't simplify, we don't even look at the PHI cases.
   if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
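[Editor's note] The Visited set threaded through aliasGEP, aliasSelect, and aliasPHI above exists to cut off recursion on use-def cycles, which can only occur via PHIs or in unreachable code. A self-contained C++ sketch of the guard, using toy node types rather than LLVM's classes:

    #include <set>
    #include <vector>

    struct Node {
      std::vector<const Node *> operands;
    };

    enum class AliasResult { MayAlias, NoAlias };

    static AliasResult aliasCheck(const Node *N,
                                  std::set<const Node *> &Visited) {
      // A second visit means we are on a cycle; give the conservative answer
      // instead of recursing forever.
      if (!Visited.insert(N).second)
        return AliasResult::MayAlias;
      for (const Node *Op : N->operands)
        if (aliasCheck(Op, Visited) == AliasResult::MayAlias)
          return AliasResult::MayAlias;
      return AliasResult::NoAlias;
    }

    int main() {
      Node a, b;
      a.operands = {&b};
      b.operands = {&a}; // a use-def cycle, as only unreachable IR contains
      std::set<const Node *> Visited;
      aliasCheck(&a, Visited); // terminates with MayAlias
    }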
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 5a37ce0..d9b670d 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMAnalysis
   LibCallSemantics.cpp
   Lint.cpp
   LiveValues.cpp
+  Loads.cpp
   LoopDependenceAnalysis.cpp
   LoopInfo.cpp
   LoopPass.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 37cda02..13d8f4d 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -208,7 +208,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
        i != e; ++i, ++GTI) {
     ConstantInt *CI = dyn_cast<ConstantInt>(*i);
     if (!CI) return false;  // Index isn't a simple constant?
-    if (CI->getZExtValue() == 0) continue;  // Not adding anything.
+    if (CI->isZero()) continue;  // Not adding anything.
 
     if (const StructType *ST = dyn_cast<StructType>(*GTI)) {
       // N = N + Offset
@@ -436,8 +436,10 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
       unsigned StrLen = Str.length();
       const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
       unsigned NumBits = Ty->getPrimitiveSizeInBits();
-      // Replace LI with immediate integer store.
-      if ((NumBits >> 3) == StrLen + 1) {
+      // Replace load with immediate integer if the result is an integer or fp
+      // value.
+      if ((NumBits >> 3) == StrLen + 1 && (NumBits & 7) == 0 &&
+          (isa<IntegerType>(Ty) || Ty->isFloatingPointTy())) {
         APInt StrVal(NumBits, 0);
         APInt SingleChar(NumBits, 0);
         if (TD->isLittleEndian()) {
@@ -454,7 +456,11 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
           SingleChar = 0;
           StrVal = (StrVal << 8) | SingleChar;
         }
-        return ConstantInt::get(CE->getContext(), StrVal);
+
+        Constant *Res = ConstantInt::get(CE->getContext(), StrVal);
+        if (Ty->isFloatingPointTy())
+          Res = ConstantExpr::getBitCast(Res, Ty);
+        return Res;
       }
     }
 
@@ -772,9 +778,9 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy,
   case Instruction::ICmp:
   case Instruction::FCmp: assert(0 && "Invalid for compares");
   case Instruction::Call:
-    if (Function *F = dyn_cast<Function>(Ops[0]))
+    if (Function *F = dyn_cast<Function>(Ops[CallInst::ArgOffset ? 0:NumOps-1]))
       if (canConstantFoldCallTo(F))
-        return ConstantFoldCall(F, Ops+1, NumOps-1);
+        return ConstantFoldCall(F, Ops+CallInst::ArgOffset, NumOps-1);
     return 0;
   case Instruction::PtrToInt:
     // If the input is a inttoptr, eliminate the pair.  This requires knowing
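[Editor's note] The ConstantFolding hunk above builds an APInt from a constant string one byte at a time, honoring host byte order. A hedged, plain-C++ model of the little-endian direction, with uint64_t standing in for APInt; the final assert only holds when the sketch itself runs on a little-endian host.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint64_t foldStringLoadLE(const char *Str, unsigned NumBytes) {
      uint64_t Val = 0;
      // Little-endian: the first byte of the string becomes the lowest byte.
      for (unsigned i = NumBytes; i-- != 0; )
        Val = (Val << 8) | static_cast<uint8_t>(Str[i]);
      return Val;
    }

    int main() {
      const char Str[8] = "abcdefg"; // 7 chars + NUL: NumBits/8 == StrLen + 1
      uint64_t Folded = foldStringLoadLE(Str, 8);
      uint64_t Loaded;
      std::memcpy(&Loaded, Str, 8); // what the load would see (LE host only)
      assert(Folded == Loaded);
    }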
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp
index a7b6d2b..c8d0d22 100644
--- a/lib/Analysis/DebugInfo.cpp
+++ b/lib/Analysis/DebugInfo.cpp
@@ -73,6 +73,15 @@ GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
   return 0;
 }
 
+Function *DIDescriptor::getFunctionField(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+    return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
+  return 0;
+}
+
 unsigned DIVariable::getNumAddrElements() const {
   return DbgNode->getNumOperands()-6;
 }
@@ -397,6 +406,8 @@ bool DIVariable::isInlinedFnArgument(const Function *CurFn) {
 /// information for the function F.
 bool DISubprogram::describes(const Function *F) {
   assert(F && "Invalid function");
+  if (F == getFunction())
+    return true;
   StringRef Name = getLinkageName();
   if (Name.empty())
     Name = getName();
@@ -938,7 +949,8 @@ DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context,
                                          unsigned VK, unsigned VIndex,
                                          DIType ContainingType,
                                          bool isArtificial,
-                                         bool isOptimized) {
+                                         bool isOptimized,
+                                         Function *Fn) {
 
   Value *Elts[] = {
     GetTagConstant(dwarf::DW_TAG_subprogram),
@@ -956,9 +968,15 @@ DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context,
     ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
     ContainingType,
     ConstantInt::get(Type::getInt1Ty(VMContext), isArtificial),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized)
+    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
+    Fn
   };
-  return DISubprogram(MDNode::get(VMContext, &Elts[0], 16));
+  MDNode *Node = MDNode::get(VMContext, &Elts[0], 17);
+
+  // Create a named metadata so that we do not lose this mdnode.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
+  NMD->addOperand(Node);
+  return DISubprogram(Node);
 }
 
 /// CreateSubprogramDefinition - Create new subprogram descriptor for the
@@ -984,9 +1002,15 @@ DISubprogram DIFactory::CreateSubprogramDefinition(DISubprogram &SPDeclaration)
     DeclNode->getOperand(12), // VIndex
     DeclNode->getOperand(13), // Containting Type
     DeclNode->getOperand(14), // isArtificial
-    DeclNode->getOperand(15) // isOptimized
+    DeclNode->getOperand(15), // isOptimized
+    SPDeclaration.getFunction()
   };
-  return DISubprogram(MDNode::get(VMContext, &Elts[0], 16));
+  MDNode *Node =MDNode::get(VMContext, &Elts[0], 16);
+
+  // Create a named metadata so that we do not lose this mdnode.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
+  NMD->addOperand(Node);
+  return DISubprogram(Node);
 }
 
 /// CreateGlobalVariable - Create a new descriptor for the specified global.
@@ -1042,8 +1066,18 @@ DIVariable DIFactory::CreateVariable(unsigned Tag, DIDescriptor Context,
     // The optimizer may remove local variable. If there is an interest
     // to preserve variable info in such situation then stash it in a
     // named mdnode.
-    NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.lv");
-    NMD->addOperand(Node);
+    DISubprogram Fn(getDISubprogram(Context));
+    StringRef FName = "fn";
+    if (Fn.getFunction())
+      FName = Fn.getFunction()->getName();
+    char One = '\1';
+    if (FName.startswith(StringRef(&One, 1)))
+      FName = FName.substr(1);
+    NamedMDNode *FnLocals = M.getNamedMetadata(Twine("llvm.dbg.lv.", FName));
+    if (!FnLocals)
+      FnLocals = NamedMDNode::Create(VMContext, Twine("llvm.dbg.lv.", FName),
+                                     NULL, 0, &M);
+    FnLocals->addOperand(Node);
   }
   return DIVariable(Node);
 }
@@ -1110,18 +1144,6 @@ DILocation DIFactory::CreateLocation(unsigned LineNo, unsigned ColumnNo,
   return DILocation(MDNode::get(VMContext, &Elts[0], 4));
 }
 
-/// CreateLocation - Creates a debug info location.
-DILocation DIFactory::CreateLocation(unsigned LineNo, unsigned ColumnNo,
-                                     DIScope S, MDNode *OrigLoc) {
-  Value *Elts[] = {
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo),
-    S,
-    OrigLoc
-  };
-  return DILocation(MDNode::get(VMContext, &Elts[0], 4));
-}
-
 //===----------------------------------------------------------------------===//
 // DIFactory: Routines for inserting code into a function
 //===----------------------------------------------------------------------===//
@@ -1218,17 +1240,19 @@ void DebugInfoFinder::processModule(Module &M) {
             processLocation(DILocation(IA));
     }
 
-  NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv");
-  if (!NMD)
-    return;
-
-  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
-    DIGlobalVariable DIG(cast<MDNode>(NMD->getOperand(i)));
-    if (addGlobalVariable(DIG)) {
-      addCompileUnit(DIG.getCompileUnit());
-      processType(DIG.getType());
+  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) {
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      DIGlobalVariable DIG(cast<MDNode>(NMD->getOperand(i)));
+      if (addGlobalVariable(DIG)) {
+        addCompileUnit(DIG.getCompileUnit());
+        processType(DIG.getType());
+      }
     }
   }
+
+  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp"))
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      processSubprogram(DISubprogram(NMD->getOperand(i)));
 }
 
 /// processLocation - Process DILocation.
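[Editor's note] The recurring pattern in these DebugInfo.cpp hunks is to root otherwise-unreferenced debug MDNodes in named metadata so optimizations cannot drop them. A short sketch of that pattern, using the same two calls the diff itself uses (getOrInsertNamedMetadata and addOperand); the wrapper function and its name are illustrative only.

    #include "llvm/Module.h"
    #include "llvm/Metadata.h"

    using namespace llvm;

    // Named metadata is a module-level root: anything reachable from it
    // stays alive even after every instruction-level reference is gone.
    static void preserveSubprogram(Module &M, MDNode *Node) {
      NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
      NMD->addOperand(Node);
    }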
diff --git a/lib/Analysis/DomPrinter.cpp b/lib/Analysis/DomPrinter.cpp
index a1676e5..d95c376 100644
--- a/lib/Analysis/DomPrinter.cpp
+++ b/lib/Analysis/DomPrinter.cpp
@@ -43,10 +43,10 @@ struct DOTGraphTraits<DomTreeNode*> : public DefaultDOTGraphTraits {
 
     if (isSimple())
       return DOTGraphTraits<const Function*>
-        ::getSimpleNodeLabel(BB, BB->getParent());
+               ::getSimpleNodeLabel(BB, BB->getParent());
     else
       return DOTGraphTraits<const Function*>
-        ::getCompleteNodeLabel(BB, BB->getParent());
+               ::getCompleteNodeLabel(BB, BB->getParent());
   }
 };
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 2bde56d7..65c7c6e 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -126,13 +126,15 @@ private:
     }
 
     // Loop over all of the users of the function, looking for non-call uses.
-    for (Value::use_iterator I = F->use_begin(), E = F->use_end(); I != E; ++I)
-      if ((!isa<CallInst>(I) && !isa<InvokeInst>(I))
-          || !CallSite(cast<Instruction>(I)).isCallee(I)) {
+    for (Value::use_iterator I = F->use_begin(), E = F->use_end(); I != E; ++I){
+      User *U = *I;
+      if ((!isa<CallInst>(U) && !isa<InvokeInst>(U))
+          || !CallSite(cast<Instruction>(U)).isCallee(I)) {
        // Not a call, or being used as a parameter rather than as the callee.
         ExternalCallingNode->addCalledFunction(CallSite(), Node);
         break;
       }
+    }
 
     // If this function is not defined in this translation unit, it could call
     // anything.
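[Editor's note] CallGraph.cpp above, and GlobalsModRef.cpp and LoopInfo.cpp below, all apply the same mechanical cleanup: dereference the use_iterator once into a User* and inspect that pointer, rather than re-dereferencing the iterator (or passing the iterator itself) in every isa<>/dyn_cast<> test. A toy C++ rendering of the shape of the loop, with stand-in types:

    #include <vector>

    struct User { int Kind; };

    static int countLoads(const std::vector<User *> &Uses) {
      int Loads = 0;
      for (std::vector<User *>::const_iterator I = Uses.begin(),
           E = Uses.end(); I != E; ++I) {
        User *U = *I; // one dereference; every check below uses U
        if (U->Kind == 0)
          ++Loads;
      }
      return Loads;
    }

    int main() {
      User Load = {0}, Store = {1};
      std::vector<User *> Uses;
      Uses.push_back(&Load);
      Uses.push_back(&Store);
      return countLoads(Uses) == 1 ? 0 : 1;
    }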
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index b14afa3..f13deea 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -233,33 +233,34 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
                                          GlobalValue *OkayStoreDest) {
   if (!V->getType()->isPointerTy()) return true;
 
-  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI)
-    if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+  for (Value::use_iterator UI = V->use_begin(), E=V->use_end(); UI != E; ++UI) {
+    User *U = *UI;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       Readers.push_back(LI->getParent()->getParent());
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
       if (V == SI->getOperand(1)) {
         Writers.push_back(SI->getParent()->getParent());
       } else if (SI->getOperand(1) != OkayStoreDest) {
         return true;  // Storing the pointer
       }
-    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
       if (AnalyzeUsesOfPointer(GEP, Readers, Writers)) return true;
-    } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) {
+    } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
       if (AnalyzeUsesOfPointer(BCI, Readers, Writers, OkayStoreDest))
         return true;
-    } else if (isFreeCall(*UI)) {
-      Writers.push_back(cast<Instruction>(*UI)->getParent()->getParent());
-    } else if (CallInst *CI = dyn_cast<CallInst>(*UI)) {
+    } else if (isFreeCall(U)) {
+      Writers.push_back(cast<Instruction>(U)->getParent()->getParent());
+    } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
       // Make sure that this is just the function being called, not that it is
       // passing into the function.
-      for (unsigned i = 1, e = CI->getNumOperands(); i != e; ++i)
-        if (CI->getOperand(i) == V) return true;
-    } else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI)) {
+      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+        if (CI->getArgOperand(i) == V) return true;
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) {
       // Make sure that this is just the function being called, not that it is
       // passing into the function.
-      for (unsigned i = 0, e = II->getNumOperands() - 3; i != e; ++i)
-        if (II->getOperand(i) == V) return true;
-    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) {
+      for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i)
+        if (II->getArgOperand(i) == V) return true;
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
       if (CE->getOpcode() == Instruction::GetElementPtr ||
           CE->getOpcode() == Instruction::BitCast) {
         if (AnalyzeUsesOfPointer(CE, Readers, Writers))
@@ -267,12 +268,14 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
       } else {
         return true;
       }
-    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(*UI)) {
+    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
       if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
         return true;  // Allow comparison against null.
     } else {
       return true;
     }
+  }
+
   return false;
 }
 
@@ -291,7 +294,8 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
   // Walk the user list of the global. If we find anything other than a direct
   // load or store, bail out.
   for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){
-    if (LoadInst *LI = dyn_cast<LoadInst>(*I)) {
+    User *U = *I;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       // The pointer loaded from the global can only be used in simple ways:
       // we allow addressing of it and loading storing to it. We do *not* allow
       // storing the loaded pointer somewhere else or passing to a function.
@@ -299,7 +303,7 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
       if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters))
         return false;  // Loaded pointer escapes.
       // TODO: Could try some IP mod/ref of the loaded pointer.
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(*I)) {
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
       // Storing the global itself.
       if (SI->getOperand(0) == GV) return false;
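[Editor's note] The GlobalsModRef hunks above also switch argument scans from raw operand indices to getArgOperand/getNumArgOperands, the migration applied throughout this commit to hide where the callee lives in a call's operand list. A toy sketch of the index mapping being relied on; the "callee at operand 0" layout here is an assumption chosen for illustration, not a claim about LLVM's layout.

    #include <cassert>
    #include <vector>

    struct ToyCall {
      // Assumed layout for this sketch: operand 0 is the callee, the
      // arguments follow.
      std::vector<int> Operands;
      int getOperand(unsigned i) const { return Operands[i]; }
      int getArgOperand(unsigned i) const { return Operands[i + 1]; }
      unsigned getNumArgOperands() const { return Operands.size() - 1; }
    };

    int main() {
      ToyCall CI{{/*callee*/ 99, /*args*/ 1, 2, 3}};
      // Callers written against getArgOperand(0) keep working even if the
      // callee is later moved elsewhere in the operand list.
      assert(CI.getArgOperand(0) == 1 && CI.getOperand(1) == 1);
      assert(CI.getNumArgOperands() == 3);
    }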
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 98dbb69..b1df517 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -162,14 +162,14 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
       if (Function *F = CS.getCalledFunction()) {
         if (F->isDeclaration() &&
             (F->getName() == "setjmp" || F->getName() == "_setjmp"))
-          NeverInline = true;
+          callsSetJmp = true;
 
         // If this call is to function itself, then the function is recursive.
         // Inlining it into other functions is a bad idea, because this is
         // basically just a form of loop peeling, and our metrics aren't useful
         // for that case.
         if (F == BB->getParent())
-          NeverInline = true;
+          isRecursive = true;
       }
 
       if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) {
@@ -220,7 +220,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
   // jump would jump from the inlined copy of the function into the original
   // function which is extremely undefined behavior.
   if (isa<IndirectBrInst>(BB->getTerminator()))
-    NeverInline = true;
+    containsIndirectBr = true;
 
   // Remember NumInsts for this BB.
   NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB;
@@ -247,7 +247,7 @@ void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
 
   // Don't bother calculating argument weights if we are never going to inline
   // the function anyway.
-  if (Metrics.NeverInline)
+  if (NeverInline())
     return;
 
   // Check out all of the arguments to the function, figuring out how much
@@ -258,6 +258,14 @@ void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
                                       CountCodeReductionForAlloca(I)));
 }
 
+/// NeverInline - returns true if the function should never be inlined into
+/// any caller
+bool InlineCostAnalyzer::FunctionInfo::NeverInline()
+{
+  return (Metrics.callsSetJmp || Metrics.isRecursive ||
+          Metrics.containsIndirectBr);
+
+}
 // getInlineCost - The heuristic used to determine if we should inline the
 // function call or not.
 //
@@ -315,7 +323,7 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
     CalleeFI->analyzeFunction(Callee);
 
   // If we should never inline this, return a huge cost.
-  if (CalleeFI->Metrics.NeverInline)
+  if (CalleeFI->NeverInline())
     return InlineCost::getNever();
 
   // FIXME: It would be nice to kill off CalleeFI->NeverInline. Then we
@@ -443,10 +451,15 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
   }
 
   // Since CalleeMetrics were already calculated, we know that the CallerMetrics
-  // reference isn't invalidated: both were in the DenseMap.
-  CallerMetrics.NeverInline |= CalleeMetrics.NeverInline;
+  // reference isn't invalidated: both were in the DenseMap.
   CallerMetrics.usesDynamicAlloca |= CalleeMetrics.usesDynamicAlloca;
 
+  // FIXME: If any of these three are true for the callee, the callee was
+  // not inlined into the caller, so I think they're redundant here.
+  CallerMetrics.callsSetJmp |= CalleeMetrics.callsSetJmp;
+  CallerMetrics.isRecursive |= CalleeMetrics.isRecursive;
+  CallerMetrics.containsIndirectBr |= CalleeMetrics.containsIndirectBr;
+
   CallerMetrics.NumInsts += CalleeMetrics.NumInsts;
   CallerMetrics.NumBlocks += CalleeMetrics.NumBlocks;
   CallerMetrics.NumCalls += CalleeMetrics.NumCalls;
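[Editor's note] The InlineCost.cpp change above replaces the single opaque NeverInline bit with three specific facts and derives inlinability from them. A minimal standalone sketch of that refactor, using a toy metrics struct rather than the real CodeMetrics:

    #include <cassert>

    struct Metrics {
      bool callsSetJmp = false;
      bool isRecursive = false;
      bool containsIndirectBr = false;
    };

    // Inlinability is now a derived property, so each underlying reason
    // stays visible to other heuristics.
    static bool neverInline(const Metrics &M) {
      return M.callsSetJmp || M.isRecursive || M.containsIndirectBr;
    }

    int main() {
      Metrics M;
      assert(!neverInline(M));
      M.containsIndirectBr = true; // e.g. the function uses indirectbr
      assert(neverInline(M));
    }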
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index a031cbc..9f1b30d 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -19,7 +19,8 @@
 //
 // Another limitation is that it assumes all code will be executed. A store
 // through a null pointer in a basic block which is never reached is harmless,
-// but this pass will warn about it anyway.
+// but this pass will warn about it anyway. This is the main reason why most
+// of these checks live here instead of in the Verifier pass.
 //
 // Optimization passes may make conditions that this pass checks for more or
 // less obvious. If an optimization pass appears to be introducing a warning,
@@ -35,7 +36,11 @@
 
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Lint.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/Target/TargetData.h"
@@ -64,7 +69,8 @@ namespace {
     void visitFunction(Function &F);
 
     void visitCallSite(CallSite CS);
-    void visitMemoryReference(Instruction &I, Value *Ptr, unsigned Align,
+    void visitMemoryReference(Instruction &I, Value *Ptr,
+                              unsigned Size, unsigned Align,
                               const Type *Ty, unsigned Flags);
 
     void visitCallInst(CallInst &I);
@@ -88,9 +94,14 @@ namespace {
     void visitInsertElementInst(InsertElementInst &I);
     void visitUnreachableInst(UnreachableInst &I);
 
+    Value *findValue(Value *V, bool OffsetOk) const;
+    Value *findValueImpl(Value *V, bool OffsetOk,
+                         SmallPtrSet<Value *, 4> &Visited) const;
+
   public:
     Module *Mod;
     AliasAnalysis *AA;
+    DominatorTree *DT;
     TargetData *TD;
 
     std::string Messages;
@@ -104,6 +115,7 @@ namespace {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesAll();
       AU.addRequired<AliasAnalysis>();
+      AU.addRequired<DominatorTree>();
     }
     virtual void print(raw_ostream &O, const Module *M) const {}
 
@@ -176,6 +188,7 @@ X("lint", "Statically lint-checks LLVM IR", false, true);
 bool Lint::runOnFunction(Function &F) {
   Mod = F.getParent();
   AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
   TD = getAnalysisIfAvailable<TargetData>();
   visit(F);
   dbgs() << MessagesStr.str();
@@ -188,15 +201,17 @@ void Lint::visitFunction(Function &F) {
   // fairly common mistake to neglect to name a function.
   Assert1(F.hasName() || F.hasLocalLinkage(),
           "Unusual: Unnamed function with non-local linkage", &F);
+
+  // TODO: Check for irreducible control flow.
 }
 
 void Lint::visitCallSite(CallSite CS) {
   Instruction &I = *CS.getInstruction();
   Value *Callee = CS.getCalledValue();
 
-  visitMemoryReference(I, Callee, 0, 0, MemRef::Callee);
+  visitMemoryReference(I, Callee, ~0u, 0, 0, MemRef::Callee);
 
-  if (Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) {
+  if (Function *F = dyn_cast<Function>(findValue(Callee, /*OffsetOk=*/false))) {
     Assert1(CS.getCallingConv() == F->getCallingConv(),
             "Undefined behavior: Caller and callee calling convention differ",
             &I);
@@ -209,23 +224,53 @@ void Lint::visitCallSite(CallSite CS) {
             FT->getNumParams() == NumActualArgs,
             "Undefined behavior: Call argument count mismatches callee "
             "argument count", &I);
-
-    // TODO: Check argument types (in case the callee was casted)
-
-    // TODO: Check ABI-significant attributes.
-    // TODO: Check noalias attribute.
-    // TODO: Check sret attribute.
+    Assert1(FT->getReturnType() == I.getType(),
+            "Undefined behavior: Call return type mismatches "
+            "callee return type", &I);
+
+    // Check argument types (in case the callee was casted) and attributes.
+    // TODO: Verify that caller and callee attributes are compatible.
+    Function::arg_iterator PI = F->arg_begin(), PE = F->arg_end();
+    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+    for (; AI != AE; ++AI) {
+      Value *Actual = *AI;
+      if (PI != PE) {
+        Argument *Formal = PI++;
+        Assert1(Formal->getType() == Actual->getType(),
+                "Undefined behavior: Call argument type mismatches "
+                "callee parameter type", &I);
+
+        // Check that noalias arguments don't alias other arguments. The
+        // AliasAnalysis API isn't expressive enough for what we really want
+        // to do. Known partial overlap is not distinguished from the case
+        // where nothing is known.
+        if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy())
+          for (CallSite::arg_iterator BI = CS.arg_begin(); BI != AE; ++BI) {
+            Assert1(AI == BI ||
+                    AA->alias(*AI, ~0u, *BI, ~0u) != AliasAnalysis::MustAlias,
+                    "Unusual: noalias argument aliases another argument", &I);
+          }
+
+        // Check that an sret argument points to valid memory.
+        if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) {
+          const Type *Ty =
+            cast<PointerType>(Formal->getType())->getElementType();
+          visitMemoryReference(I, Actual, AA->getTypeStoreSize(Ty),
+                               TD ? TD->getABITypeAlignment(Ty) : 0,
+                               Ty, MemRef::Read | MemRef::Write);
+        }
+      }
+    }
   }
 
   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isTailCall())
     for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
          AI != AE; ++AI) {
-      Value *Obj = (*AI)->getUnderlyingObject();
-      Assert1(!isa<AllocaInst>(Obj) && !isa<VAArgInst>(Obj),
+      Value *Obj = findValue(*AI, /*OffsetOk=*/true);
+      Assert1(!isa<AllocaInst>(Obj),
               "Undefined behavior: Call with \"tail\" keyword references "
-              "alloca or va_arg", &I);
+              "alloca", &I);
     }
 
@@ -237,9 +282,10 @@ void Lint::visitCallSite(CallSite CS) {
     case Intrinsic::memcpy: {
       MemCpyInst *MCI = cast<MemCpyInst>(&I);
-      visitMemoryReference(I, MCI->getSource(), MCI->getAlignment(), 0,
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MCI->getDest(), ~0u, MCI->getAlignment(), 0,
                            MemRef::Write);
-      visitMemoryReference(I, MCI->getDest(), MCI->getAlignment(), 0,
+      visitMemoryReference(I, MCI->getSource(), ~0u, MCI->getAlignment(), 0,
                            MemRef::Read);
 
       // Check that the memcpy arguments don't overlap. The AliasAnalysis API
       // isn't expressive enough for what we really want to do. Known partial
      // overlap is not distinguished from the case where nothing is known.
       unsigned Size = 0;
       if (const ConstantInt *Len =
-            dyn_cast<ConstantInt>(MCI->getLength()->stripPointerCasts()))
+            dyn_cast<ConstantInt>(findValue(MCI->getLength(),
+                                            /*OffsetOk=*/false)))
         if (Len->getValue().isIntN(32))
           Size = Len->getValue().getZExtValue();
       Assert1(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) !=
               AliasAnalysis::MustAlias,
               "Undefined behavior: memcpy source and destination overlap", &I);
       break;
     }
     case Intrinsic::memmove: {
       MemMoveInst *MMI = cast<MemMoveInst>(&I);
-      visitMemoryReference(I, MMI->getSource(), MMI->getAlignment(), 0,
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MMI->getDest(), ~0u, MMI->getAlignment(), 0,
                            MemRef::Write);
-      visitMemoryReference(I, MMI->getDest(), MMI->getAlignment(), 0,
+      visitMemoryReference(I, MMI->getSource(), ~0u, MMI->getAlignment(), 0,
                            MemRef::Read);
       break;
     }
     case Intrinsic::memset: {
       MemSetInst *MSI = cast<MemSetInst>(&I);
-      visitMemoryReference(I, MSI->getDest(), MSI->getAlignment(), 0,
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MSI->getDest(), ~0u, MSI->getAlignment(), 0,
                            MemRef::Write);
       break;
     }
 
     case Intrinsic::vastart:
       Assert1(I.getParent()->getParent()->isVarArg(),
               "Undefined behavior: va_start called in a non-varargs function",
               &I);
 
-      visitMemoryReference(I, CS.getArgument(0), 0, 0,
+      visitMemoryReference(I, CS.getArgument(0), ~0u, 0, 0,
                            MemRef::Read | MemRef::Write);
       break;
     case Intrinsic::vacopy:
-      visitMemoryReference(I, CS.getArgument(0), 0, 0, MemRef::Write);
-      visitMemoryReference(I, CS.getArgument(1), 0, 0, MemRef::Read);
+      visitMemoryReference(I, CS.getArgument(0), ~0u, 0, 0, MemRef::Write);
+      visitMemoryReference(I, CS.getArgument(1), ~0u, 0, 0, MemRef::Read);
       break;
     case Intrinsic::vaend:
-      visitMemoryReference(I, CS.getArgument(0), 0, 0,
+      visitMemoryReference(I, CS.getArgument(0), ~0u, 0, 0,
                            MemRef::Read | MemRef::Write);
       break;
 
     case Intrinsic::stackrestore:
       // Stackrestore doesn't read or write memory, but it sets the
       // stack pointer, which the compiler may read from or write to
       // at any time, so check it for both readability and writeability.
-      visitMemoryReference(I, CS.getArgument(0), 0, 0,
+      visitMemoryReference(I, CS.getArgument(0), ~0u, 0, 0,
                            MemRef::Read | MemRef::Write);
       break;
     }
@@ -310,17 +359,35 @@ void Lint::visitReturnInst(ReturnInst &I) {
   Assert1(!F->doesNotReturn(),
           "Unusual: Return statement in function with noreturn attribute",
           &I);
+
+  if (Value *V = I.getReturnValue()) {
+    Value *Obj = findValue(V, /*OffsetOk=*/true);
+    Assert1(!isa<AllocaInst>(Obj),
+            "Unusual: Returning alloca value", &I);
+  }
 }
 
-// TODO: Add a length argument and check that the reference is in bounds
+// TODO: Check that the reference is in bounds.
+// TODO: Check readnone/readonly function attributes.
 void Lint::visitMemoryReference(Instruction &I,
-                                Value *Ptr, unsigned Align, const Type *Ty,
-                                unsigned Flags) {
-  Value *UnderlyingObject = Ptr->getUnderlyingObject();
+                                Value *Ptr, unsigned Size, unsigned Align,
+                                const Type *Ty, unsigned Flags) {
+  // If no memory is being referenced, it doesn't matter if the pointer
+  // is valid.
+  if (Size == 0)
+    return;
+
+  Value *UnderlyingObject = findValue(Ptr, /*OffsetOk=*/true);
   Assert1(!isa<ConstantPointerNull>(UnderlyingObject),
           "Undefined behavior: Null pointer dereference", &I);
   Assert1(!isa<UndefValue>(UnderlyingObject),
           "Undefined behavior: Undef pointer dereference", &I);
+  Assert1(!isa<ConstantInt>(UnderlyingObject) ||
+          !cast<ConstantInt>(UnderlyingObject)->isAllOnesValue(),
+          "Unusual: All-ones pointer dereference", &I);
+  Assert1(!isa<ConstantInt>(UnderlyingObject) ||
+          !cast<ConstantInt>(UnderlyingObject)->isOne(),
+          "Unusual: Address one pointer dereference", &I);
 
   if (Flags & MemRef::Write) {
     if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(UnderlyingObject))
@@ -361,13 +428,16 @@ void Lint::visitMemoryReference(Instruction &I,
 }
 
 void Lint::visitLoadInst(LoadInst &I) {
-  visitMemoryReference(I, I.getPointerOperand(), I.getAlignment(), I.getType(),
-                       MemRef::Read);
+  visitMemoryReference(I, I.getPointerOperand(),
+                       AA->getTypeStoreSize(I.getType()), I.getAlignment(),
+                       I.getType(), MemRef::Read);
 }
 
 void Lint::visitStoreInst(StoreInst &I) {
-  visitMemoryReference(I, I.getPointerOperand(), I.getAlignment(),
-                       I.getOperand(0)->getType(), MemRef::Write);
+  visitMemoryReference(I, I.getPointerOperand(),
+                       AA->getTypeStoreSize(I.getOperand(0)->getType()),
+                       I.getAlignment(),
+                       I.getOperand(0)->getType(), MemRef::Write);
 }
 
 void Lint::visitXor(BinaryOperator &I) {
@@ -384,21 +454,21 @@ void Lint::visitSub(BinaryOperator &I) {
 
 void Lint::visitLShr(BinaryOperator &I) {
   if (ConstantInt *CI =
-        dyn_cast<ConstantInt>(I.getOperand(1)->stripPointerCasts()))
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
     Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
             "Undefined result: Shift count out of range", &I);
 }
 
 void Lint::visitAShr(BinaryOperator &I) {
   if (ConstantInt *CI =
-        dyn_cast<ConstantInt>(I.getOperand(1)->stripPointerCasts()))
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
     Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
             "Undefined result: Shift count out of range", &I);
 }
 
 void Lint::visitShl(BinaryOperator &I) {
   if (ConstantInt *CI =
-        dyn_cast<ConstantInt>(I.getOperand(1)->stripPointerCasts()))
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
     Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
             "Undefined result: Shift count out of range", &I);
 }
@@ -439,27 +509,31 @@ void Lint::visitAllocaInst(AllocaInst &I) {
   // This isn't undefined behavior, it's just an obvious pessimization.
   Assert1(&I.getParent()->getParent()->getEntryBlock() == I.getParent(),
           "Pessimization: Static alloca outside of entry block", &I);
+
+  // TODO: Check for an unusual size (MSB set?)
 }
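[Editor's note] Besides null and undef, visitMemoryReference above now flags dereferences of the constant addresses -1 and 1, which almost always indicate broken or hand-mangled IR. A toy version of that address screen, with plain integers standing in for ConstantInt pointers:

    #include <cassert>
    #include <cstdint>

    static bool isSuspiciousConstantAddress(uint64_t Addr, unsigned PtrBits) {
      uint64_t AllOnes = PtrBits == 64 ? ~0ULL : ((1ULL << PtrBits) - 1);
      // Null, address one, and all-ones are the addresses Lint warns about.
      return Addr == 0 || Addr == 1 || Addr == AllOnes;
    }

    int main() {
      assert(isSuspiciousConstantAddress(0, 64));
      assert(isSuspiciousConstantAddress(1, 64));
      assert(isSuspiciousConstantAddress(~0ULL, 64));
      assert(!isSuspiciousConstantAddress(0x1000, 64));
    }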
 
 void Lint::visitVAArgInst(VAArgInst &I) {
-  visitMemoryReference(I, I.getOperand(0), 0, 0,
+  visitMemoryReference(I, I.getOperand(0), ~0u, 0, 0,
                        MemRef::Read | MemRef::Write);
 }
 
 void Lint::visitIndirectBrInst(IndirectBrInst &I) {
-  visitMemoryReference(I, I.getAddress(), 0, 0, MemRef::Branchee);
+  visitMemoryReference(I, I.getAddress(), ~0u, 0, 0, MemRef::Branchee);
 }
 
 void Lint::visitExtractElementInst(ExtractElementInst &I) {
   if (ConstantInt *CI =
-        dyn_cast<ConstantInt>(I.getIndexOperand()->stripPointerCasts()))
+        dyn_cast<ConstantInt>(findValue(I.getIndexOperand(),
+                                        /*OffsetOk=*/false)))
     Assert1(CI->getValue().ult(I.getVectorOperandType()->getNumElements()),
             "Undefined result: extractelement index out of range", &I);
 }
 
 void Lint::visitInsertElementInst(InsertElementInst &I) {
   if (ConstantInt *CI =
-        dyn_cast<ConstantInt>(I.getOperand(2)->stripPointerCasts()))
+        dyn_cast<ConstantInt>(findValue(I.getOperand(2),
+                                        /*OffsetOk=*/false)))
     Assert1(CI->getValue().ult(I.getType()->getNumElements()),
             "Undefined result: insertelement index out of range", &I);
 }
@@ -472,6 +546,91 @@ void Lint::visitUnreachableInst(UnreachableInst &I) {
           "side effects", &I);
 }
 
+/// findValue - Look through bitcasts and simple memory reference patterns
+/// to identify an equivalent, but more informative, value. If OffsetOk
+/// is true, look through getelementptrs with non-zero offsets too.
+///
+/// Most analysis passes don't require this logic, because instcombine
+/// will simplify most of these kinds of things away. But it's a goal of
+/// this Lint pass to be useful even on non-optimized IR.
+Value *Lint::findValue(Value *V, bool OffsetOk) const {
+  SmallPtrSet<Value *, 4> Visited;
+  return findValueImpl(V, OffsetOk, Visited);
+}
+
+/// findValueImpl - Implementation helper for findValue.
+Value *Lint::findValueImpl(Value *V, bool OffsetOk,
+                           SmallPtrSet<Value *, 4> &Visited) const {
+  // Detect self-referential values.
+  if (!Visited.insert(V))
+    return UndefValue::get(V->getType());
+
+  // TODO: Look through sext or zext cast, when the result is known to
+  // be interpreted as signed or unsigned, respectively.
+  // TODO: Look through eliminable cast pairs.
+  // TODO: Look through calls with unique return values.
+  // TODO: Look through vector insert/extract/shuffle.
+  V = OffsetOk ? V->getUnderlyingObject() : V->stripPointerCasts();
+  if (LoadInst *L = dyn_cast<LoadInst>(V)) {
+    BasicBlock::iterator BBI = L;
+    BasicBlock *BB = L->getParent();
+    SmallPtrSet<BasicBlock *, 4> VisitedBlocks;
+    for (;;) {
+      if (!VisitedBlocks.insert(BB)) break;
+      if (Value *U = FindAvailableLoadedValue(L->getPointerOperand(),
+                                              BB, BBI, 6, AA))
+        return findValueImpl(U, OffsetOk, Visited);
+      if (BBI != BB->begin()) break;
+      BB = BB->getUniquePredecessor();
+      if (!BB) break;
+      BBI = BB->end();
+    }
+  } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    if (Value *W = PN->hasConstantValue(DT))
+      return findValueImpl(W, OffsetOk, Visited);
+  } else if (CastInst *CI = dyn_cast<CastInst>(V)) {
+    if (CI->isNoopCast(TD ? TD->getIntPtrType(V->getContext()) :
+                            Type::getInt64Ty(V->getContext())))
+      return findValueImpl(CI->getOperand(0), OffsetOk, Visited);
+  } else if (ExtractValueInst *Ex = dyn_cast<ExtractValueInst>(V)) {
+    if (Value *W = FindInsertedValue(Ex->getAggregateOperand(),
+                                     Ex->idx_begin(),
+                                     Ex->idx_end()))
+      if (W != V)
+        return findValueImpl(W, OffsetOk, Visited);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    // Same as above, but for ConstantExpr instead of Instruction.
+    if (Instruction::isCast(CE->getOpcode())) {
+      if (CastInst::isNoopCast(Instruction::CastOps(CE->getOpcode()),
+                               CE->getOperand(0)->getType(),
+                               CE->getType(),
+                               TD ? TD->getIntPtrType(V->getContext()) :
+                                    Type::getInt64Ty(V->getContext())))
+        return findValueImpl(CE->getOperand(0), OffsetOk, Visited);
+    } else if (CE->getOpcode() == Instruction::ExtractValue) {
+      const SmallVector<unsigned, 4> &Indices = CE->getIndices();
+      if (Value *W = FindInsertedValue(CE->getOperand(0),
+                                       Indices.begin(),
+                                       Indices.end()))
+        if (W != V)
+          return findValueImpl(W, OffsetOk, Visited);
+    }
+  }
+
+  // As a last resort, try SimplifyInstruction or constant folding.
+  if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+    if (Value *W = SimplifyInstruction(Inst, TD))
+      if (W != Inst)
+        return findValueImpl(W, OffsetOk, Visited);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (Value *W = ConstantFoldConstantExpression(CE, TD))
+      if (W != V)
+        return findValueImpl(W, OffsetOk, Visited);
+  }
+
+  return V;
+}
+
 //===----------------------------------------------------------------------===//
 // Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//
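[Editor's note] findValue's load case leans on FindAvailableLoadedValue (defined in the new Loads.cpp below) to see constants through memory even in unoptimized IR. A toy store-to-load forwarding model of the idea, with a flat map standing in for the backward basic-block scan; all names here are illustrative:

    #include <cassert>
    #include <map>

    typedef int Address;
    typedef long ToyValue;

    static std::map<Address, ToyValue> Memory; // last stored value per address

    static void store(Address A, ToyValue V) { Memory[A] = V; }

    static bool findAvailableLoadedValue(Address A, ToyValue &Out) {
      std::map<Address, ToyValue>::iterator I = Memory.find(A);
      if (I == Memory.end())
        return false; // nothing seen; a real load would be needed
      Out = I->second; // forward the stored value, skipping the load
      return true;
    }

    int main() {
      store(42, 7);
      ToyValue V;
      assert(findAvailableLoadedValue(42, V) && V == 7);
    }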
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
new file mode 100644
index 0000000..2ba1d86
--- /dev/null
+++ b/lib/Analysis/Loads.cpp
@@ -0,0 +1,235 @@
+//===- Loads.cpp - Local load analysis ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines simple local analyses for load instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IntrinsicInst.h"
+using namespace llvm;
+
+/// AreEquivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+  // Use isIdenticalToWhenDefined instead of isIdenticalTo because
+  // this function is only used when one address use dominates the
+  // other, which means that they'll always either have the same
+  // value or one of them will have an undefined value.
+  if (isa<BinaryOperator>(A) || isa<CastInst>(A) ||
+      isa<PHINode>(A) || isa<GetElementPtrInst>(A))
+    if (const Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+        return true;
+
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+/// getUnderlyingObjectWithOffset - Strip off up to MaxLookup GEPs and
+/// bitcasts to get back to the underlying object being addressed, keeping
+/// track of the offset in bytes from the GEPs relative to the result.
+/// This is closely related to Value::getUnderlyingObject but is located
+/// here to avoid making VMCore depend on TargetData.
+static Value *getUnderlyingObjectWithOffset(Value *V, const TargetData *TD,
+                                            uint64_t &ByteOffset,
+                                            unsigned MaxLookup = 6) {
+  if (!V->getType()->isPointerTy())
+    return V;
+  for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (!GEP->hasAllConstantIndices())
+        return V;
+      SmallVector<Value*, 8> Indices(GEP->op_begin() + 1, GEP->op_end());
+      ByteOffset += TD->getIndexedOffset(GEP->getPointerOperandType(),
+                                         &Indices[0], Indices.size());
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      if (GA->mayBeOverridden())
+        return V;
+      V = GA->getAliasee();
+    } else {
+      return V;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  }
+  return V;
+}
+
+/// isSafeToLoadUnconditionally - Return true if we know that executing a load
+/// from this value cannot trap. If it is not obviously safe to load from the
+/// specified pointer, we do a quick local scan of the basic block containing
+/// ScanFrom, to determine if the address is already accessed.
+bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
+                                       unsigned Align, const TargetData *TD) {
+  uint64_t ByteOffset = 0;
+  Value *Base = V;
+  if (TD)
+    Base = getUnderlyingObjectWithOffset(V, TD, ByteOffset);
+
+  const Type *BaseType = 0;
+  unsigned BaseAlign = 0;
+  if (const AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
+    // An alloca is safe to load from as load as it is suitably aligned.
+    BaseType = AI->getAllocatedType();
+    BaseAlign = AI->getAlignment();
+  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(Base)) {
+    // Global variables are safe to load from but their size cannot be
+    // guaranteed if they are overridden.
+    if (!isa<GlobalAlias>(GV) && !GV->mayBeOverridden()) {
+      BaseType = GV->getType()->getElementType();
+      BaseAlign = GV->getAlignment();
+    }
+  }
+
+  if (BaseType && BaseType->isSized()) {
+    if (TD && BaseAlign == 0)
+      BaseAlign = TD->getPrefTypeAlignment(BaseType);
+
+    if (Align <= BaseAlign) {
+      if (!TD)
+        return true; // Loading directly from an alloca or global is OK.
+
+      // Check if the load is within the bounds of the underlying object.
+      const PointerType *AddrTy = cast<PointerType>(V->getType());
+      uint64_t LoadSize = TD->getTypeStoreSize(AddrTy->getElementType());
+      if (ByteOffset + LoadSize <= TD->getTypeAllocSize(BaseType) &&
+          (Align == 0 || (ByteOffset % Align) == 0))
+        return true;
+    }
+  }
+
+  // Otherwise, be a little bit aggressive by scanning the local block where we
+  // want to check to see if the pointer is already being loaded or stored
+  // from/to. If so, the previous load or store would have already trapped,
+  // so there is no harm doing an extra load (also, CSE will later eliminate
+  // the load entirely).
+  BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
+
+  while (BBI != E) {
+    --BBI;
+
+    // If we see a free or a call which may write to memory (i.e. which might do
+    // a free) the pointer could be marked invalid.
+    if (isa<CallInst>(BBI) && BBI->mayWriteToMemory() &&
+        !isa<DbgInfoIntrinsic>(BBI))
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (AreEquivalentAddressValues(LI->getOperand(0), V)) return true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+      if (AreEquivalentAddressValues(SI->getOperand(1), V)) return true;
+    }
+  }
+  return false;
+}
+
+/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the
+/// instruction before ScanFrom) checking to see if we have the value at the
+/// memory address *Ptr locally available within a small number of instructions.
+/// If the value is available, return it.
+///
+/// If not, return the iterator for the last validated instruction that the
+/// value would be live through. If we scanned the entire block and didn't find
+/// something that invalidates *Ptr or provides it, ScanFrom would be left at
+/// begin() and this returns null. ScanFrom could also be left
+///
+/// MaxInstsToScan specifies the maximum instructions to scan in the block. If
+/// it is set to 0, it will scan the whole block. You can also optionally
+/// specify an alias analysis implementation, which makes this more precise.
+Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
+                                      BasicBlock::iterator &ScanFrom,
+                                      unsigned MaxInstsToScan,
+                                      AliasAnalysis *AA) {
+  if (MaxInstsToScan == 0) MaxInstsToScan = ~0U;
+
+  // If we're using alias analysis to disambiguate get the size of *Ptr.
+  unsigned AccessSize = 0;
+  if (AA) {
+    const Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+    AccessSize = AA->getTypeStoreSize(AccessTy);
+  }
+
+  while (ScanFrom != ScanBB->begin()) {
+    // We must ignore debug info directives when counting (otherwise they
+    // would affect codegen).
+    Instruction *Inst = --ScanFrom;
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    // Restore ScanFrom to expected value in case next test succeeds
+    ScanFrom++;
+
+    // Don't scan huge blocks.
+    if (MaxInstsToScan-- == 0) return 0;
+
+    --ScanFrom;
+    // If this is a load of Ptr, the loaded value is available.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      if (AreEquivalentAddressValues(LI->getOperand(0), Ptr))
+        return LI;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // If this is a store through Ptr, the value is available!
+      if (AreEquivalentAddressValues(SI->getOperand(1), Ptr))
+        return SI->getOperand(0);
+
+      // If Ptr is an alloca and this is a store to a different alloca, ignore
+      // the store. This is a trivial form of alias analysis that is important
+      // for reg2mem'd code.
+      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+          (isa<AllocaInst>(SI->getOperand(1)) ||
+           isa<GlobalVariable>(SI->getOperand(1))))
+        continue;
+
+      // If we have alias analysis and it says the store won't modify the loaded
+      // value, ignore the store.
+      if (AA &&
+          (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+
+      // Otherwise the store that may or may not alias the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+
+    // If this is some other instruction that may clobber Ptr, bail out.
+    if (Inst->mayWriteToMemory()) {
+      // If alias analysis claims that it really won't modify the load,
+      // ignore it.
+      if (AA &&
+          (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+
+      // May modify the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+  }
+
+  // Got to the start of the block, we didn't find it, but are done for this
+  // block.
+  return 0;
+}
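[Editor's note] A hedged usage sketch for the new Loads.cpp entry point, following the FindAvailableLoadedValue signature visible in the diff; the wrapper function and its name are illustrative only.

    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/Loads.h"
    #include "llvm/Instructions.h"

    using namespace llvm;

    // Try to replace a load with a value already available in its block.
    static Value *tryForwardLoad(LoadInst *LI, AliasAnalysis *AA) {
      BasicBlock::iterator ScanFrom = LI;
      // Scan backwards at most 6 instructions for an earlier load of, or
      // store to, the same address; returns the available value or null.
      return FindAvailableLoadedValue(LI->getPointerOperand(),
                                      LI->getParent(), ScanFrom, 6, AA);
    }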
const FunctionType *FTy = Callee->getFunctionType(); if (!FTy->getReturnType()->isVoidTy()) - return false; + return 0; if (FTy->getNumParams() != 1) - return false; + return 0; if (FTy->param_begin()->get() != Type::getInt8PtrTy(Callee->getContext())) - return false; + return 0; - return true; + return CI; } diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 2aa2f17..1f54d74 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -116,8 +116,8 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, } else if (VAArgInst *V = dyn_cast<VAArgInst>(Inst)) { Pointer = V->getOperand(0); PointerSize = AA->getTypeStoreSize(V->getType()); - } else if (isFreeCall(Inst)) { - Pointer = Inst->getOperand(1); + } else if (const CallInst *CI = isFreeCall(Inst)) { + Pointer = CI->getArgOperand(0); // calls to free() erase the entire structure PointerSize = ~0ULL; } else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) { @@ -197,9 +197,9 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad, // pointer, not on query pointers that are indexed off of them. It'd // be nice to handle that at some point. AliasAnalysis::AliasResult R = - AA->alias(II->getOperand(3), ~0U, MemPtr, ~0U); + AA->alias(II->getArgOperand(2), ~0U, MemPtr, ~0U); if (R == AliasAnalysis::MustAlias) { - InvariantTag = II->getOperand(1); + InvariantTag = II->getArgOperand(0); continue; } @@ -210,7 +210,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad, // pointer, not on query pointers that are indexed off of them. It'd // be nice to handle that at some point. AliasAnalysis::AliasResult R = - AA->alias(II->getOperand(2), ~0U, MemPtr, ~0U); + AA->alias(II->getArgOperand(1), ~0U, MemPtr, ~0U); if (R == AliasAnalysis::MustAlias) return MemDepResult::getDef(II); } @@ -365,25 +365,26 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { MemPtr = LI->getPointerOperand(); MemSize = AA->getTypeStoreSize(LI->getType()); } - } else if (isFreeCall(QueryInst)) { - MemPtr = QueryInst->getOperand(1); + } else if (const CallInst *CI = isFreeCall(QueryInst)) { + MemPtr = CI->getArgOperand(0); // calls to free() erase the entire structure, not just a field. MemSize = ~0UL; } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) { int IntrinsicID = 0; // Intrinsic IDs start at 1. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst)) + IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst); + if (II) IntrinsicID = II->getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::invariant_start: - MemPtr = QueryInst->getOperand(2); - MemSize = cast<ConstantInt>(QueryInst->getOperand(1))->getZExtValue(); + MemPtr = II->getArgOperand(1); + MemSize = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); break; case Intrinsic::invariant_end: - MemPtr = QueryInst->getOperand(3); - MemSize = cast<ConstantInt>(QueryInst->getOperand(2))->getZExtValue(); + MemPtr = II->getArgOperand(2); + MemSize = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(); break; default: CallSite QueryCS = CallSite::get(QueryInst); @@ -456,7 +457,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { // Okay, we have a cache entry. If we know it is not dirty, just return it // with no computation. 
if (!CacheP.second) { - NumCacheNonLocal++; + ++NumCacheNonLocal; return Cache; } @@ -478,7 +479,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { BasicBlock *QueryBB = QueryCS.getInstruction()->getParent(); for (BasicBlock **PI = PredCache->GetPreds(QueryBB); *PI; ++PI) DirtyBlocks.push_back(*PI); - NumUncacheNonLocal++; + ++NumUncacheNonLocal; } // isReadonlyCall - If this is a read-only call, we can be more aggressive. diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp index f0f3a05..7354afa 100644 --- a/lib/Analysis/PostDominators.cpp +++ b/lib/Analysis/PostDominators.cpp @@ -67,10 +67,11 @@ PostDominanceFrontier::calculate(const PostDominatorTree &DT, if (BB) for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB); SI != SE; ++SI) { + BasicBlock *P = *SI; // Does Node immediately dominate this predecessor? - DomTreeNode *SINode = DT[*SI]; + DomTreeNode *SINode = DT[P]; if (SINode && SINode->getIDom() != Node) - S.insert(*SI); + S.insert(P); } // At this point, S is DFlocal. Now we union in DFup's of our children... diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp index 662576e..38dcd25 100644 --- a/lib/Analysis/ProfileInfo.cpp +++ b/lib/Analysis/ProfileInfo.cpp @@ -577,8 +577,6 @@ static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::s template<> bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) { - bool hasNoSuccessors = false; - double inWeight = 0; std::set<Edge> inMissing; std::set<const BasicBlock*> ProcessedPreds; @@ -596,10 +594,8 @@ bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *B std::set<Edge> outMissing; std::set<const BasicBlock*> ProcessedSuccs; succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB); - if (sbbi == sbbe) { + if (sbbi == sbbe) readEdge(this,getEdge(BB,0),outWeight,outMissing); - hasNoSuccessors = true; - } for ( ; sbbi != sbbe; ++sbbi ) { if (ProcessedSuccs.insert(*sbbi).second) { readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing); diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 6870268..413b3b4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -822,7 +822,8 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty))); + cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), + getEffectiveSCEVType(Ty)))); // trunc(trunc(x)) --> trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) @@ -844,9 +845,9 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, return getAddRecExpr(Operands, AddRec->getLoop()); } - // The cast wasn't folded; create an explicit cast node. - // Recompute the insert position, as it may have been invalidated. - if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; + // The cast wasn't folded; create an explicit cast node. We can reuse + // the existing insert position since if we get here, we won't have + // made any changes which would invalidate it. SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator), Op, Ty); UniqueSCEVs.InsertNode(S, IP); @@ -862,12 +863,10 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Ty = getEffectiveSCEVType(Ty); // Fold if the operand is constant. 
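  // For example (sketch): zero-extending a constant i8 42 to i64 now folds
  // in one step,
  //
  //   getZeroExtendExpr(getConstant(i8 42), i64) == getConstant(i64 42)
  //
  // The IntToPtr special case below is dropped because getEffectiveSCEVType
  // already maps pointer types to their integer equivalent, so the fold
  // always yields a ConstantInt of that effective type.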
- if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) { - const Type *IntTy = getEffectiveSCEVType(Ty); - Constant *C = ConstantExpr::getZExt(SC->getValue(), IntTy); - if (IntTy != Ty) C = ConstantExpr::getIntToPtr(C, Ty); - return getConstant(cast<ConstantInt>(C)); - } + if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) + return getConstant( + cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), + getEffectiveSCEVType(Ty)))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) @@ -997,12 +996,10 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Ty = getEffectiveSCEVType(Ty); // Fold if the operand is constant. - if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) { - const Type *IntTy = getEffectiveSCEVType(Ty); - Constant *C = ConstantExpr::getSExt(SC->getValue(), IntTy); - if (IntTy != Ty) C = ConstantExpr::getIntToPtr(C, Ty); - return getConstant(cast<ConstantInt>(C)); - } + if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) + return getConstant( + cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), + getEffectiveSCEVType(Ty)))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op)) @@ -1208,8 +1205,19 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, ScalarEvolution &SE) { bool Interesting = false; - // Iterate over the add operands. - for (unsigned i = 0, e = NumOperands; i != e; ++i) { + // Iterate over the add operands. They are sorted, with constants first. + unsigned i = 0; + while (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) { + ++i; + // Pull a buried constant out to the outside. + if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) + Interesting = true; + AccumulatedConstant += Scale * C->getValue()->getValue(); + } + + // Next comes everything else. We're especially interested in multiplies + // here, but they're in the middle, so just visit the rest with one loop. + for (; i != NumOperands; ++i) { const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[i]); if (Mul && isa<SCEVConstant>(Mul->getOperand(0))) { APInt NewScale = @@ -1237,11 +1245,6 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, Interesting = true; } } - } else if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) { - // Pull a buried constant out to the outside. - if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) - Interesting = true; - AccumulatedConstant += Scale * C->getValue()->getValue(); } else { // An ordinary operand. Update the map. std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair = @@ -1275,9 +1278,9 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, assert(!Ops.empty() && "Cannot get empty add!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG + const Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) - assert(getEffectiveSCEVType(Ops[i]->getType()) == - getEffectiveSCEVType(Ops[0]->getType()) && + assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVAddExpr operand types don't match!"); #endif @@ -1400,8 +1403,8 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) { // If we have an add, expand the add operands onto the end of the operands // list. 
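  // This loop flattens nested adds one level per iteration, e.g. (sketch):
  //
  //   (a + (b + c) + d)  -->  (a + d + b + c)
  //
  // The textual operand order changes, but getAddExpr re-sorts operands into
  // canonical order afterwards. Erasing Ops[Idx] before appending lets the
  // loop condition re-examine whatever element slides into Ops[Idx].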
- Ops.insert(Ops.end(), Add->op_begin(), Add->op_end()); Ops.erase(Ops.begin()+Idx); + Ops.append(Add->op_begin(), Add->op_end()); DeletedAdd = true; } @@ -1549,9 +1552,11 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, AddRec->op_end()); AddRecOps[0] = getAddExpr(LIOps); - // It's tempting to propagate NUW/NSW flags here, but nuw/nsw addition - // is not associative so this isn't necessarily safe. - const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop); + // Build the new addrec. Propagate the NUW and NSW flags if both the + // outer add and the inner addrec are guaranteed to have no overflow. + const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop, + HasNUW && AddRec->hasNoUnsignedWrap(), + HasNSW && AddRec->hasNoSignedWrap()); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; @@ -1578,7 +1583,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, AddRec->op_end()); for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { if (i >= NewOps.size()) { - NewOps.insert(NewOps.end(), OtherAddRec->op_begin()+i, + NewOps.append(OtherAddRec->op_begin()+i, OtherAddRec->op_end()); break; } @@ -1711,8 +1716,8 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, while (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[Idx])) { // If we have an mul, expand the mul operands onto the end of the operands // list. - Ops.insert(Ops.end(), Mul->op_begin(), Mul->op_end()); Ops.erase(Ops.begin()+Idx); + Ops.append(Mul->op_begin(), Mul->op_end()); DeletedMul = true; } @@ -1747,23 +1752,15 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, // NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step} SmallVector<const SCEV *, 4> NewOps; NewOps.reserve(AddRec->getNumOperands()); - if (LIOps.size() == 1) { - const SCEV *Scale = LIOps[0]; - for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) - NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i))); - } else { - for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { - SmallVector<const SCEV *, 4> MulOps(LIOps.begin(), LIOps.end()); - MulOps.push_back(AddRec->getOperand(i)); - NewOps.push_back(getMulExpr(MulOps)); - } - } + const SCEV *Scale = getMulExpr(LIOps); + for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) + NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i))); - // It's tempting to propagate the NSW flag here, but nsw multiplication - // is not associative so this isn't necessarily safe. + // Build the new addrec. Propagate the NUW and NSW flags if both the + // outer mul and the inner addrec are guaranteed to have no overflow. const SCEV *NewRec = getAddRecExpr(NewOps, AddRec->getLoop(), HasNUW && AddRec->hasNoUnsignedWrap(), - /*HasNSW=*/false); + HasNSW && AddRec->hasNoSignedWrap()); // If all of the other operands were loop invariant, we are done. 
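  // Example of the flag propagation now permitted (sketch): when both the
  // outer multiply and the inner addrec carry no-unsigned-wrap,
  //
  //   2<nuw> * {0,+,4}<nuw>  -->  {0,+,8}<nuw>
  //
  // If either side lacks the flag, none is transferred, since nuw/nsw
  // arithmetic is not associative in general.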
if (Ops.size() == 1) return NewRec; @@ -1942,8 +1939,7 @@ const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, Operands.push_back(Start); if (const SCEVAddRecExpr *StepChrec = dyn_cast<SCEVAddRecExpr>(Step)) if (StepChrec->getLoop() == L) { - Operands.insert(Operands.end(), StepChrec->op_begin(), - StepChrec->op_end()); + Operands.append(StepChrec->op_begin(), StepChrec->op_end()); return getAddRecExpr(Operands, L); } @@ -2106,8 +2102,8 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { if (Idx < Ops.size()) { bool DeletedSMax = false; while (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(Ops[Idx])) { - Ops.insert(Ops.end(), SMax->op_begin(), SMax->op_end()); Ops.erase(Ops.begin()+Idx); + Ops.append(SMax->op_begin(), SMax->op_end()); DeletedSMax = true; } @@ -2211,8 +2207,8 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { if (Idx < Ops.size()) { bool DeletedUMax = false; while (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(Ops[Idx])) { - Ops.insert(Ops.end(), UMax->op_begin(), UMax->op_end()); Ops.erase(Ops.begin()+Idx); + Ops.append(UMax->op_begin(), UMax->op_end()); DeletedUMax = true; } @@ -2278,7 +2274,8 @@ const SCEV *ScalarEvolution::getSizeOfExpr(const Type *AllocTy) { Constant *C = ConstantExpr::getSizeOf(AllocTy); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - C = ConstantFoldConstantExpression(CE, TD); + if (Constant *Folded = ConstantFoldConstantExpression(CE, TD)) + C = Folded; const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy)); return getTruncateOrZeroExtend(getSCEV(C), Ty); } @@ -2286,7 +2283,8 @@ const SCEV *ScalarEvolution::getSizeOfExpr(const Type *AllocTy) { const SCEV *ScalarEvolution::getAlignOfExpr(const Type *AllocTy) { Constant *C = ConstantExpr::getAlignOf(AllocTy); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - C = ConstantFoldConstantExpression(CE, TD); + if (Constant *Folded = ConstantFoldConstantExpression(CE, TD)) + C = Folded; const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy)); return getTruncateOrZeroExtend(getSCEV(C), Ty); } @@ -2302,7 +2300,8 @@ const SCEV *ScalarEvolution::getOffsetOfExpr(const StructType *STy, Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - C = ConstantFoldConstantExpression(CE, TD); + if (Constant *Folded = ConstantFoldConstantExpression(CE, TD)) + C = Folded; const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy)); return getTruncateOrZeroExtend(getSCEV(C), Ty); } @@ -2311,7 +2310,8 @@ const SCEV *ScalarEvolution::getOffsetOfExpr(const Type *CTy, Constant *FieldNo) { Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - C = ConstantFoldConstantExpression(CE, TD); + if (Constant *Folded = ConstantFoldConstantExpression(CE, TD)) + C = Folded; const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy)); return getTruncateOrZeroExtend(getSCEV(C), Ty); } @@ -2398,13 +2398,6 @@ const SCEV *ScalarEvolution::getSCEV(Value *V) { return S; } -/// getIntegerSCEV - Given a SCEVable type, create a constant for the -/// specified signed integer value and return a SCEV for the constant. 
-const SCEV *ScalarEvolution::getIntegerSCEV(int64_t Val, const Type *Ty) { - const IntegerType *ITy = cast<IntegerType>(getEffectiveSCEVType(Ty)); - return getConstant(ConstantInt::get(ITy, Val)); -} - /// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V /// const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V) { @@ -2772,7 +2765,11 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { /// const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { - bool InBounds = GEP->isInBounds(); + // Don't blindly transfer the inbounds flag from the GEP instruction to the + // Add expression, because the Instruction may be guarded by control flow + // and the no-overflow bits may not be valid for the expression in any + // context. + const Type *IntPtrTy = getEffectiveSCEVType(GEP->getType()); Value *Base = GEP->getOperand(0); // Don't attempt to analyze GEPs over unsized objects. @@ -2788,23 +2785,30 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { if (const StructType *STy = dyn_cast<StructType>(*GTI++)) { // For a struct, add the member offset. unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); - TotalOffset = getAddExpr(TotalOffset, - getOffsetOfExpr(STy, FieldNo), - /*HasNUW=*/false, /*HasNSW=*/InBounds); + const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo); + + // Add the field offset to the running total offset. + TotalOffset = getAddExpr(TotalOffset, FieldOffset); } else { // For an array, add the element offset, explicitly scaled. - const SCEV *LocalOffset = getSCEV(Index); + const SCEV *ElementSize = getSizeOfExpr(*GTI); + const SCEV *IndexS = getSCEV(Index); // Getelementptr indices are signed. - LocalOffset = getTruncateOrSignExtend(LocalOffset, IntPtrTy); - // Lower "inbounds" GEPs to NSW arithmetic. - LocalOffset = getMulExpr(LocalOffset, getSizeOfExpr(*GTI), - /*HasNUW=*/false, /*HasNSW=*/InBounds); - TotalOffset = getAddExpr(TotalOffset, LocalOffset, - /*HasNUW=*/false, /*HasNSW=*/InBounds); + IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy); + + // Multiply the index by the element size to compute the element offset. + const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize); + + // Add the element offset to the running total offset. + TotalOffset = getAddExpr(TotalOffset, LocalOffset); } } - return getAddExpr(getSCEV(Base), TotalOffset, - /*HasNUW=*/false, /*HasNSW=*/InBounds); + + // Get the SCEV for the GEP base. + const SCEV *BaseS = getSCEV(Base); + + // Add the total offset from all the GEP indices to the base. + return getAddExpr(BaseS, TotalOffset); } /// GetMinTrailingZeros - Determine the minimum number of zero bits that S is @@ -2963,7 +2967,8 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart())) if (!C->getValue()->isZero()) ConservativeResult = - ConstantRange(C->getValue()->getValue(), APInt(BitWidth, 0)); + ConservativeResult.intersectWith( + ConstantRange(C->getValue()->getValue(), APInt(BitWidth, 0))); // TODO: non-affine addrec if (AddRec->isAffine()) { @@ -3196,15 +3201,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { Operator *U = cast<Operator>(V); switch (Opcode) { case Instruction::Add: - // Don't transfer the NSW and NUW bits from the Add instruction to the - // Add expression, because the Instruction may be guarded by control - // flow and the no-overflow bits may not be valid for the expression in - // any context. 
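+      // As with createNodeForGEP above, IR-level no-wrap flags only hold
+      // when the instruction actually executes. Sketch of the hazard:
+      //
+      //   if (%i < %n)                                   ; guard
+      //     %p = getelementptr inbounds ..., i64 %i
+      //
+      // SCEV may reuse the expression it builds for %p in contexts where
+      // the guard does not hold, so the flags must not be copied onto it.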
return getAddExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); case Instruction::Mul: - // Don't transfer the NSW and NUW bits from the Mul instruction to the - // Mul expression, as with Add. return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); case Instruction::UDiv: @@ -3658,6 +3657,26 @@ void ScalarEvolution::forgetValue(Value *V) { ConstantEvolutionLoopExitValue.erase(PN); } + // If there's a SCEVUnknown tying this value into the SCEV + // space, remove it from the folding set map. The SCEVUnknown + // object and any other SCEV objects which reference it + // (transitively) remain allocated, effectively leaked until + // the underlying BumpPtrAllocator is freed. + // + // This permits SCEV pointers to be used as keys in maps + // such as the ValuesAtScopes map. + FoldingSetNodeID ID; + ID.AddInteger(scUnknown); + ID.AddPointer(I); + void *IP; + if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) { + UniqueSCEVs.RemoveNode(S); + + // This isn't necessary, but we might as well remove the + // value from the ValuesAtScopes map too. + ValuesAtScopes.erase(S); + } + PushDefUseChildren(I, Worklist); } } @@ -4139,8 +4158,7 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) { // constant or derived from a PHI node themselves. PHINode *PHI = 0; for (unsigned Op = 0, e = I->getNumOperands(); Op != e; ++Op) - if (!(isa<Constant>(I->getOperand(Op)) || - isa<GlobalValue>(I->getOperand(Op)))) { + if (!isa<Constant>(I->getOperand(Op))) { PHINode *P = getConstantEvolvingPHI(I->getOperand(Op), L); if (P == 0) return 0; // Not evolving from PHI if (PHI == 0) @@ -4161,11 +4179,9 @@ static Constant *EvaluateExpression(Value *V, Constant *PHIVal, const TargetData *TD) { if (isa<PHINode>(V)) return PHIVal; if (Constant *C = dyn_cast<Constant>(V)) return C; - if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) return GV; Instruction *I = cast<Instruction>(V); - std::vector<Constant*> Operands; - Operands.resize(I->getNumOperands()); + std::vector<Constant*> Operands(I->getNumOperands()); for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Operands[i] = EvaluateExpression(I->getOperand(i), PHIVal, TD); @@ -4207,8 +4223,8 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, return RetVal = 0; // Must be a constant. Value *BEValue = PN->getIncomingValue(SecondIsBackedge); - PHINode *PN2 = getConstantEvolvingPHI(BEValue, L); - if (PN2 != PN) + if (getConstantEvolvingPHI(BEValue, L) != PN && + !isa<Constant>(BEValue)) return RetVal = 0; // Not derived from same PHI. // Execute the loop symbolically to determine the exit value. @@ -4243,8 +4259,11 @@ ScalarEvolution::ComputeBackedgeTakenCountExhaustively(const Loop *L, PHINode *PN = getConstantEvolvingPHI(Cond, L); if (PN == 0) return getCouldNotCompute(); - // Since the loop is canonicalized, the PHI node must have two entries. One - // entry must be a constant (coming in from outside of the loop), and the + // If the loop is canonicalized, the PHI will have exactly two entries. + // That's the only form we support here. + if (PN->getNumIncomingValues() != 2) return getCouldNotCompute(); + + // One entry must be a constant (coming in from outside of the loop), and the // second must be derived from the same PHI. bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1)); Constant *StartCST = @@ -4252,8 +4271,9 @@ ScalarEvolution::ComputeBackedgeTakenCountExhaustively(const Loop *L, if (StartCST == 0) return getCouldNotCompute(); // Must be a constant. 
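    // The shape being matched, as IR (sketch):
    //
    //   %iv = phi i32 [ 0, %preheader ], [ %iv.next, %backedge ]
    //
    // StartCST is the incoming constant from outside the loop; BEValue,
    // fetched below, is the back-edge value (%iv.next here), which must be a
    // constant or compute from this same PHI.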
Value *BEValue = PN->getIncomingValue(SecondIsBackedge); - PHINode *PN2 = getConstantEvolvingPHI(BEValue, L); - if (PN2 != PN) return getCouldNotCompute(); // Not derived from same PHI. + if (getConstantEvolvingPHI(BEValue, L) != PN && + !isa<Constant>(BEValue)) + return getCouldNotCompute(); // Not derived from same PHI. // Okay, we find a PHI node that defines the trip count of this loop. Execute // the loop symbolically to determine when the condition gets a value of @@ -4341,54 +4361,51 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // the arguments into constants, and if so, try to constant propagate the // result. This is particularly useful for computing loop exit values. if (CanConstantFold(I)) { - std::vector<Constant*> Operands; - Operands.reserve(I->getNumOperands()); + SmallVector<Constant *, 4> Operands; + bool MadeImprovement = false; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *Op = I->getOperand(i); if (Constant *C = dyn_cast<Constant>(Op)) { Operands.push_back(C); - } else { - // If any of the operands is non-constant and if they are - // non-integer and non-pointer, don't even try to analyze them - // with scev techniques. - if (!isSCEVable(Op->getType())) - return V; - - const SCEV *OpV = getSCEVAtScope(Op, L); - if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(OpV)) { - Constant *C = SC->getValue(); - if (C->getType() != Op->getType()) - C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - Op->getType(), - false), - C, Op->getType()); - Operands.push_back(C); - } else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(OpV)) { - if (Constant *C = dyn_cast<Constant>(SU->getValue())) { - if (C->getType() != Op->getType()) - C = - ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - Op->getType(), - false), - C, Op->getType()); - Operands.push_back(C); - } else - return V; - } else { - return V; - } + continue; } + + // If any of the operands is non-constant and if they are + // non-integer and non-pointer, don't even try to analyze them + // with scev techniques. + if (!isSCEVable(Op->getType())) + return V; + + const SCEV *OrigV = getSCEV(Op); + const SCEV *OpV = getSCEVAtScope(OrigV, L); + MadeImprovement |= OrigV != OpV; + + Constant *C = 0; + if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(OpV)) + C = SC->getValue(); + if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(OpV)) + C = dyn_cast<Constant>(SU->getValue()); + if (!C) return V; + if (C->getType() != Op->getType()) + C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, + Op->getType(), + false), + C, Op->getType()); + Operands.push_back(C); } - Constant *C = 0; - if (const CmpInst *CI = dyn_cast<CmpInst>(I)) - C = ConstantFoldCompareInstOperands(CI->getPredicate(), - Operands[0], Operands[1], TD); - else - C = ConstantFoldInstOperands(I->getOpcode(), I->getType(), - &Operands[0], Operands.size(), TD); - if (C) + // Check to see if getSCEVAtScope actually made an improvement. 
+ if (MadeImprovement) { + Constant *C = 0; + if (const CmpInst *CI = dyn_cast<CmpInst>(I)) + C = ConstantFoldCompareInstOperands(CI->getPredicate(), + Operands[0], Operands[1], TD); + else + C = ConstantFoldInstOperands(I->getOpcode(), I->getType(), + &Operands[0], Operands.size(), TD); + if (!C) return V; return getSCEV(C); + } } } @@ -4438,7 +4455,29 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // If this is a loop recurrence for a loop that does not contain L, then we // are dealing with the final value computed by the loop. if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V)) { - if (!L || !AddRec->getLoop()->contains(L)) { + // First, attempt to evaluate each operand. + // Avoid performing the look-up in the common case where the specified + // expression has no loop-variant portions. + for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { + const SCEV *OpAtScope = getSCEVAtScope(AddRec->getOperand(i), L); + if (OpAtScope == AddRec->getOperand(i)) + continue; + + // Okay, at least one of these operands is loop variant but might be + // foldable. Build a new instance of the folded commutative expression. + SmallVector<const SCEV *, 8> NewOps(AddRec->op_begin(), + AddRec->op_begin()+i); + NewOps.push_back(OpAtScope); + for (++i; i != e; ++i) + NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L)); + + AddRec = cast<SCEVAddRecExpr>(getAddRecExpr(NewOps, AddRec->getLoop())); + break; + } + + // If the scope is outside the addrec's loop, evaluate it by using the + // loop exit value of the addrec. + if (!AddRec->getLoop()->contains(L)) { // To evaluate this recurrence, we need to know how many times the AddRec // loop iterates. Compute this now. const SCEV *BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop()); @@ -4447,6 +4486,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // Then, evaluate the AddRec. return AddRec->evaluateAtIteration(BackedgeTakenCount, *this); } + return AddRec; } @@ -4696,23 +4736,6 @@ ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) { return getCouldNotCompute(); } -/// getLoopPredecessor - If the given loop's header has exactly one unique -/// predecessor outside the loop, return it. Otherwise return null. -/// This is less strict that the loop "preheader" concept, which requires -/// the predecessor to have only one single successor. -/// -BasicBlock *ScalarEvolution::getLoopPredecessor(const Loop *L) { - BasicBlock *Header = L->getHeader(); - BasicBlock *Pred = 0; - for (pred_iterator PI = pred_begin(Header), E = pred_end(Header); - PI != E; ++PI) - if (!L->contains(*PI)) { - if (Pred && Pred != *PI) return 0; // Multiple predecessors. - Pred = *PI; - } - return Pred; -} - /// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB /// (which may not be an immediate predecessor) which has exactly one /// successor from which BB is reachable, or null if no such block is @@ -4730,7 +4753,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) { // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. 
if (Loop *L = LI->getLoopFor(BB)) - return std::make_pair(getLoopPredecessor(L), L->getHeader()); + return std::make_pair(L->getLoopPredecessor(), L->getHeader()); return std::pair<BasicBlock *, BasicBlock *>(); } @@ -5181,7 +5204,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, // as there are predecessors that can be found that have unique successors // leading to the original header. for (std::pair<BasicBlock *, BasicBlock *> - Pair(getLoopPredecessor(L), L->getHeader()); + Pair(L->getLoopPredecessor(), L->getHeader()); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { diff --git a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index 17b254f..58711b8 100644 --- a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -12,7 +12,7 @@ // // This differs from traditional loop dependence analysis in that it tests // for dependencies within a single iteration of a loop, rather than -// dependences between different iterations. +// dependencies between different iterations. // // ScalarEvolution has a more complete understanding of pointer arithmetic // than BasicAliasAnalysis' collection of ad-hoc analyses. @@ -106,6 +106,12 @@ ScalarEvolutionAliasAnalysis::GetBaseValue(const SCEV *S) { AliasAnalysis::AliasResult ScalarEvolutionAliasAnalysis::alias(const Value *A, unsigned ASize, const Value *B, unsigned BSize) { + // If either of the memory references is empty, it doesn't matter what the + // pointer values are. This allows the code below to ignore this special + // case. + if (ASize == 0 || BSize == 0) + return NoAlias; + // This is ScalarEvolutionAliasAnalysis. Get the SCEVs! const SCEV *AS = SE->getSCEV(const_cast<Value *>(A)); const SCEV *BS = SE->getSCEV(const_cast<Value *>(B)); @@ -118,14 +124,32 @@ ScalarEvolutionAliasAnalysis::alias(const Value *A, unsigned ASize, if (SE->getEffectiveSCEVType(AS->getType()) == SE->getEffectiveSCEVType(BS->getType())) { unsigned BitWidth = SE->getTypeSizeInBits(AS->getType()); - APInt AI(BitWidth, ASize); + APInt ASizeInt(BitWidth, ASize); + APInt BSizeInt(BitWidth, BSize); + + // Compute the difference between the two pointers. const SCEV *BA = SE->getMinusSCEV(BS, AS); - if (AI.ule(SE->getUnsignedRange(BA).getUnsignedMin())) { - APInt BI(BitWidth, BSize); - const SCEV *AB = SE->getMinusSCEV(AS, BS); - if (BI.ule(SE->getUnsignedRange(AB).getUnsignedMin())) - return NoAlias; - } + + // Test whether the difference is known to be great enough that memory of + // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt + // are non-zero, which is special-cased above. + if (ASizeInt.ule(SE->getUnsignedRange(BA).getUnsignedMin()) && + (-BSizeInt).uge(SE->getUnsignedRange(BA).getUnsignedMax())) + return NoAlias; + + // Folding the subtraction while preserving range information can be tricky + // (because of INT_MIN, etc.); if the prior test failed, swap AS and BS + // and try again to see if things fold better that way. + + // Compute the difference between the two pointers. + const SCEV *AB = SE->getMinusSCEV(AS, BS); + + // Test whether the difference is known to be great enough that memory of + // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt + // are non-zero, which is special-cased above. 
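+  // The test, spelled out (sketch): let D = B - A as an unsigned
+  // BitWidth-bit range, with accesses [A, A+ASize) and [B, B+BSize).
+  // ASize <= umin(D) means A's access ends before B begins, and
+  // -BSize >= umax(D), i.e. D <= 2^BitWidth - BSize, means B's access
+  // cannot wrap around the address space back into A's; together they
+  // establish NoAlias. The same reasoning applies to this swapped retry.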
+ if (BSizeInt.ule(SE->getUnsignedRange(AB).getUnsignedMin()) && + (-ASizeInt).uge(SE->getUnsignedRange(AB).getUnsignedMax())) + return NoAlias; } // If ScalarEvolution can find an underlying object, form a new query. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 0012b84..d4a4b26 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -21,6 +21,43 @@ #include "llvm/ADT/STLExtras.h" using namespace llvm; +/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP, +/// reusing an existing cast if a suitable one exists, moving an existing +/// cast if a suitable one exists but isn't in the right place, or +/// creating a new one. +Value *SCEVExpander::ReuseOrCreateCast(Value *V, const Type *Ty, + Instruction::CastOps Op, + BasicBlock::iterator IP) { + // Check to see if there is already a cast! + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + User *U = *UI; + if (U->getType() == Ty) + if (CastInst *CI = dyn_cast<CastInst>(U)) + if (CI->getOpcode() == Op) { + // If the cast isn't where we want it, fix it. + if (BasicBlock::iterator(CI) != IP) { + // Create a new cast, and leave the old cast in place in case + // it is being used as an insert point. Clear its operand + // so that it doesn't hold anything live. + Instruction *NewCI = CastInst::Create(Op, V, Ty, "", IP); + NewCI->takeName(CI); + CI->replaceAllUsesWith(NewCI); + CI->setOperand(0, UndefValue::get(V->getType())); + rememberInstruction(NewCI); + return NewCI; + } + rememberInstruction(CI); + return CI; + } + } + + // Create a new cast. + Instruction *I = CastInst::Create(Op, V, Ty, V->getName(), IP); + rememberInstruction(I); + return I; +} + /// InsertNoopCastOfTo - Insert a cast of V to the specified type, /// which must be possible with a noop cast, doing what we can to share /// the casts. @@ -54,71 +91,29 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) { return CE->getOperand(0); } + // Fold a cast of a constant. if (Constant *C = dyn_cast<Constant>(V)) return ConstantExpr::getCast(Op, C, Ty); + // Cast the argument at the beginning of the entry block, after + // any bitcasts of other arguments. if (Argument *A = dyn_cast<Argument>(V)) { - // Check to see if there is already a cast! - for (Value::use_iterator UI = A->use_begin(), E = A->use_end(); - UI != E; ++UI) - if ((*UI)->getType() == Ty) - if (CastInst *CI = dyn_cast<CastInst>(cast<Instruction>(*UI))) - if (CI->getOpcode() == Op) { - // If the cast isn't the first instruction of the function, move it. - if (BasicBlock::iterator(CI) != - A->getParent()->getEntryBlock().begin()) { - // Recreate the cast at the beginning of the entry block. - // The old cast is left in place in case it is being used - // as an insert point. - Instruction *NewCI = - CastInst::Create(Op, V, Ty, "", - A->getParent()->getEntryBlock().begin()); - NewCI->takeName(CI); - CI->replaceAllUsesWith(NewCI); - return NewCI; - } - return CI; - } - - Instruction *I = CastInst::Create(Op, V, Ty, V->getName(), - A->getParent()->getEntryBlock().begin()); - rememberInstruction(I); - return I; + BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin(); + while ((isa<BitCastInst>(IP) && + isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) && + cast<BitCastInst>(IP)->getOperand(0) != A) || + isa<DbgInfoIntrinsic>(IP)) + ++IP; + return ReuseOrCreateCast(A, Ty, Op, IP); } + // Cast the instruction immediately after the instruction. 
Instruction *I = cast<Instruction>(V); - - // Check to see if there is already a cast. If there is, use it. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) { - if ((*UI)->getType() == Ty) - if (CastInst *CI = dyn_cast<CastInst>(cast<Instruction>(*UI))) - if (CI->getOpcode() == Op) { - BasicBlock::iterator It = I; ++It; - if (isa<InvokeInst>(I)) - It = cast<InvokeInst>(I)->getNormalDest()->begin(); - while (isa<PHINode>(It)) ++It; - if (It != BasicBlock::iterator(CI)) { - // Recreate the cast after the user. - // The old cast is left in place in case it is being used - // as an insert point. - Instruction *NewCI = CastInst::Create(Op, V, Ty, "", It); - NewCI->takeName(CI); - CI->replaceAllUsesWith(NewCI); - rememberInstruction(NewCI); - return NewCI; - } - rememberInstruction(CI); - return CI; - } - } BasicBlock::iterator IP = I; ++IP; if (InvokeInst *II = dyn_cast<InvokeInst>(I)) IP = II->getNormalDest()->begin(); - while (isa<PHINode>(IP)) ++IP; - Instruction *CI = CastInst::Create(Op, V, Ty, V->getName(), IP); - rememberInstruction(CI); - return CI; + while (isa<PHINode>(IP) || isa<DbgInfoIntrinsic>(IP)) ++IP; + return ReuseOrCreateCast(I, Ty, Op, IP); } /// InsertBinop - Insert the specified binary operator, doing a small amount @@ -295,11 +290,11 @@ static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops, // the sum into a single value, so just use that. Ops.clear(); if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum)) - Ops.insert(Ops.end(), Add->op_begin(), Add->op_end()); + Ops.append(Add->op_begin(), Add->op_end()); else if (!Sum->isZero()) Ops.push_back(Sum); // Then append the addrecs. - Ops.insert(Ops.end(), AddRecs.begin(), AddRecs.end()); + Ops.append(AddRecs.begin(), AddRecs.end()); } /// SplitAddRecs - Flatten a list of add operands, moving addrec start values @@ -322,7 +317,7 @@ static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops, A->getLoop())); if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) { Ops[i] = Zero; - Ops.insert(Ops.end(), Add->op_begin(), Add->op_end()); + Ops.append(Add->op_begin(), Add->op_end()); e += Add->getNumOperands(); } else { Ops[i] = Start; @@ -330,7 +325,7 @@ static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops, } if (!AddRecs.empty()) { // Add the addrecs onto the end of the list. - Ops.insert(Ops.end(), AddRecs.begin(), AddRecs.end()); + Ops.append(AddRecs.begin(), AddRecs.end()); // Resort the operand list, moving any constants to the front. SimplifyAddOperands(Ops, Ty, SE); } @@ -1070,7 +1065,8 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); BasicBlock::iterator NewInsertPt = llvm::next(BasicBlock::iterator(cast<Instruction>(V))); - while (isa<PHINode>(NewInsertPt)) ++NewInsertPt; + while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt)) + ++NewInsertPt; V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0, NewInsertPt); restoreInsertPoint(SaveInsertBB, SaveInsertPt); @@ -1107,8 +1103,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { } // {0,+,1} --> Insert a canonical induction variable into the loop! - if (S->isAffine() && - S->getOperand(1) == SE.getConstant(Ty, 1)) { + if (S->isAffine() && S->getOperand(1)->isOne()) { // If there's a canonical IV, just use it. 
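  // Otherwise one is materialized below; the emitted pattern is, as IR
  // (sketch; the add really is named indvar.next in the code, the phi name
  // is illustrative):
  //
  //   header:
  //     %indvar = phi [ 0, %preheader ], [ %indvar.next, %latch ]
  //     ...
  //     %indvar.next = add %indvar, 1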
if (CanonicalIV) { assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) && @@ -1125,17 +1120,19 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { Constant *One = ConstantInt::get(Ty, 1); for (pred_iterator HPI = pred_begin(Header), HPE = pred_end(Header); - HPI != HPE; ++HPI) - if (L->contains(*HPI)) { + HPI != HPE; ++HPI) { + BasicBlock *HP = *HPI; + if (L->contains(HP)) { // Insert a unit add instruction right before the terminator // corresponding to the back-edge. Instruction *Add = BinaryOperator::CreateAdd(PN, One, "indvar.next", - (*HPI)->getTerminator()); + HP->getTerminator()); rememberInstruction(Add); - PN->addIncoming(Add, *HPI); + PN->addIncoming(Add, HP); } else { - PN->addIncoming(Constant::getNullValue(Ty), *HPI); + PN->addIncoming(Constant::getNullValue(Ty), HP); } + } } // {0,+,F} --> {0,+,1} * F @@ -1312,7 +1309,9 @@ Value *SCEVExpander::expand(const SCEV *S) { } void SCEVExpander::rememberInstruction(Value *I) { - if (PostIncLoops.empty()) + if (!PostIncLoops.empty()) + InsertedPostIncValues.insert(I); + else InsertedValues.insert(I); // If we just claimed an existing instruction and that instruction had diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp index 75c381d..563fd2f 100644 --- a/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -105,22 +105,25 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, case NormalizeAutodetect: if (Instruction *OI = dyn_cast<Instruction>(OperandValToReplace)) if (IVUseShouldUsePostIncValue(User, OI, L, &DT)) { - Result = SE.getMinusSCEV(Result, AR->getStepRecurrence(SE)); + const SCEV *TransformedStep = + TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), + User, OperandValToReplace, Loops, SE, DT); + Result = SE.getMinusSCEV(Result, TransformedStep); Loops.insert(L); } break; case Normalize: - if (Loops.count(L)) - Result = SE.getMinusSCEV(Result, AR->getStepRecurrence(SE)); - break; - case Denormalize: if (Loops.count(L)) { const SCEV *TransformedStep = TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), User, OperandValToReplace, Loops, SE, DT); - Result = SE.getAddExpr(Result, TransformedStep); + Result = SE.getMinusSCEV(Result, TransformedStep); } break; + case Denormalize: + if (Loops.count(L)) + Result = SE.getAddExpr(Result, AR->getStepRecurrence(SE)); + break; } return Result; } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 7e8ec2e..b4c9884 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -953,7 +953,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) // sqrt(-0.0) = -0.0, no other negative results are possible. 
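  // The pattern below recurs throughout this patch: CI->getOperand(i + 1)
  // becomes CI->getArgOperand(i). At this point in LLVM's history a call's
  // callee occupies operand 0 and its arguments operands 1..N, so for the
  // sqrt intrinsic
  //
  //   II->getOperand(1) == II->getArgOperand(0)   // the one argument
  //
  // and the argument-based accessor stays correct if the operand layout
  // ever changes.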
if (II->getIntrinsicID() == Intrinsic::sqrt) - return CannotBeNegativeZero(II->getOperand(1), Depth+1); + return CannotBeNegativeZero(II->getArgOperand(0), Depth+1); if (const CallInst *CI = dyn_cast<CallInst>(I)) if (const Function *F = CI->getCalledFunction()) { @@ -966,7 +966,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { if (F->getName() == "fabsl") return true; if (F->getName() == "sqrt" || F->getName() == "sqrtf" || F->getName() == "sqrtl") - return CannotBeNegativeZero(CI->getOperand(1), Depth+1); + return CannotBeNegativeZero(CI->getArgOperand(0), Depth+1); } } diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp index 21d4f65..7eeeb59 100644 --- a/lib/Archive/ArchiveWriter.cpp +++ b/lib/Archive/ArchiveWriter.cpp @@ -366,8 +366,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress, // Check for errors opening or creating archive file. if (!ArchiveFile.is_open() || ArchiveFile.bad()) { - if (TmpArchive.exists()) - TmpArchive.eraseFromDisk(); + TmpArchive.eraseFromDisk(); if (ErrMsg) *ErrMsg = "Error opening archive file: " + archPath.str(); return true; @@ -387,8 +386,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress, for (MembersList::iterator I = begin(), E = end(); I != E; ++I) { if (writeMember(*I, ArchiveFile, CreateSymbolTable, TruncateNames, Compress, ErrMsg)) { - if (TmpArchive.exists()) - TmpArchive.eraseFromDisk(); + TmpArchive.eraseFromDisk(); ArchiveFile.close(); return true; } @@ -420,8 +418,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress, std::ofstream FinalFile(FinalFilePath.c_str(), io_mode); if (!FinalFile.is_open() || FinalFile.bad()) { - if (TmpArchive.exists()) - TmpArchive.eraseFromDisk(); + TmpArchive.eraseFromDisk(); if (ErrMsg) *ErrMsg = "Error opening archive file: " + FinalFilePath.str(); return true; @@ -438,8 +435,7 @@ Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames, bool Compress, if (foreignST) { if (writeMember(*foreignST, FinalFile, false, false, false, ErrMsg)) { FinalFile.close(); - if (TmpArchive.exists()) - TmpArchive.eraseFromDisk(); + TmpArchive.eraseFromDisk(); return true; } } diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 9b4370f..f4c0e50 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -492,6 +492,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(private); KEYWORD(linker_private); + KEYWORD(linker_private_weak); KEYWORD(internal); KEYWORD(available_externally); KEYWORD(linkonce); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 226d8d3..6752181 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -196,19 +196,20 @@ bool LLParser::ParseTopLevelEntities() { // optional leading prefixes, the production is: // GlobalVar ::= OptionalLinkage OptionalVisibility OptionalThreadLocal // OptionalAddrSpace ('constant'|'global') ... 
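  // The new keyword occupies the same slot as the other linkage types in
  // this production, e.g. (sketch):
  //
  //   @sym = linker_private_weak global i32 0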
- case lltok::kw_private : // OptionalLinkage - case lltok::kw_linker_private: // OptionalLinkage - case lltok::kw_internal: // OptionalLinkage - case lltok::kw_weak: // OptionalLinkage - case lltok::kw_weak_odr: // OptionalLinkage - case lltok::kw_linkonce: // OptionalLinkage - case lltok::kw_linkonce_odr: // OptionalLinkage - case lltok::kw_appending: // OptionalLinkage - case lltok::kw_dllexport: // OptionalLinkage - case lltok::kw_common: // OptionalLinkage - case lltok::kw_dllimport: // OptionalLinkage - case lltok::kw_extern_weak: // OptionalLinkage - case lltok::kw_external: { // OptionalLinkage + case lltok::kw_private: // OptionalLinkage + case lltok::kw_linker_private: // OptionalLinkage + case lltok::kw_linker_private_weak: // OptionalLinkage + case lltok::kw_internal: // OptionalLinkage + case lltok::kw_weak: // OptionalLinkage + case lltok::kw_weak_odr: // OptionalLinkage + case lltok::kw_linkonce: // OptionalLinkage + case lltok::kw_linkonce_odr: // OptionalLinkage + case lltok::kw_appending: // OptionalLinkage + case lltok::kw_dllexport: // OptionalLinkage + case lltok::kw_common: // OptionalLinkage + case lltok::kw_dllimport: // OptionalLinkage + case lltok::kw_extern_weak: // OptionalLinkage + case lltok::kw_external: { // OptionalLinkage unsigned Linkage, Visibility; if (ParseOptionalLinkage(Linkage) || ParseOptionalVisibility(Visibility) || @@ -629,7 +630,8 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, Linkage != GlobalValue::WeakODRLinkage && Linkage != GlobalValue::InternalLinkage && Linkage != GlobalValue::PrivateLinkage && - Linkage != GlobalValue::LinkerPrivateLinkage) + Linkage != GlobalValue::LinkerPrivateLinkage && + Linkage != GlobalValue::LinkerPrivateWeakLinkage) return Error(LinkageLoc, "invalid linkage type for alias"); Constant *Aliasee; @@ -1013,11 +1015,13 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { /// ::= /*empty*/ /// ::= 'private' /// ::= 'linker_private' +/// ::= 'linker_private_weak' /// ::= 'internal' /// ::= 'weak' /// ::= 'weak_odr' /// ::= 'linkonce' /// ::= 'linkonce_odr' +/// ::= 'available_externally' /// ::= 'appending' /// ::= 'dllexport' /// ::= 'common' @@ -1030,6 +1034,9 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) { default: Res=GlobalValue::ExternalLinkage; return false; case lltok::kw_private: Res = GlobalValue::PrivateLinkage; break; case lltok::kw_linker_private: Res = GlobalValue::LinkerPrivateLinkage; break; + case lltok::kw_linker_private_weak: + Res = GlobalValue::LinkerPrivateWeakLinkage; + break; case lltok::kw_internal: Res = GlobalValue::InternalLinkage; break; case lltok::kw_weak: Res = GlobalValue::WeakAnyLinkage; break; case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break; @@ -2704,6 +2711,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { break; case GlobalValue::PrivateLinkage: case GlobalValue::LinkerPrivateLinkage: + case GlobalValue::LinkerPrivateWeakLinkage: case GlobalValue::InternalLinkage: case GlobalValue::AvailableExternallyLinkage: case GlobalValue::LinkOnceAnyLinkage: @@ -3791,8 +3799,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS, } } - if (Size && !Size->getType()->isIntegerTy(32)) - return Error(SizeLoc, "element count must be i32"); + if (Size && !Size->getType()->isIntegerTy()) + return Error(SizeLoc, "element count must have integer type"); if (isAlloca) { Inst = new AllocaInst(Ty, Size, Alignment); @@ -3801,6 +3809,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, 
PerFunctionState &PFS, // Autoupgrade old malloc instruction to malloc call. // FIXME: Remove in LLVM 3.0. + if (Size && !Size->getType()->isIntegerTy(32)) + return Error(SizeLoc, "element count must be i32"); const Type *IntPtrTy = Type::getInt32Ty(Context); Constant *AllocSize = ConstantExpr::getSizeOf(Ty); AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, IntPtrTy); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 5eed170..2703134 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -37,9 +37,9 @@ namespace lltok { kw_declare, kw_define, kw_global, kw_constant, - kw_private, kw_linker_private, kw_internal, kw_linkonce, kw_linkonce_odr, - kw_weak, kw_weak_odr, kw_appending, kw_dllimport, kw_dllexport, kw_common, - kw_available_externally, + kw_private, kw_linker_private, kw_linker_private_weak, kw_internal, + kw_linkonce, kw_linkonce_odr, kw_weak, kw_weak_odr, kw_appending, + kw_dllimport, kw_dllexport, kw_common, kw_available_externally, kw_default, kw_hidden, kw_protected, kw_extern_weak, kw_external, kw_thread_local, diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 69adead..527ae49 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -75,6 +75,7 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) { case 11: return GlobalValue::LinkOnceODRLinkage; case 12: return GlobalValue::AvailableExternallyLinkage; case 13: return GlobalValue::LinkerPrivateLinkage; + case 14: return GlobalValue::LinkerPrivateWeakLinkage; } } @@ -252,17 +253,18 @@ void BitcodeReaderValueList::ResolveConstantForwardRefs() { // at once. while (!Placeholder->use_empty()) { Value::use_iterator UI = Placeholder->use_begin(); + User *U = *UI; // If the using object isn't uniqued, just update the operands. This // handles instructions and initializers for global variables. - if (!isa<Constant>(*UI) || isa<GlobalValue>(*UI)) { + if (!isa<Constant>(U) || isa<GlobalValue>(U)) { UI.getUse().set(RealVal); continue; } // Otherwise, we have a constant that uses the placeholder. Replace that // constant with a new constant that has *all* placeholder uses updated. - Constant *UserC = cast<Constant>(*UI); + Constant *UserC = cast<Constant>(U); for (User::op_iterator I = UserC->op_begin(), E = UserC->op_end(); I != E; ++I) { Value *NewOp; @@ -2178,13 +2180,18 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { InstructionList.push_back(I); break; } - case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, op, align] - if (Record.size() < 3) + case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] + // For backward compatibility, tolerate a lack of an opty, and use i32. + // LLVM 3.0: Remove this. + if (Record.size() < 3 || Record.size() > 4) return Error("Invalid ALLOCA record"); + unsigned OpNum = 0; const PointerType *Ty = - dyn_cast_or_null<PointerType>(getTypeByID(Record[0])); - Value *Size = getFnValueByID(Record[1], Type::getInt32Ty(Context)); - unsigned Align = Record[2]; + dyn_cast_or_null<PointerType>(getTypeByID(Record[OpNum++])); + const Type *OpTy = Record.size() == 4 ? 
getTypeByID(Record[OpNum++]) : + Type::getInt32Ty(Context); + Value *Size = getFnValueByID(Record[OpNum++], OpTy); + unsigned Align = Record[OpNum++]; if (!Ty || !Size) return Error("Invalid ALLOCA record"); I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1); InstructionList.push_back(I); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 9bda6dc..fa1b2c4 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -313,6 +313,7 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) { case GlobalValue::LinkOnceODRLinkage: return 11; case GlobalValue::AvailableExternallyLinkage: return 12; case GlobalValue::LinkerPrivateLinkage: return 13; + case GlobalValue::LinkerPrivateWeakLinkage: return 14; } } @@ -577,10 +578,9 @@ static void WriteFunctionLocalMetadata(const Function &F, BitstreamWriter &Stream) { bool StartedMetadataBlock = false; SmallVector<uint64_t, 64> Record; - const ValueEnumerator::ValueList &Vals = VE.getMDValues(); - + const SmallVector<const MDNode *, 8> &Vals = VE.getFunctionLocalMDValues(); for (unsigned i = 0, e = Vals.size(); i != e; ++i) - if (const MDNode *N = dyn_cast<MDNode>(Vals[i].first)) + if (const MDNode *N = Vals[i]) if (N->isFunctionLocal() && N->getFunction() == &F) { if (!StartedMetadataBlock) { Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3); @@ -588,7 +588,7 @@ static void WriteFunctionLocalMetadata(const Function &F, } WriteMDNode(N, VE, Stream, Record); } - + if (StartedMetadataBlock) Stream.ExitBlock(); } @@ -1114,6 +1114,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, case Instruction::Alloca: Code = bitc::FUNC_CODE_INST_ALLOCA; Vals.push_back(VE.getTypeID(I.getType())); + Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); Vals.push_back(VE.getValueID(I.getOperand(0))); // size. Vals.push_back(Log2_32(cast<AllocaInst>(I).getAlignment())+1); break; @@ -1134,26 +1135,25 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.push_back(cast<StoreInst>(I).isVolatile()); break; case Instruction::Call: { - const PointerType *PTy = cast<PointerType>(I.getOperand(0)->getType()); + const CallInst &CI = cast<CallInst>(I); + const PointerType *PTy = cast<PointerType>(CI.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); Code = bitc::FUNC_CODE_INST_CALL; - const CallInst *CI = cast<CallInst>(&I); - Vals.push_back(VE.getAttributeID(CI->getAttributes())); - Vals.push_back((CI->getCallingConv() << 1) | unsigned(CI->isTailCall())); - PushValueAndType(CI->getOperand(0), InstID, Vals, VE); // Callee + Vals.push_back(VE.getAttributeID(CI.getAttributes())); + Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall())); + PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee // Emit value #'s for the fixed parameters. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) - Vals.push_back(VE.getValueID(I.getOperand(i+1))); // fixed param. + Vals.push_back(VE.getValueID(CI.getArgOperand(i))); // fixed param. // Emit type/value pairs for varargs params. 
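      // The rewrite below replaces the old operand arithmetic
      // (NumOperands - 1 - NumParams, which hard-coded where the callee
      // lives) with the half-open range [getNumParams(), getNumArgOperands()),
      // which names the varargs directly and survives operand reordering.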
if (FTy->isVarArg()) { - unsigned NumVarargs = I.getNumOperands()-1-FTy->getNumParams(); - for (unsigned i = I.getNumOperands()-NumVarargs, e = I.getNumOperands(); + for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands(); i != e; ++i) - PushValueAndType(I.getOperand(i), InstID, Vals, VE); // varargs + PushValueAndType(CI.getArgOperand(i), InstID, Vals, VE); // varargs } break; } @@ -1662,15 +1662,8 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out) { WriteBitcodeToStream( M, Stream ); - // If writing to stdout, set binary mode. - if (&llvm::outs() == &Out) - sys::Program::ChangeStdoutToBinary(); - // Write the generated bitstream to "Out". Out.write((char*)&Buffer.front(), Buffer.size()); - - // Make sure it hits disk now. - Out.flush(); } /// WriteBitcodeToStream - Write the specified module to the specified output diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index d2baec7..7fa425a 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -72,7 +72,7 @@ ValueEnumerator::ValueEnumerator(const Module *M) { // Enumerate types used by the type symbol table. EnumerateTypeSymbolTable(M->getTypeSymbolTable()); - // Insert constants and metadata that are named at module level into the slot + // Insert constants and metadata that are named at module level into the slot // pool so that the module symbol table can refer to them... EnumerateValueSymbolTable(M->getValueSymbolTable()); EnumerateMDSymbolTable(M->getMDSymbolTable()); @@ -257,6 +257,8 @@ void ValueEnumerator::EnumerateMetadata(const Value *MD) { else EnumerateType(Type::getVoidTy(MD->getContext())); } + if (N->isFunctionLocal() && N->getFunction()) + FunctionLocalMDs.push_back(N); return; } @@ -414,7 +416,8 @@ void ValueEnumerator::incorporateFunction(const Function &F) { FirstInstID = Values.size(); - SmallVector<MDNode *, 8> FunctionLocalMDs; + FunctionLocalMDs.clear(); + SmallVector<MDNode *, 8> FnLocalMDVector; // Add all of the instructions. for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) { @@ -423,7 +426,7 @@ void ValueEnumerator::incorporateFunction(const Function &F) { if (MDNode *MD = dyn_cast<MDNode>(*OI)) if (MD->isFunctionLocal() && MD->getFunction()) // Enumerate metadata after the instructions they might refer to. - FunctionLocalMDs.push_back(MD); + FnLocalMDVector.push_back(MD); } if (!I->getType()->isVoidTy()) EnumerateValue(I); @@ -431,8 +434,8 @@ void ValueEnumerator::incorporateFunction(const Function &F) { } // Add all of the function-local metadata. 
- for (unsigned i = 0, e = FunctionLocalMDs.size(); i != e; ++i) - EnumerateOperandType(FunctionLocalMDs[i]); + for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i) + EnumerateOperandType(FnLocalMDVector[i]); } void ValueEnumerator::purgeFunction() { diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index 4f8ebf5..2b9b15f 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -15,6 +15,7 @@ #define VALUE_ENUMERATOR_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Attributes.h" #include <vector> @@ -26,7 +27,7 @@ class Instruction; class BasicBlock; class Function; class Module; -class MetadataBase; +class MDNode; class NamedMDNode; class AttrListPtr; class TypeSymbolTable; @@ -49,6 +50,7 @@ private: ValueMapType ValueMap; ValueList Values; ValueList MDValues; + SmallVector<const MDNode *, 8> FunctionLocalMDs; ValueMapType MDValueMap; typedef DenseMap<void*, unsigned> AttributeMapType; @@ -105,6 +107,9 @@ public: const ValueList &getValues() const { return Values; } const ValueList &getMDValues() const { return MDValues; } + const SmallVector<const MDNode *, 8> &getFunctionLocalMDValues() const { + return FunctionLocalMDs; + } const TypeList &getTypes() const { return Types; } const std::vector<const BasicBlock*> &getBasicBlocks() const { return BasicBlocks; diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 4008a6a..a7189ac 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -114,6 +115,7 @@ AggressiveAntiDepBreaker(MachineFunction& MFi, TargetSubtarget::RegClassVector& CriticalPathRCs) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), + TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), AllocatableSet(TRI->getAllocatableSet(MF)), State(NULL) { @@ -163,25 +165,27 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { DefIndices[AliasReg] = ~0u; } } - } else { - // In a non-return block, examine the live-in regs of all successors. - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + } + + // In a non-return block, examine the live-in regs of all successors. + // Note a return block can have successors if the return instruction is + // predicated. + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), + for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - State->UnionGroups(Reg, 0); - KillIndices[Reg] = BB->size(); - DefIndices[Reg] = ~0u; - // Repeat, for all aliases. - for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - State->UnionGroups(AliasReg, 0); - KillIndices[AliasReg] = BB->size(); - DefIndices[AliasReg] = ~0u; - } + unsigned Reg = *I; + State->UnionGroups(Reg, 0); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = ~0u; + // Repeat, for all aliases. 
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        State->UnionGroups(AliasReg, 0);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
       }
-    }
+    }
 
   // Mark live-out callee-saved registers. In a return block this is
   // all callee-saved registers. In non-return this is any
@@ -390,7 +394,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
     // If MI's defs have a special allocation requirement, don't allow
     // any def registers to be changed. Also assume all registers
     // defined in a call must not be changed (ABI).
-    if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq()) {
+    if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() ||
+        TII->isPredicated(MI)) {
       DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
       State->UnionGroups(Reg, 0);
     }
@@ -443,6 +448,26 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
     RegRefs = State->GetRegRefs();
 
+  // If MI's uses have a special allocation requirement, don't allow
+  // any use registers to be changed. Also assume all registers
+  // used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instructions is more complex. We are
+  // being conservative here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->getDesc().isCall() ||
+    MI->getDesc().hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
   // Scan the register uses for this instruction and update
   // live-ranges, groups and RegRefs.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -459,10 +484,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
     // for the register.
     HandleLastUse(Reg, Count, "(last-use)");
 
-    // If MI's uses have a special allocation requirement, don't allow
-    // any use registers to be changed. Also assume all registers
-    // used in a call must not be changed (ABI).
-    if (MI->getDesc().isCall() || MI->getDesc().hasExtraSrcRegAllocReq()) {
+    if (Special) {
       DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
       State->UnionGroups(Reg, 0);
     }
@@ -604,8 +626,12 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
   // order. If that register is available, and the corresponding
   // registers are available for the other group subregisters, then we
   // can use those registers to rename.
+
+  // FIXME: Using getMinimalPhysRegClass is very conservative. We should
+  // check every use of the register and find the largest register class
+  // that can be used in all of them.
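+  // (A hypothetical sketch of that improvement would start from the minimal
+  // class and intersect it with the class required by each use, e.g.
+  //   RC = commonSubClass(RC, classRequiredByUse(U));
+  // both helper names here are illustrative, not implied by this patch.)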
const TargetRegisterClass *SuperRC = - TRI->getPhysicalRegisterRegClass(SuperReg, MVT::Other); + TRI->getMinimalPhysRegClass(SuperReg, MVT::Other); const TargetRegisterClass::iterator RB = SuperRC->allocation_order_begin(MF); const TargetRegisterClass::iterator RE = SuperRC->allocation_order_end(MF); @@ -905,6 +931,19 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first, QE = Range.second; Q != QE; ++Q) { Q->second.Operand->setReg(NewReg); + // If the SU for the instruction being updated has debug + // information related to the anti-dependency register, make + // sure to update that as well. + const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()]; + if (!SU) continue; + for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { + MachineInstr *DI = SU->DbgInstrList[i]; + assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && + DI->getOperand(0).getReg() + && "Non register dbg_value attached to SUnit!"); + if (DI->getOperand(0).getReg() == AntiDepReg) + DI->getOperand(0).setReg(NewReg); + } } // We just went back in time and modified history; the diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h index 506d43e..91ebb85 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.h +++ b/lib/CodeGen/AggressiveAntiDepBreaker.h @@ -115,6 +115,7 @@ namespace llvm { class AggressiveAntiDepBreaker : public AntiDepBreaker { MachineFunction& MF; MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; /// AllocatableSet - The set of allocatable registers. diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 5a0c27b..d9387a8 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -199,7 +199,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { case GlobalValue::LinkOnceODRLinkage: case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: - case GlobalValue::LinkerPrivateLinkage: + case GlobalValue::LinkerPrivateWeakLinkage: if (MAI->getWeakDefDirective() != 0) { // .globl _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global); @@ -225,6 +225,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { break; case GlobalValue::PrivateLinkage: case GlobalValue::InternalLinkage: + case GlobalValue::LinkerPrivateLinkage: break; default: llvm_unreachable("Unknown linkage type!"); @@ -330,7 +331,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { else if (GVKind.isThreadData()) { OutStreamer.SwitchSection(TheSection); - EmitLinkage(GV->getLinkage(), MangSym); EmitAlignment(AlignLog, GV); OutStreamer.EmitLabel(MangSym); @@ -353,7 +353,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - spare pointer, used when mapped by the runtime // - pointer to mangled symbol above with initializer unsigned PtrSize = TD->getPointerSizeInBits()/8; - OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("__tlv_bootstrap"), + OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), PtrSize, 0); OutStreamer.EmitIntValue(0, PtrSize, 0); OutStreamer.EmitSymbolValue(MangSym, PtrSize, 0); @@ -428,20 +428,12 @@ void AsmPrinter::EmitFunctionHeader() { // Emit pre-function debug and/or EH information. 
if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->BeginFunction(MF); - } else { - DE->BeginFunction(MF); - } + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); + DE->BeginFunction(MF); } if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->beginFunction(MF); - } else { - DD->beginFunction(MF); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->beginFunction(MF); } } @@ -458,14 +450,11 @@ void AsmPrinter::EmitFunctionEntryLabel() { } -/// EmitComments - Pretty-print comments for instructions. -static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { - const MachineFunction *MF = MI.getParent()->getParent(); - const TargetMachine &TM = MF->getTarget(); - - DebugLoc DL = MI.getDebugLoc(); +static void EmitDebugLoc(DebugLoc DL, const MachineFunction *MF, + raw_ostream &CommentOS) { + const LLVMContext &Ctx = MF->getFunction()->getContext(); if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(MF->getFunction()->getContext())); + DIScope Scope(DL.getScope(Ctx)); // Omit the directory, because it's likely to be long and uninteresting. if (Scope.Verify()) CommentOS << Scope.getFilename(); @@ -474,6 +463,23 @@ static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { CommentOS << ':' << DL.getLine(); if (DL.getCol() != 0) CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << "[ "; + EmitDebugLoc(InlinedAtDL, MF, CommentOS); + CommentOS << " ]"; + } + } +} + +/// EmitComments - Pretty-print comments for instructions. +static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { + const MachineFunction *MF = MI.getParent()->getParent(); + const TargetMachine &TM = MF->getTarget(); + + DebugLoc DL = MI.getDebugLoc(); + if (!DL.isUnknown()) { // Print source line info. + EmitDebugLoc(DL, MF, CommentOS); CommentOS << '\n'; } @@ -611,12 +617,8 @@ void AsmPrinter::EmitFunctionBody() { } if (ShouldPrintDebugScopes) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->beginScope(II); - } else { - DD->beginScope(II); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->beginScope(II); } if (isVerbose()) @@ -649,12 +651,8 @@ void AsmPrinter::EmitFunctionBody() { } if (ShouldPrintDebugScopes) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endScope(II); - } else { - DD->endScope(II); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->endScope(II); } } } @@ -692,20 +690,12 @@ void AsmPrinter::EmitFunctionBody() { // Emit post-function debug information. if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endFunction(MF); - } else { - DD->endFunction(MF); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->endFunction(MF); } if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->EndFunction(); - } else { - DE->EndFunction(); - } + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); + DE->EndFunction(); } MMI->EndFunction(); @@ -730,19 +720,15 @@ bool AsmPrinter::doFinalization(Module &M) { // Finalize debug and EH information. 
if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->EndModule(); - } else { + { + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); DE->EndModule(); } delete DE; DE = 0; } if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endModule(); - } else { + { + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); DD->endModule(); } delete DD; DD = 0; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index ba6fed2..f6f3bae 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -83,7 +83,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, unsigned LocCookie) const { // Tell SrcMgr about this buffer, it takes ownership of the buffer. SrcMgr.AddNewSourceBuffer(Buffer, SMLoc()); - AsmParser Parser(SrcMgr, OutContext, OutStreamer, *MAI); + AsmParser Parser(TM.getTarget(), SrcMgr, OutContext, OutStreamer, *MAI); OwningPtr<TargetAsmParser> TAP(TM.getTarget().createAsmParser(Parser)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" @@ -279,7 +279,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { // Okay, we finally have a value number. Ask the target to print this // operand! if (CurVariant == -1 || CurVariant == AsmPrinterVariant) { - unsigned OpNo = 1; + unsigned OpNo = 2; bool Error = false; diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index b2c70d5..21396ca 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -201,6 +201,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const { case dwarf::DW_FORM_data8: Size = 8; break; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return; + case dwarf::DW_FORM_addr: Size = Asm->getTargetData().getPointerSize(); break; default: llvm_unreachable("DIE Value form not supported yet"); } Asm->OutStreamer.EmitIntValue(Integer, Size, 0/*addrspace*/); @@ -221,6 +222,7 @@ unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const { case dwarf::DW_FORM_data8: return sizeof(int64_t); case dwarf::DW_FORM_udata: return MCAsmInfo::getULEB128Size(Integer); case dwarf::DW_FORM_sdata: return MCAsmInfo::getSLEB128Size(Integer); + case dwarf::DW_FORM_addr: return AP->getTargetData().getPointerSize(); default: llvm_unreachable("DIE Value form not supported yet"); break; } return 0; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 890507c..65c1d19 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -44,7 +44,8 @@ using namespace llvm; static cl::opt<bool> PrintDbgScope("print-dbgscope", cl::Hidden, cl::desc("Print DbgScope information for each machine instruction")); -static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, +static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print", + cl::Hidden, cl::desc("Disable debug info printing")); static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden, @@ -79,15 +80,13 @@ class CompileUnit { /// IndexTyDie - An anonymous type for index type. Owned by CUDie. 
  DIE *IndexTyDie;
 
-  /// GVToDieMap - Tracks the mapping of unit level debug information
+  /// MDNodeToDieMap - Tracks the mapping of unit level debug information
   /// variables to debug information entries.
-  /// FIXME : Rename GVToDieMap -> NodeToDieMap
-  DenseMap<const MDNode *, DIE *> GVToDieMap;
+  DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
 
-  /// GVToDIEEntryMap - Tracks the mapping of unit level debug information
+  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug information
   /// descriptors to debug information entries using a DIEEntry proxy.
-  /// FIXME : Rename
-  DenseMap<const MDNode *, DIEEntry *> GVToDIEEntryMap;
+  DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
 
   /// Globals - A map of globally visible named entities for this unit.
   ///
@@ -123,25 +122,25 @@ public:
 
   /// getDIE - Returns the debug information entry map slot for the
   /// specified debug variable.
-  DIE *getDIE(const MDNode *N) { return GVToDieMap.lookup(N); }
+  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
 
   /// insertDIE - Insert DIE into the map.
   void insertDIE(const MDNode *N, DIE *D) {
-    GVToDieMap.insert(std::make_pair(N, D));
+    MDNodeToDieMap.insert(std::make_pair(N, D));
   }
 
   /// getDIEEntry - Returns the debug information entry for the specified
   /// debug variable.
   DIEEntry *getDIEEntry(const MDNode *N) {
-    DenseMap<const MDNode *, DIEEntry *>::iterator I = GVToDIEEntryMap.find(N);
-    if (I == GVToDIEEntryMap.end())
+    DenseMap<const MDNode *, DIEEntry *>::iterator I = MDNodeToDIEEntryMap.find(N);
+    if (I == MDNodeToDIEEntryMap.end())
       return NULL;
     return I->second;
   }
 
   /// insertDIEEntry - Insert debug information entry into the map.
   void insertDIEEntry(const MDNode *N, DIEEntry *E) {
-    GVToDIEEntryMap.insert(std::make_pair(N, E));
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
   }
 
   /// addDie - Adds or interns the DIE to the compile unit.
@@ -321,12 +320,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
   DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
   DwarfStrSectionSym = TextSectionSym = 0;
   DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0;
+  DwarfDebugLineSectionSym = CurrentLineSectionSym = 0;
   FunctionBeginSym = FunctionEndSym = 0;
-  if (TimePassesIsEnabled) {
-    NamedRegionTimer T(DbgTimerName, DWARFGroupName);
-    beginModule(M);
-  } else {
-    beginModule(M);
+  DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
+  {
+    NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+    beginModule(M);
   }
 }
 DwarfDebug::~DwarfDebug() {
@@ -378,7 +377,8 @@ DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) {
 void DwarfDebug::addUInt(DIE *Die, unsigned Attribute,
                          unsigned Form, uint64_t Integer) {
   if (!Form) Form = DIEInteger::BestForm(false, Integer);
-  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
+  DIEValue *Value = Integer == 1 ?
+ DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer); Die->addValue(Attribute, Form, Value); } @@ -866,6 +866,10 @@ void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) { } else if (Context.isNameSpace()) { DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context)); ContextDIE->addChild(Die); + } else if (Context.isSubprogram()) { + DIE *ContextDIE = createSubprogramDIE(DISubprogram(Context), + /*MakeDecl=*/false); + ContextDIE->addChild(Die); } else if (DIE *ContextDIE = getCompileUnit(Context)->getDIE(Context)) ContextDIE->addChild(Die); else @@ -1055,6 +1059,10 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (DIDescriptor(ContainingType).isCompositeType()) addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, getOrCreateTypeDIE(DIType(ContainingType))); + else { + DIDescriptor Context = CTy.getContext(); + addToContextOwner(&Buffer, Context); + } break; } default: @@ -1065,8 +1073,9 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (!Name.empty()) addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type || - Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) { + if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type + || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) + { // Add size if non-zero (derived types might be zero-sized.) if (Size) addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); @@ -1329,6 +1338,9 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { // DW_TAG_inlined_subroutine may refer to this DIE. SPCU->insertDIE(SP, SPDie); + // Add to context owner. + addToContextOwner(SPDie, SP.getContext()); + return SPDie; } @@ -1379,6 +1391,7 @@ static bool isSubprogramContext(const MDNode *Context) { DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { CompileUnit *SPCU = getCompileUnit(SPNode); DIE *SPDie = SPCU->getDIE(SPNode); + assert(SPDie && "Unable to find subprogram DIE!"); DISubprogram SP(SPNode); @@ -1412,6 +1425,14 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { SPCU->addDie(SPDie); } + // Pick up abstract subprogram DIE. 
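+  // (When this subprogram was also inlined somewhere, constructScopeDIE
+  // records its abstract DIE in AbstractSPDies; the concrete out-of-line DIE
+  // built here then points back at it with DW_AT_abstract_origin instead of
+  // re-describing the subprogram's attributes.)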
+  if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) {
+    SPDie = new DIE(dwarf::DW_TAG_subprogram);
+    addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
+                dwarf::DW_FORM_ref4, AbsSPDIE);
+    SPCU->addDie(SPDie);
+  }
+
   addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
            Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()));
   addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
@@ -1483,7 +1504,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
   const MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
   const MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
 
-  if (StartLabel == FunctionBeginSym || EndLabel == 0) {
+  if (StartLabel == 0 || EndLabel == 0) {
     assert (0 && "Unexpected Start and End labels for an inlined scope!");
     return 0;
   }
@@ -1605,11 +1626,13 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
     // FIXME : Handle getNumOperands != 3
     if (DVInsn->getNumOperands() == 3) {
       if (DVInsn->getOperand(0).isReg())
-        updated = addRegisterAddress(VariableDie, DVLabel, DVInsn->getOperand(0));
+        updated =
+          addRegisterAddress(VariableDie, DVLabel, DVInsn->getOperand(0));
       else if (DVInsn->getOperand(0).isImm())
         updated = addConstantValue(VariableDie, DVLabel, DVInsn->getOperand(0));
       else if (DVInsn->getOperand(0).isFPImm())
-        updated = addConstantFPValue(VariableDie, DVLabel, DVInsn->getOperand(0));
+        updated =
+          addConstantFPValue(VariableDie, DVLabel, DVInsn->getOperand(0));
     } else {
       MachineLocation Location = Asm->getDebugValueLocation(DVInsn);
       if (Location.getReg()) {
@@ -1682,8 +1705,13 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   if (Scope->getInlinedAt())
     ScopeDIE = constructInlinedScopeDIE(Scope);
   else if (DS.isSubprogram()) {
-    if (Scope->isAbstractScope())
+    ProcessedSPNodes.insert(DS);
+    if (Scope->isAbstractScope()) {
       ScopeDIE = getCompileUnit(DS)->getDIE(DS);
+      // Note down abstract DIE.
+      if (ScopeDIE)
+        AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
+    }
     else
       ScopeDIE = updateSubprogramScopeDIE(DS);
   }
@@ -1782,11 +1810,11 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
   addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN);
   // Use DW_AT_entry_pc instead of DW_AT_low_pc/DW_AT_high_pc pair. This
   // simplifies debug range entries.
-  addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_data4, 0);
+  addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0);
   // DW_AT_stmt_list is an offset of line number information for this
-  // compile unit in debug_line section. It is always zero when only one
-  // compile unit is emitted in one object file.
-  addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
+  // compile unit in debug_line section. This offset is calculated
+  // during endModule().
+  addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
 
   if (!Dir.empty())
     addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir);
@@ -1996,6 +2024,40 @@ void DwarfDebug::beginModule(Module *M) {
 ///
 void DwarfDebug::endModule() {
   if (!FirstCU) return;
+  const Module *M = MMI->getModule();
+  if (NamedMDNode *AllSPs = M->getNamedMetadata("llvm.dbg.sp")) {
+    for (unsigned SI = 0, SE = AllSPs->getNumOperands(); SI != SE; ++SI) {
+      if (ProcessedSPNodes.count(AllSPs->getOperand(SI)) != 0) continue;
+      DISubprogram SP(AllSPs->getOperand(SI));
+      if (!SP.Verify()) continue;
+
+      // Collect info for variables that were optimized out.
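+      // (Such locals live in per-function named metadata of the form
+      // "llvm.dbg.lv.<linkage name>"; the lookup below falls back to the
+      // plain function name when no linkage name was recorded.)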
+ StringRef FName = SP.getLinkageName(); + if (FName.empty()) + FName = SP.getName(); + NamedMDNode *NMD = + M->getNamedMetadata(Twine("llvm.dbg.lv.", getRealLinkageName(FName))); + if (!NMD) continue; + unsigned E = NMD->getNumOperands(); + if (!E) continue; + DbgScope *Scope = new DbgScope(NULL, DIDescriptor(SP), NULL); + for (unsigned I = 0; I != E; ++I) { + DIVariable DV(NMD->getOperand(I)); + if (!DV.Verify()) continue; + Scope->addVariable(new DbgVariable(DV)); + } + + // Construct subprogram DIE and add variables DIEs. + constructSubprogramDIE(SP); + DIE *ScopeDIE = getCompileUnit(SP)->getDIE(SP); + const SmallVector<DbgVariable *, 8> &Variables = Scope->getVariables(); + for (unsigned i = 0, N = Variables.size(); i < N; ++i) { + DIE *VariableDIE = constructVariableDIE(Variables[i], Scope); + if (VariableDIE) + ScopeDIE->addChild(VariableDIE); + } + } + } // Attach DW_AT_inline attribute with inlined subprogram DIEs. for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(), @@ -2037,15 +2099,15 @@ void DwarfDebug::endModule() { // Compute DIE offsets and sizes. computeSizeAndOffsets(); + // Emit source line correspondence into a debug line section. + emitDebugLines(); + // Emit all the DIEs into a debug info section emitDebugInfo(); // Corresponding abbreviations into a abbrev section. emitAbbreviations(); - // Emit source line correspondence into a debug line section. - emitDebugLines(); - // Emit info into a debug pubnames section. emitDebugPubNames(); @@ -2150,8 +2212,9 @@ static bool isDbgValueInDefinedReg(const MachineInstr *MI) { } /// collectVariableInfo - Populate DbgScope entries with variables' info. -void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { - SmallPtrSet<const MDNode *, 16> Processed; +void +DwarfDebug::collectVariableInfo(const MachineFunction *MF, + SmallPtrSet<const MDNode *, 16> &Processed) { /// collection info from MMI table. collectVariableInfoFromMMITable(MF, Processed); @@ -2180,16 +2243,23 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { if (Processed.count(DV) != 0) continue; + const MachineInstr *PrevMI = MInsn; for (SmallVector<const MachineInstr *, 8>::iterator MI = I+1, ME = DbgValues.end(); MI != ME; ++MI) { const MDNode *Var = (*MI)->getOperand((*MI)->getNumOperands()-1).getMetadata(); - if (Var == DV && isDbgValueInDefinedReg(*MI)) + if (Var == DV && isDbgValueInDefinedReg(*MI) && + !PrevMI->isIdenticalTo(*MI)) MultipleValues.push_back(*MI); + PrevMI = *MI; } DbgScope *Scope = findDbgScope(MInsn); - if (!Scope && DV.getTag() == dwarf::DW_TAG_arg_variable) + bool CurFnArg = false; + if (DV.getTag() == dwarf::DW_TAG_arg_variable && + DISubprogram(DV.getContext()).describes(MF->getFunction())) + CurFnArg = true; + if (!Scope && CurFnArg) Scope = CurrentFnDbgScope; // If variable scope is not found then skip this variable. 
if (!Scope) @@ -2198,7 +2268,7 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { Processed.insert(DV); DbgVariable *RegVar = new DbgVariable(DV); Scope->addVariable(RegVar); - if (DV.getTag() != dwarf::DW_TAG_arg_variable) + if (!CurFnArg) DbgVariableLabelsMap[RegVar] = getLabelBeforeInsn(MInsn); if (DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc())) { DbgVariableToDbgInstMap[AbsVar] = MInsn; @@ -2217,7 +2287,8 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { const MachineInstr *Begin = NULL; const MachineInstr *End = NULL; for (SmallVector<const MachineInstr *, 4>::iterator - MVI = MultipleValues.begin(), MVE = MultipleValues.end(); MVI != MVE; ++MVI) { + MVI = MultipleValues.begin(), MVE = MultipleValues.end(); + MVI != MVE; ++MVI) { if (!Begin) { Begin = *MVI; continue; @@ -2241,8 +2312,11 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { } // Collect info for variables that were optimized out. + const Function *F = MF->getFunction(); + const Module *M = F->getParent(); if (NamedMDNode *NMD = - MF->getFunction()->getParent()->getNamedMetadata("llvm.dbg.lv")) { + M->getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(F->getName())))) { for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { DIVariable DV(cast_or_null<MDNode>(NMD->getOperand(i))); if (!DV || !Processed.insert(DV)) @@ -2319,7 +2393,8 @@ void DwarfDebug::endScope(const MachineInstr *MI) { } /// getOrCreateDbgScope - Create DbgScope for the scope. -DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt) { +DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, + const MDNode *InlinedAt) { if (!InlinedAt) { DbgScope *WScope = DbgScopeMap.lookup(Scope); if (WScope) @@ -2335,13 +2410,20 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl if (!WScope->getParent()) { StringRef SPName = DISubprogram(Scope).getLinkageName(); - if (SPName == Asm->MF->getFunction()->getName()) + // We used to check only for a linkage name, but that fails + // since we began omitting the linkage name for private + // functions. The new way is to check for the name in metadata, + // but that's not supported in old .ll test cases. Ergo, we + // check both. + if (SPName == Asm->MF->getFunction()->getName() || + DISubprogram(Scope).getFunction() == Asm->MF->getFunction()) CurrentFnDbgScope = WScope; } return WScope; } + getOrCreateAbstractScope(Scope); DbgScope *WScope = DbgScopeMap.lookup(InlinedAt); if (WScope) return WScope; @@ -2355,7 +2437,6 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl Parent->addScope(WScope); ConcreteScopes[InlinedAt] = WScope; - getOrCreateAbstractScope(Scope); return WScope; } @@ -2365,8 +2446,6 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl static bool hasValidLocation(LLVMContext &Ctx, const MachineInstr *MInsn, const MDNode *&Scope, const MDNode *&InlinedAt) { - if (MInsn->isDebugValue()) - return false; DebugLoc DL = MInsn->getDebugLoc(); if (DL.isUnknown()) return false; @@ -2488,7 +2567,8 @@ bool DwarfDebug::extractScopeInformation() { // current instruction scope does not match scope of first instruction // in this range then create a new instruction range. 
DbgRange R(RangeBeginMI, PrevMI); - MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, PrevInlinedAt); + MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, + PrevInlinedAt); MIRanges.push_back(R); } @@ -2565,7 +2645,6 @@ void DwarfDebug::identifyScopeMarkers() { RE = Ranges.end(); RI != RE; ++RI) { assert(RI->first && "DbgRange does not have first instruction!"); assert(RI->second && "DbgRange does not have second instruction!"); - InsnsBeginScopeSet.insert(RI->first); InsnsEndScopeSet.insert(RI->second); } } @@ -2616,6 +2695,9 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { recordSourceLine(Line, Col, Scope); + /// ProcessedArgs - Collection of arguments already processed. + SmallPtrSet<const MDNode *, 8> ProcessedArgs; + DebugLoc PrevLoc; for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; ++I) @@ -2624,14 +2706,19 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { const MachineInstr *MI = II; DebugLoc DL = MI->getDebugLoc(); if (MI->isDebugValue()) { - // DBG_VALUE needs a label if the variable is local variable or - // an argument whose location is changing. assert (MI->getNumOperands() > 1 && "Invalid machine instruction!"); DIVariable DV(MI->getOperand(MI->getNumOperands() - 1).getMetadata()); if (!DV.Verify()) continue; - if (DV.getTag() != dwarf::DW_TAG_arg_variable) + // If DBG_VALUE is for a local variable then it needs a label. + if (DV.getTag() != dwarf::DW_TAG_arg_variable + && isDbgValueInUndefinedReg(MI) == false) InsnNeedsLabel.insert(MI); - else if (!ProcessedArgs.insert(DV)) + // DBG_VALUE for inlined functions argument needs a label. + else if (!DISubprogram(getDISubprogram(DV.getContext())). + describes(MF->getFunction())) + InsnNeedsLabel.insert(MI); + // DBG_VALUE indicating argument location change needs a label. + else if (isDbgValueInUndefinedReg(MI) == false && !ProcessedArgs.insert(DV)) InsnNeedsLabel.insert(MI); } else { // If location is unknown then instruction needs a location only if @@ -2664,7 +2751,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Assumes in correct section after the entry point. Asm->OutStreamer.EmitLabel(FunctionEndSym); - collectVariableInfo(MF); + SmallPtrSet<const MDNode *, 16> ProcessedVars; + collectVariableInfo(MF, ProcessedVars); // Get function line info. if (!Lines.empty()) { @@ -2679,9 +2767,31 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Construct abstract scopes. for (SmallVector<DbgScope *, 4>::iterator AI = AbstractScopesList.begin(), - AE = AbstractScopesList.end(); AI != AE; ++AI) - constructScopeDIE(*AI); - + AE = AbstractScopesList.end(); AI != AE; ++AI) { + DISubprogram SP((*AI)->getScopeNode()); + if (SP.Verify()) { + // Collect info for variables that were optimized out. 
+ StringRef FName = SP.getLinkageName(); + if (FName.empty()) + FName = SP.getName(); + const Module *M = MF->getFunction()->getParent(); + if (NamedMDNode *NMD = + M->getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(FName)))) { + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { + DIVariable DV(cast_or_null<MDNode>(NMD->getOperand(i))); + if (!DV || !ProcessedVars.insert(DV)) + continue; + DbgScope *Scope = AbstractScopes.lookup(DV.getContext()); + if (Scope) + Scope->addVariable(new DbgVariable(DV)); + } + } + } + if (ProcessedSPNodes.count((*AI)->getScopeNode()) == 0) + constructScopeDIE(*AI); + } + DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope); if (!DisableFramePointerElim(*MF)) @@ -2696,13 +2806,11 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Clear debug info CurrentFnDbgScope = NULL; InsnNeedsLabel.clear(); - ProcessedArgs.clear(); DbgVariableToFrameIndexMap.clear(); VarToAbstractVarMap.clear(); DbgVariableToDbgInstMap.clear(); DbgVariableLabelsMap.clear(); DeleteContainerSeconds(DbgScopeMap); - InsnsBeginScopeSet.clear(); InsnsEndScopeSet.clear(); ConcreteScopes.clear(); DeleteContainerSeconds(AbstractScopes); @@ -2764,7 +2872,8 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) { /// recordSourceLine - Register a source line with debug info. Returns the /// unique label that was emitted and which provides correspondence to /// the source line list. -MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S) { +MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, + const MDNode *S) { StringRef Dir; StringRef Fn; @@ -2790,6 +2899,16 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode Src = GetOrCreateSourceID(Dir, Fn); } +#if 0 + if (!Lines.empty()) { + SrcLineInfo lastSrcLineInfo = Lines.back(); + // Emitting sequential line records with the same line number (but + // different addresses) seems to confuse GDB. Avoid this. + if (lastSrcLineInfo.getLine() == Line) + return NULL; + } +#endif + MCSymbol *Label = MMI->getContext().CreateTempSymbol(); Lines.push_back(SrcLineInfo(Line, Col, Src, Label)); @@ -2898,7 +3017,8 @@ void DwarfDebug::EmitSectionLabels() { if (const MCSection *MacroInfo = TLOF.getDwarfMacroInfoSection()) EmitSectionSym(Asm, MacroInfo); - EmitSectionSym(Asm, TLOF.getDwarfLineSection()); + DwarfDebugLineSectionSym = + EmitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line"); EmitSectionSym(Asm, TLOF.getDwarfLocSection()); EmitSectionSym(Asm, TLOF.getDwarfPubNamesSection()); EmitSectionSym(Asm, TLOF.getDwarfPubTypesSection()); @@ -2961,6 +3081,11 @@ void DwarfDebug::emitDIE(DIE *Die) { 4); break; } + case dwarf::DW_AT_stmt_list: { + Asm->EmitLabelDifference(CurrentLineSectionSym, + DwarfDebugLineSectionSym, 4); + break; + } case dwarf::DW_AT_location: { if (UseDotDebugLocEntry.count(Die) != 0) { DIELabel *L = cast<DIELabel>(Values[i]); @@ -3106,6 +3231,8 @@ void DwarfDebug::emitDebugLines() { Asm->getObjFileLowering().getDwarfLineSection()); // Construct the section header. 
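+  // A label at the very start of the line section lets each compile unit's
+  // DW_AT_stmt_list be emitted as the difference between this label and
+  // section_line (see the DW_AT_stmt_list case in emitDIE above).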
+ CurrentLineSectionSym = Asm->GetTempSymbol("section_line_begin"); + Asm->OutStreamer.EmitLabel(CurrentLineSectionSym); Asm->OutStreamer.AddComment("Length of Source Line Info"); Asm->EmitLabelDifference(Asm->GetTempSymbol("line_end"), Asm->GetTempSymbol("line_begin"), 4); @@ -3491,8 +3618,9 @@ void DwarfDebug::emitDebugLoc() { unsigned char Size = Asm->getTargetData().getPointerSize(); Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", 0)); unsigned index = 1; - for (SmallVector<DotDebugLocEntry, 4>::iterator I = DotDebugLocEntries.begin(), - E = DotDebugLocEntries.end(); I != E; ++I, ++index) { + for (SmallVector<DotDebugLocEntry, 4>::iterator + I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end(); + I != E; ++I, ++index) { DotDebugLocEntry Entry = *I; if (Entry.isEmpty()) { Asm->OutStreamer.EmitIntValue(0, Size, /*addrspace*/0); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 0d6116f..5a281c8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -156,6 +156,9 @@ class DwarfDebug { /// not included DbgScopeMap. AbstractScopes owns its DbgScope*s. DenseMap<const MDNode *, DbgScope *> AbstractScopes; + /// AbstractSPDies - Collection of abstract subprogram DIEs. + DenseMap<const MDNode *, DIE *> AbstractSPDies; + /// AbstractScopesList - Tracks abstract scopes constructed while processing /// a function. This list is cleared during endFunction(). SmallVector<DbgScope *, 4>AbstractScopesList; @@ -210,7 +213,7 @@ class DwarfDebug { DenseMap<DIE *, const MDNode *> ContainingTypeMap; typedef SmallVector<DbgScope *, 2> ScopeVector; - SmallPtrSet<const MachineInstr *, 8> InsnsBeginScopeSet; + SmallPtrSet<const MachineInstr *, 8> InsnsEndScopeSet; /// InlineInfo - Keep track of inlined functions and their location. This @@ -219,6 +222,10 @@ class DwarfDebug { DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> > InlineInfo; SmallVector<const MDNode *, 4> InlinedSPNodes; + // ProcessedSPNodes - This is a collection of subprogram MDNodes that + // are processed to create DIEs. + SmallPtrSet<const MDNode *, 16> ProcessedSPNodes; + /// LabelsBeforeInsn - Maps instruction with label emitted before /// instruction. DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn; @@ -231,9 +238,6 @@ class DwarfDebug { /// a debuggging information entity. SmallPtrSet<const MachineInstr *, 8> InsnNeedsLabel; - /// ProcessedArgs - Collection of arguments already processed. - SmallPtrSet<const MDNode *, 8> ProcessedArgs; - SmallVector<const MCSymbol *, 8> DebugRangeSymbols; /// Previous instruction's location information. This is used to determine @@ -257,7 +261,10 @@ class DwarfDebug { MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym; MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym; MCSymbol *DwarfDebugLocSectionSym; + MCSymbol *DwarfDebugLineSectionSym, *CurrentLineSectionSym; MCSymbol *FunctionBeginSym, *FunctionEndSym; + + DIEInteger *DIEIntegerOne; private: /// getSourceDirectoryAndFileIds - Return the directory and file ids that @@ -593,7 +600,8 @@ private: bool extractScopeInformation(); /// collectVariableInfo - Populate DbgScope entries with variables' info. - void collectVariableInfo(const MachineFunction *); + void collectVariableInfo(const MachineFunction *, + SmallPtrSet<const MDNode *, 16> &ProcessedVars); /// collectVariableInfoFromMMITable - Collect variable information from /// side table maintained by MMI. 
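The AsmPrinter and DwarfDebug hunks above repeatedly collapse the pattern
"if (TimePassesIsEnabled) { NamedRegionTimer T(...); work(); } else { work(); }"
into a single RAII timer that takes the enable flag as a constructor argument.
A minimal sketch of that idiom, using a hypothetical RegionTimer stand-in (the
real class is LLVM's NamedRegionTimer from Support/Timer.h):

    #include <chrono>
    #include <cstdio>
    #include <string>

    // Stand-in for NamedRegionTimer: times a region only when the enable
    // flag passed to the constructor is set, so callers need no if/else.
    class RegionTimer {
      std::string Name;
      bool Enabled;
      std::chrono::steady_clock::time_point Start;
    public:
      RegionTimer(const std::string &N, bool Enable)
          : Name(N), Enabled(Enable) {
        if (Enabled) Start = std::chrono::steady_clock::now();
      }
      ~RegionTimer() {
        if (!Enabled) return;
        auto US = std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::steady_clock::now() - Start).count();
        std::fprintf(stderr, "%s: %lld us\n", Name.c_str(), (long long)US);
      }
    };

    void emitFunctionDebugInfo(bool TimePassesIsEnabled) {
      // One declaration replaces the duplicated if/else around each call.
      RegionTimer T("DWARF Debug Writer", TimePassesIsEnabled);
      // DD->beginFunction(MF);  // the timed work goes here
    }

The design point is that disabling timing costs only a branch per region, while
every call site shrinks from six lines to two.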
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index f92127f..c8a63cf 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -52,13 +52,13 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) { SymName.append(MId.begin(), std::find(MId.begin(), MId.end(), '.')); SymName += "__"; SymName += Id; - + // Capitalize the first letter of the module name. SymName[Letter] = toupper(SymName[Letter]); - + SmallString<128> TmpStr; AP.Mang->getNameWithPrefix(TmpStr, SymName); - + MCSymbol *Sym = AP.OutContext.GetOrCreateSymbol(TmpStr); AP.OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 9dec22e..7f98df0 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -358,23 +358,10 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } /// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest. This -/// returns true if OldInst's block is modified, false if NewDest is modified. +/// after it, replacing it with an unconditional branch to NewDest. void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest) { - MachineBasicBlock *OldBB = OldInst->getParent(); - - // Remove all the old successors of OldBB from the CFG. - while (!OldBB->succ_empty()) - OldBB->removeSuccessor(OldBB->succ_begin()); - - // Remove all the dead instructions from the end of OldBB. - OldBB->erase(OldInst, OldBB->end()); - - // If OldBB isn't immediately before OldBB, insert a branch to it. - if (++MachineFunction::iterator(OldBB) != MachineFunction::iterator(NewDest)) - TII->InsertBranch(*OldBB, NewDest, 0, SmallVector<MachineOperand, 0>()); - OldBB->addSuccessor(NewDest); + TII->ReplaceTailWithBranchTo(OldInst, NewDest); ++NumTailMerge; } @@ -383,6 +370,9 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, /// iterator. This returns the new MBB. MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1) { + if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1)) + return 0; + MachineFunction &MF = *CurMBB.getParent(); // Create the fall-through block. @@ -443,18 +433,20 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB)); MachineBasicBlock *TBB = 0, *FBB = 0; SmallVector<MachineOperand, 4> Cond; + DebugLoc dl; // FIXME: this is nowhere if (I != MF->end() && !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { MachineBasicBlock *NextBB = I; if (TBB == NextBB && !Cond.empty() && !FBB) { if (!TII->ReverseBranchCondition(Cond)) { TII->RemoveBranch(*CurMBB); - TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond); + TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond, dl); return; } } } - TII->InsertBranch(*CurMBB, SuccBB, NULL, SmallVector<MachineOperand, 0>()); + TII->InsertBranch(*CurMBB, SuccBB, NULL, + SmallVector<MachineOperand, 0>(), dl); } bool @@ -625,9 +617,10 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, /// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist /// only of the common tail. Create a block that does by splitting one. 
-unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, - unsigned maxCommonTailLength) { - unsigned commonTailIndex = 0; +bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, + unsigned maxCommonTailLength, + unsigned &commonTailIndex) { + commonTailIndex = 0; unsigned TimeEstimate = ~0U; for (unsigned i = 0, e = SameTails.size(); i != e; ++i) { // Use PredBB if possible; that doesn't require a new branch. @@ -655,6 +648,11 @@ unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, << maxCommonTailLength); MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI); + if (!newMBB) { + DEBUG(dbgs() << "... failed!"); + return false; + } + SameTails[commonTailIndex].setBlock(newMBB); SameTails[commonTailIndex].setTailStartPos(newMBB->begin()); @@ -662,7 +660,7 @@ unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, if (PredBB == MBB) PredBB = newMBB; - return commonTailIndex; + return true; } // See if any of the blocks in MergePotentials (which all have a common single @@ -757,7 +755,11 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, !SameTails[commonTailIndex].tailIsWholeBlock())) { // None of the blocks consist entirely of the common tail. // Split a block so that one does. - commonTailIndex = CreateCommonTailOnlyBlock(PredBB, maxCommonTailLength); + if (!CreateCommonTailOnlyBlock(PredBB, + maxCommonTailLength, commonTailIndex)) { + RemoveBlocksWithHash(CurHash, SuccBB, PredBB); + continue; + } } MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); @@ -874,10 +876,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { } // Remove the unconditional branch at the end, if any. if (TBB && (Cond.empty() || FBB)) { + DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now - TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond); + TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); } MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } @@ -976,6 +979,7 @@ static bool IsBetterFallthrough(MachineBasicBlock *MBB1, bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); + DebugLoc dl; // FIXME: this is nowhere ReoptimizeBlock: MachineFunction::iterator FallThrough = MBB; @@ -1027,7 +1031,7 @@ ReoptimizeBlock: TII->RemoveBranch(PrevBB); PriorCond.clear(); if (PriorTBB != MBB) - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1066,7 +1070,7 @@ ReoptimizeBlock: // the condition is false, remove the uncond second branch. 
if (PriorFBB == MBB) { TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1079,7 +1083,7 @@ ReoptimizeBlock: SmallVector<MachineOperand, 4> NewPriorCond(PriorCond); if (!TII->ReverseBranchCondition(NewPriorCond)) { TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond); + TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1116,7 +1120,7 @@ ReoptimizeBlock: << "To make fallthrough to: " << *PriorTBB << "\n"); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond); + TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl); // Move this block to the end of the function. MBB->moveAfter(--MF.end()); @@ -1145,7 +1149,7 @@ ReoptimizeBlock: SmallVector<MachineOperand, 4> NewCond(CurCond); if (!TII->ReverseBranchCondition(NewCond)) { TII->RemoveBranch(*MBB); - TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond); + TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1200,7 +1204,7 @@ ReoptimizeBlock: PriorFBB = MBB; } TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, dl); } // Iterate through all the predecessors, revectoring each in-turn. @@ -1226,7 +1230,7 @@ ReoptimizeBlock: if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) { TII->RemoveBranch(*PMBB); NewCurCond.clear(); - TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond); + TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, dl); MadeChange = true; ++NumBranchOpts; PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false); @@ -1246,7 +1250,7 @@ ReoptimizeBlock: } // Add the branch back if the block is more than just an uncond branch. 
- TII->InsertBranch(*MBB, CurTBB, 0, CurCond); + TII->InsertBranch(*MBB, CurTBB, 0, CurCond, dl); } } @@ -1286,7 +1290,7 @@ ReoptimizeBlock: if (CurFallsThru) { MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); CurCond.clear(); - TII->InsertBranch(*MBB, NextBB, 0, CurCond); + TII->InsertBranch(*MBB, NextBB, 0, CurCond, dl); } MBB->moveAfter(PredBB); MadeChange = true; diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index b087395..15dfa7f 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -102,8 +102,9 @@ namespace llvm { MachineBasicBlock *PredBB); void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); - unsigned CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, - unsigned maxCommonTailLength); + bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, + unsigned maxCommonTailLength, + unsigned &commonTailIndex); bool OptimizeBranches(MachineFunction &MF); bool OptimizeBlock(MachineBasicBlock *MBB); diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 3e38872..ffeff1e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -1,19 +1,20 @@ add_llvm_library(LLVMCodeGen - Analysis.cpp AggressiveAntiDepBreaker.cpp + Analysis.cpp BranchFolding.cpp CalcSpillWeights.cpp + CallingConvLower.cpp CodePlacementOpt.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DwarfEHPrepare.cpp ELFCodeEmitter.cpp ELFWriter.cpp - ExactHazardRecognizer.cpp GCMetadata.cpp GCMetadataPrinter.cpp GCStrategy.cpp IfConversion.cpp + InlineSpiller.cpp IntrinsicLowering.cpp LLVMTargetMachine.cpp LatencyPriorityQueue.cpp @@ -45,6 +46,7 @@ add_llvm_library(LLVMCodeGen OptimizePHIs.cpp PHIElimination.cpp Passes.cpp + PostRAHazardRecognizer.cpp PostRASchedulerList.cpp PreAllocSplitting.cpp ProcessImplicitDefs.cpp @@ -52,7 +54,6 @@ add_llvm_library(LLVMCodeGen PseudoSourceValue.cpp RegAllocFast.cpp RegAllocLinearScan.cpp - RegAllocLocal.cpp RegAllocPBQP.cpp RegisterCoalescer.cpp RegisterScavenging.cpp diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index a328d0e..240a7b9 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -116,7 +116,7 @@ bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) { SmallVector<LiveInterval*, 4> spillIs; if (lis->isReMaterializable(li, spillIs, isLoad)) { // If all of the definitions of the interval are re-materializable, - // it is a preferred candidate for spilling. If non of the defs are + // it is a preferred candidate for spilling. If none of the defs are // loads, then it's potentially very cheap to re-materialize. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. diff --git a/lib/CodeGen/SelectionDAG/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 4e6c1fc..62ad817 100644 --- a/lib/CodeGen/SelectionDAG/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -80,13 +80,12 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins, /// CheckReturn - Analyze the return values of a function, returning true if /// the return can be performed without sret-demotion, and false otherwise. -bool CCState::CheckReturn(const SmallVectorImpl<EVT> &OutTys, - const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, +bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { // Determine which register each value should be copied into. 
- for (unsigned i = 0, e = OutTys.size(); i != e; ++i) { - EVT VT = OutTys[i]; - ISD::ArgFlagsTy ArgFlags = ArgsFlags[i]; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + EVT VT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) return false; } @@ -99,7 +98,7 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { // Determine which register each value should be copied into. for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - EVT VT = Outs[i].Val.getValueType(); + EVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) { #ifndef NDEBUG @@ -111,14 +110,13 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, } } - /// AnalyzeCallOperands - Analyze the outgoing arguments to a call, /// incorporating info about the passed values into this state. void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn) { unsigned NumOps = Outs.size(); for (unsigned i = 0; i != NumOps; ++i) { - EVT ArgVT = Outs[i].Val.getValueType(); + EVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { #ifndef NDEBUG diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp index 3ff2a04..e0e315c 100644 --- a/lib/CodeGen/CodePlacementOpt.cpp +++ b/lib/CodeGen/CodePlacementOpt.cpp @@ -178,6 +178,8 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF, continue; // Move the block. + DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << Pred->getNumber() + << " to top of loop.\n"); Changed = true; // Move it and all the blocks that can reach it via fallthrough edges @@ -297,6 +299,8 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF, continue; // Move the block. + DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << BB->getNumber() + << " to be contiguous with loop.\n"); Changed = true; // Process this block and all loop blocks contiguous with it, to keep diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index fd957b1..e3746a9 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -29,6 +30,7 @@ CriticalAntiDepBreaker:: CriticalAntiDepBreaker(MachineFunction& MFi) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), + TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), AllocatableSet(TRI->getAllocatableSet(MF)) { @@ -71,25 +73,27 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { DefIndices[AliasReg] = ~0u; } } - } else { - // In a non-return block, examine the live-in regs of all successors. - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + } + + // In a non-return block, examine the live-in regs of all successors. + // Note a return block can have successors if the return instruction is + // predicated. 
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
          SE = BB->succ_end(); SI != SE; ++SI)
-      for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
            E = (*SI)->livein_end(); I != E; ++I) {
-        unsigned Reg = *I;
-        Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
-        KillIndices[Reg] = BB->size();
-        DefIndices[Reg] = ~0u;
-        // Repeat, for all aliases.
-        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
-          unsigned AliasReg = *Alias;
-          Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
-          KillIndices[AliasReg] = BB->size();
-          DefIndices[AliasReg] = ~0u;
-        }
+      unsigned Reg = *I;
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[Reg] = BB->size();
+      DefIndices[Reg] = ~0u;
+      // Repeat, for all aliases.
+      for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
+        unsigned AliasReg = *Alias;
+        Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        KillIndices[AliasReg] = BB->size();
+        DefIndices[AliasReg] = ~0u;
      }
-    }
+    }
 
   // Mark live-out callee-saved registers. In a return block this is
   // all callee-saved registers. In non-return this is any
@@ -164,6 +168,26 @@ static const SDep *CriticalPathStep(const SUnit *SU) {
 }
 
 void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
+  // It's not safe to change register allocation for source operands of
+  // instructions that have special allocation requirements. Also assume all
+  // registers used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instructions is more complex. We are
+  // being conservative here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->getDesc().isCall() ||
+    MI->getDesc().hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
   // Scan the register operands for this instruction and update
   // Classes and RegRefs.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -199,9 +223,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
     if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
       RegRefs.insert(std::make_pair(Reg, &MO));
 
-    // It's not safe to change register allocation for source operands of
-    // instructions that have special allocation requirements.
-    if (MO.isUse() && MI->getDesc().hasExtraSrcRegAllocReq()) {
+    if (MO.isUse() && Special) {
       if (KeepRegs.insert(Reg)) {
         for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
              *Subreg; ++Subreg)
@@ -216,38 +238,43 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
   // Update liveness.
   // Proceeding upwards, registers that are defined but not used in this
   // instruction are now dead.
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg()) continue;
-    unsigned Reg = MO.getReg();
-    if (Reg == 0) continue;
-    if (!MO.isDef()) continue;
-    // Ignore two-addr defs.
- if (MI->isRegTiedToUseOperand(i)) continue; - - DefIndices[Reg] = Count; - KillIndices[Reg] = ~0u; - assert(((KillIndices[Reg] == ~0u) != - (DefIndices[Reg] == ~0u)) && - "Kill and Def maps aren't consistent for Reg!"); - KeepRegs.erase(Reg); - Classes[Reg] = 0; - RegRefs.erase(Reg); - // Repeat, for all subregs. - for (const unsigned *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; - DefIndices[SubregReg] = Count; - KillIndices[SubregReg] = ~0u; - KeepRegs.erase(SubregReg); - Classes[SubregReg] = 0; - RegRefs.erase(SubregReg); - } - // Conservatively mark super-registers as unusable. - for (const unsigned *Super = TRI->getSuperRegisters(Reg); - *Super; ++Super) { - unsigned SuperReg = *Super; - Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1); + + if (!TII->isPredicated(MI)) { + // Predicated defs are modeled as read + write, i.e. similar to two + // address updates. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + if (!MO.isDef()) continue; + // Ignore two-addr defs. + if (MI->isRegTiedToUseOperand(i)) continue; + + DefIndices[Reg] = Count; + KillIndices[Reg] = ~0u; + assert(((KillIndices[Reg] == ~0u) != + (DefIndices[Reg] == ~0u)) && + "Kill and Def maps aren't consistent for Reg!"); + KeepRegs.erase(Reg); + Classes[Reg] = 0; + RegRefs.erase(Reg); + // Repeat, for all subregs. + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) { + unsigned SubregReg = *Subreg; + DefIndices[SubregReg] = Count; + KillIndices[SubregReg] = ~0u; + KeepRegs.erase(SubregReg); + Classes[SubregReg] = 0; + RegRefs.erase(SubregReg); + } + // Conservatively mark super-registers as unusable. + for (const unsigned *Super = TRI->getSuperRegisters(Reg); + *Super; ++Super) { + unsigned SuperReg = *Super; + Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1); + } } } for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -334,10 +361,15 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, // so just duck out immediately if the block is empty. if (SUnits.empty()) return 0; + // Keep a map of the MachineInstr*'s back to the SUnit representing them. + // This is used for updating debug information. + DenseMap<MachineInstr*,const SUnit*> MISUnitMap; + // Find the node at the bottom of the critical path. const SUnit *Max = 0; for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { const SUnit *SU = &SUnits[i]; + MISUnitMap[SU->getInstr()] = SU; if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency) Max = SU; } @@ -473,7 +505,11 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, PrescanInstruction(MI); - if (MI->getDesc().hasExtraDefRegAllocReq()) + // If MI's defs have a special allocation requirement, don't allow + // any def registers to be changed. Also assume all registers + // defined in a call must not be changed (ABI). + if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() || + TII->isPredicated(MI)) // If this instruction's defs have special allocation requirement, don't // break this anti-dependency. 
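// (Here AntiDepReg == 0 serves as the breaker's "no candidate" sentinel:
// register number 0 means no register, so the assignment below simply
// abandons any rename for this instruction rather than risk violating the
// ABI or predication constraints named above.)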
AntiDepReg = 0; @@ -485,7 +521,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg == 0) continue; - if (MO.isUse() && AntiDepReg == Reg) { + if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) { AntiDepReg = 0; break; } @@ -519,8 +555,22 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, std::multimap<unsigned, MachineOperand *>::iterator> Range = RegRefs.equal_range(AntiDepReg); for (std::multimap<unsigned, MachineOperand *>::iterator - Q = Range.first, QE = Range.second; Q != QE; ++Q) + Q = Range.first, QE = Range.second; Q != QE; ++Q) { Q->second->setReg(NewReg); + // If the SU for the instruction being updated has debug information + // related to the anti-dependency register, make sure to update that + // as well. + const SUnit *SU = MISUnitMap[Q->second->getParent()]; + if (!SU) continue; + for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { + MachineInstr *DI = SU->DbgInstrList[i]; + assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && + DI->getOperand(0).getReg() + && "Non register dbg_value attached to SUnit!"); + if (DI->getOperand(0).getReg() == AntiDepReg) + DI->getOperand(0).setReg(NewReg); + } + } // We just went back in time and modified history; the // liveness information for the anti-depenence reg is now diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index cc42dd2..5406300 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -22,15 +22,18 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include <map> namespace llvm { +class TargetInstrInfo; +class TargetRegisterInfo; + class CriticalAntiDepBreaker : public AntiDepBreaker { MachineFunction& MF; MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; /// AllocatableSet - The set of allocatable registers. diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index f6739f4..01b31b4 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" @@ -87,10 +88,13 @@ namespace { /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still /// use the ".llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. - bool CleanupSelectors(); + bool CleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels); + + bool HasCatchAllInSelector(IntrinsicInst *); /// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. - void FindAllCleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels); + void FindAllCleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels, + SmallPtrSet<IntrinsicInst*, 32> &CatchAllSels); /// FindAllURoRInvokes - Find all URoR invokes in the function. 
void FindAllURoRInvokes(SmallPtrSet<InvokeInst*, 32> &URoRInvokes); @@ -150,7 +154,7 @@ namespace { Changed = true; } - return false; + return Changed; } public: @@ -186,25 +190,32 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm, bool fast) { return new DwarfEHPrepare(tm, fast); } +/// HasCatchAllInSelector - Return true if the intrinsic instruction has a +/// catch-all. +bool DwarfEHPrepare::HasCatchAllInSelector(IntrinsicInst *II) { + if (!EHCatchAllValue) return false; + + unsigned ArgIdx = II->getNumArgOperands() - 1; + GlobalVariable *GV = dyn_cast<GlobalVariable>(II->getArgOperand(ArgIdx)); + return GV == EHCatchAllValue; +} + /// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. void DwarfEHPrepare:: -FindAllCleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels) { +FindAllCleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels, + SmallPtrSet<IntrinsicInst*, 32> &CatchAllSels) { for (Value::use_iterator I = SelectorIntrinsic->use_begin(), E = SelectorIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *SI = cast<IntrinsicInst>(I); - if (!SI || SI->getParent()->getParent() != F) continue; - - unsigned NumOps = SI->getNumOperands(); - if (NumOps > 4) continue; - bool IsCleanUp = (NumOps == 3); + IntrinsicInst *II = cast<IntrinsicInst>(I); - if (!IsCleanUp) - if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getOperand(3))) - IsCleanUp = (CI->getZExtValue() == 0); + if (II->getParent()->getParent() != F) + continue; - if (IsCleanUp) - Sels.insert(SI); + if (!HasCatchAllInSelector(II)) + Sels.insert(II); + else + CatchAllSels.insert(II); } } @@ -222,7 +233,7 @@ FindAllURoRInvokes(SmallPtrSet<InvokeInst*, 32> &URoRInvokes) { /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still use /// the ".llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. -bool DwarfEHPrepare::CleanupSelectors() { +bool DwarfEHPrepare::CleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels) { if (!EHCatchAllValue) return false; if (!SelectorIntrinsic) { @@ -232,17 +243,15 @@ bool DwarfEHPrepare::CleanupSelectors() { } bool Changed = false; - for (Value::use_iterator - I = SelectorIntrinsic->use_begin(), - E = SelectorIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *Sel = dyn_cast<IntrinsicInst>(I); - if (!Sel || Sel->getParent()->getParent() != F) continue; + for (SmallPtrSet<IntrinsicInst*, 32>::iterator + I = Sels.begin(), E = Sels.end(); I != E; ++I) { + IntrinsicInst *Sel = *I; // Index of the ".llvm.eh.catch.all.value" variable. - unsigned OpIdx = Sel->getNumOperands() - 1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(Sel->getOperand(OpIdx)); + unsigned OpIdx = Sel->getNumArgOperands() - 1; + GlobalVariable *GV = dyn_cast<GlobalVariable>(Sel->getArgOperand(OpIdx)); if (GV != EHCatchAllValue) continue; - Sel->setOperand(OpIdx, EHCatchAllValue->getInitializer()); + Sel->setArgOperand(OpIdx, EHCatchAllValue->getInitializer()); Changed = true; } @@ -293,8 +302,6 @@ DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, /// function. This is a candidate to merge the selector associated with the URoR /// invoke with the one from the URoR's landing pad. bool DwarfEHPrepare::HandleURoRInvokes() { - if (!DT) return CleanupSelectors(); // We require DominatorTree information. 
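// The clean-up/catch-all split above rests on one property of the rewritten
// llvm.eh.selector calls: a catch-all selector carries the
// ".llvm.eh.catch.all.value" global as its final argument. A minimal sketch
// of the test, mirroring HasCatchAllInSelector from this patch (the helper
// name here is illustrative only):
static bool hasCatchAllMarker(IntrinsicInst *II,
                              GlobalVariable *EHCatchAllValue) {
  unsigned LastArg = II->getNumArgOperands() - 1;
  // A non-global final argument yields a null dyn_cast, which compares
  // unequal to the catch-all global.
  return dyn_cast<GlobalVariable>(II->getArgOperand(LastArg)) ==
         EHCatchAllValue;
}
// Selectors without the marker are queued in Sels as clean-ups; those with
// it go to CatchAllSels and are later rewritten by CleanupSelectors.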
- if (!EHCatchAllValue) { EHCatchAllValue = F->getParent()->getNamedGlobal(".llvm.eh.catch.all.value"); @@ -307,14 +314,20 @@ bool DwarfEHPrepare::HandleURoRInvokes() { if (!SelectorIntrinsic) return false; } + SmallPtrSet<IntrinsicInst*, 32> Sels; + SmallPtrSet<IntrinsicInst*, 32> CatchAllSels; + FindAllCleanupSelectors(Sels, CatchAllSels); + + if (!DT) + // We require DominatorTree information. + return CleanupSelectors(CatchAllSels); + if (!URoR) { URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow"); - if (!URoR) return CleanupSelectors(); + if (!URoR) return CleanupSelectors(CatchAllSels); } - SmallPtrSet<IntrinsicInst*, 32> Sels; SmallPtrSet<InvokeInst*, 32> URoRInvokes; - FindAllCleanupSelectors(Sels); FindAllURoRInvokes(URoRInvokes); SmallPtrSet<IntrinsicInst*, 32> SelsToConvert; @@ -340,7 +353,8 @@ bool DwarfEHPrepare::HandleURoRInvokes() { if (!ExceptionValueIntrinsic) { ExceptionValueIntrinsic = Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_exception); - if (!ExceptionValueIntrinsic) return CleanupSelectors(); + if (!ExceptionValueIntrinsic) + return CleanupSelectors(CatchAllSels); } for (Value::use_iterator @@ -360,21 +374,9 @@ bool DwarfEHPrepare::HandleURoRInvokes() { // an eh.selector intrinsic call. If the eh.selector is a 'clean-up', we // need to convert it to a 'catch-all'. for (SmallPtrSet<IntrinsicInst*, 8>::iterator - SI = SelCalls.begin(), SE = SelCalls.end(); SI != SE; ++SI) { - IntrinsicInst *II = *SI; - unsigned NumOps = II->getNumOperands(); - - if (NumOps <= 4) { - bool IsCleanUp = (NumOps == 3); - - if (!IsCleanUp) - if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getOperand(3))) - IsCleanUp = (CI->getZExtValue() == 0); - - if (IsCleanUp) - SelsToConvert.insert(II); - } - } + SI = SelCalls.begin(), SE = SelCalls.end(); SI != SE; ++SI) + if (!HasCatchAllInSelector(*SI)) + SelsToConvert.insert(*SI); } } } @@ -388,12 +390,22 @@ bool DwarfEHPrepare::HandleURoRInvokes() { SI = SelsToConvert.begin(), SE = SelsToConvert.end(); SI != SE; ++SI) { IntrinsicInst *II = *SI; - SmallVector<Value*, 8> Args; // Use the exception object pointer and the personality function // from the original selector. - Args.push_back(II->getOperand(1)); // Exception object pointer. - Args.push_back(II->getOperand(2)); // Personality function. + CallSite CS(II); + IntrinsicInst::op_iterator I = CS.arg_begin(); + IntrinsicInst::op_iterator E = CS.arg_end(); + IntrinsicInst::op_iterator B = prior(E); + + // Exclude last argument if it is an integer. + if (isa<ConstantInt>(B)) E = B; + + // Add exception object pointer (front). + // Add personality function (next). + // Add in any filter IDs (rest). + SmallVector<Value*, 8> Args(I, E); + Args.push_back(EHCatchAllValue->getInitializer()); // Catch-all indicator. 
CallInst *NewSelector = @@ -409,7 +421,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() { } } - Changed |= CleanupSelectors(); + Changed |= CleanupSelectors(CatchAllSels); return Changed; } diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp index 8416d3b..36b0e65 100644 --- a/lib/CodeGen/ELFCodeEmitter.cpp +++ b/lib/CodeGen/ELFCodeEmitter.cpp @@ -90,7 +90,7 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { for (std::vector<MachineRelocation>::iterator MRI = JTRelocations.begin(), MRE = JTRelocations.end(); MRI != MRE; ++MRI) { MachineRelocation &MR = *MRI; - unsigned MBBOffset = getMachineBasicBlockAddress(MR.getBasicBlock()); + uintptr_t MBBOffset = getMachineBasicBlockAddress(MR.getBasicBlock()); MR.setResultPointer((void*)MBBOffset); MR.setConstantVal(ES->SectionIdx); JTSection.addRelocation(MR); diff --git a/lib/CodeGen/ExactHazardRecognizer.h b/lib/CodeGen/ExactHazardRecognizer.h deleted file mode 100644 index 91c81a9..0000000 --- a/lib/CodeGen/ExactHazardRecognizer.h +++ /dev/null @@ -1,86 +0,0 @@ -//=- llvm/CodeGen/ExactHazardRecognizer.h - Scheduling Support -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ExactHazardRecognizer class, which -// implements hazard-avoidance heuristics for scheduling, based on the -// scheduling itineraries specified for the target. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_EXACTHAZARDRECOGNIZER_H -#define LLVM_CODEGEN_EXACTHAZARDRECOGNIZER_H - -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetInstrItineraries.h" - -namespace llvm { - class ExactHazardRecognizer : public ScheduleHazardRecognizer { - // ScoreBoard to track function unit usage. ScoreBoard[0] is a - // mask of the FUs in use in the cycle currently being - // schedule. ScoreBoard[1] is a mask for the next cycle. The - // ScoreBoard is used as a circular buffer with the current cycle - // indicated by Head. - class ScoreBoard { - unsigned *Data; - - // The maximum number of cycles monitored by the Scoreboard. This - // value is determined based on the target itineraries to ensure - // that all hazards can be tracked. - size_t Depth; - // Indices into the Scoreboard that represent the current cycle. - size_t Head; - public: - ScoreBoard():Data(NULL), Depth(0), Head(0) { } - ~ScoreBoard() { - delete[] Data; - } - - size_t getDepth() const { return Depth; } - unsigned& operator[](size_t idx) const { - assert(Depth && "ScoreBoard was not initialized properly!"); - - return Data[(Head + idx) % Depth]; - } - - void reset(size_t d = 1) { - if (Data == NULL) { - Depth = d; - Data = new unsigned[Depth]; - } - - memset(Data, 0, Depth * sizeof(Data[0])); - Head = 0; - } - - void advance() { - Head = (Head + 1) % Depth; - } - - // Print the scoreboard. - void dump() const; - }; - - // Itinerary data for the target. 
- const InstrItineraryData &ItinData; - - ScoreBoard ReservedScoreboard; - ScoreBoard RequiredScoreboard; - - public: - ExactHazardRecognizer(const InstrItineraryData &ItinData); - - virtual HazardType getHazardType(SUnit *SU); - virtual void Reset(); - virtual void EmitInstruction(SUnit *SU); - virtual void AdvanceCycle(); - }; -} - -#endif diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 790cb21..71506cc 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -271,7 +271,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { case Intrinsic::gcwrite: if (LowerWr) { // Replace a write barrier with a simple store. - Value *St = new StoreInst(CI->getOperand(1), CI->getOperand(3), CI); + Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI); CI->replaceAllUsesWith(St); CI->eraseFromParent(); } @@ -279,7 +279,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { case Intrinsic::gcread: if (LowerRd) { // Replace a read barrier with a simple load. - Value *Ld = new LoadInst(CI->getOperand(2), "", CI); + Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI); Ld->takeName(CI); CI->replaceAllUsesWith(Ld); CI->eraseFromParent(); @@ -290,7 +290,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { // Initialize the GC root, but do not delete the intrinsic. The // backend needs the intrinsic to flag the stack slot. Roots.push_back(cast<AllocaInst>( - CI->getOperand(1)->stripPointerCasts())); + CI->getArgOperand(0)->stripPointerCasts())); } break; default: diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index c61fd17..6b445e0 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -20,6 +20,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -33,20 +34,22 @@ using namespace llvm; static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); static cl::opt<int> IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden); static cl::opt<int> IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden); -static cl::opt<bool> DisableSimple("disable-ifcvt-simple", +static cl::opt<bool> DisableSimple("disable-ifcvt-simple", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false", +static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle", +static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev", +static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false", +static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev", +static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond", +static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond", cl::init(false), cl::Hidden); +static cl::opt<bool> IfCvtBranchFold("ifcvt-branch-fold", 
+ cl::init(true), cl::Hidden);

STATISTIC(NumSimple, "Number of simple if-conversions performed");
STATISTIC(NumSimpleFalse, "Number of simple (F) if-conversions performed");
@@ -115,7 +118,7 @@ namespace {
 BB(0), TrueBB(0), FalseBB(0) {}
 };

- /// IfcvtToken - Record information about pending if-conversions to attemp:
+ /// IfcvtToken - Record information about pending if-conversions to attempt:
 /// BBI - Corresponding BBInfo.
 /// Kind - Type of block. See IfcvtKind.
 /// NeedSubsumption - True if the to-be-predicated BB has already been
@@ -146,6 +149,7 @@ namespace {
 const TargetLowering *TLI;
 const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
 bool MadeChange;
 int FnNum;
 public:
@@ -167,8 +171,7 @@ namespace {
 std::vector<IfcvtToken*> &Tokens);
 bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
 bool isTriangle = false, bool RevBranch = false);
- bool AnalyzeBlocks(MachineFunction &MF,
- std::vector<IfcvtToken*> &Tokens);
+ void AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens);
 void InvalidatePreds(MachineBasicBlock *BB);
 void RemoveExtraEdges(BBInfo &BBI);
 bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
@@ -177,14 +180,22 @@ namespace {
 unsigned NumDups1, unsigned NumDups2);
 void PredicateBlock(BBInfo &BBI,
 MachineBasicBlock::iterator E,
- SmallVectorImpl<MachineOperand> &Cond);
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<unsigned, 4> &Redefs);
 void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
 SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<unsigned, 4> &Redefs,
 bool IgnoreBr = false);
- void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI);
+ void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);

- bool MeetIfcvtSizeLimit(unsigned Size) const {
- return Size > 0 && Size <= TLI->getIfCvtBlockSizeLimit();
+ bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Size) const {
+ return Size > 0 && TII->isProfitableToIfCvt(BB, Size);
+ }
+
+ bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB, unsigned TSize,
+ MachineBasicBlock &FBB, unsigned FSize) const {
+ return TSize > 0 && FSize > 0 &&
+ TII->isProfitableToIfCvt(TBB, TSize, FBB, FSize);
 }

 // blockAlwaysFallThrough - Block ends without a terminator.
@@ -227,8 +238,15 @@ FunctionPass *llvm::createIfConverterPass() { return new IfConverter(); }
bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
 TLI = MF.getTarget().getTargetLowering();
 TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
 if (!TII) return false;

+ // Tail merging tends to expose more if-conversion opportunities.
+ BranchFolder BF(true);
+ bool BFChange = BF.OptimizeFunction(MF, TII,
+ MF.getTarget().getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>());
+
 DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
 << MF.getFunction()->getName() << "\'");
@@ -253,7 +271,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
 while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) {
 // Do an initial analysis for each basic block and find all the potential
 // candidates to perform if-conversion.
- bool Change = AnalyzeBlocks(MF, Tokens);
+ bool Change = false;
+ AnalyzeBlocks(MF, Tokens);
 while (!Tokens.empty()) {
 IfcvtToken *Token = Tokens.back();
 Tokens.pop_back();
@@ -281,7 +300,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
 case ICSimpleFalse: {
 bool isFalse = Kind == ICSimpleFalse;
 if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
- DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ?
" false" :"") + DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ? + " false" : "") << "): BB#" << BBI.BB->getNumber() << " (" << ((Kind == ICSimpleFalse) ? BBI.FalseBB->getNumber() @@ -289,8 +309,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { RetVal = IfConvertSimple(BBI, Kind); DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n"); if (RetVal) { - if (isFalse) NumSimpleFalse++; - else NumSimple++; + if (isFalse) ++NumSimpleFalse; + else ++NumSimple; } break; } @@ -316,11 +336,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n"); if (RetVal) { if (isFalse) { - if (isRev) NumTriangleFRev++; - else NumTriangleFalse++; + if (isRev) ++NumTriangleFRev; + else ++NumTriangleFalse; } else { - if (isRev) NumTriangleRev++; - else NumTriangle++; + if (isRev) ++NumTriangleRev; + else ++NumTriangle; } } break; @@ -332,7 +352,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { << BBI.FalseBB->getNumber() << ") "); RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2); DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n"); - if (RetVal) NumDiamonds++; + if (RetVal) ++NumDiamonds; break; } } @@ -361,13 +381,14 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { Roots.clear(); BBAnalysis.clear(); - if (MadeChange) { + if (MadeChange && IfCvtBranchFold) { BranchFolder BF(false); BF.OptimizeFunction(MF, TII, MF.getTarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); } + MadeChange |= BFChange; return MadeChange; } @@ -387,9 +408,10 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, /// ReverseBranchCondition - Reverse the condition of the end of the block /// branch. Swap block's 'true' and 'false' successors. bool IfConverter::ReverseBranchCondition(BBInfo &BBI) { + DebugLoc dl; // FIXME: this is nowhere if (!TII->ReverseBranchCondition(BBI.BrCond)) { TII->RemoveBranch(*BBI.BB); - TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond); + TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond, dl); std::swap(BBI.TrueBB, BBI.FalseBB); return true; } @@ -420,7 +442,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const { if (TrueBBI.BB->pred_size() > 1) { if (TrueBBI.CannotBeCopied || - TrueBBI.NonPredSize > TLI->getIfCvtDupBlockSizeLimit()) + !TII->isProfitableToDupForIfCvt(*TrueBBI.BB, TrueBBI.NonPredSize)) return false; Dups = TrueBBI.NonPredSize; } @@ -431,7 +453,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const { /// ValidTriangle - Returns true if the 'true' and 'false' blocks (along /// with their common predecessor) forms a valid triangle shape for ifcvt. /// If 'FalseBranch' is true, it checks if 'true' block's false branch -/// branches to the false branch rather than the other way around. It also +/// branches to the 'false' block rather than the other way around. It also /// returns the number of instructions that the ifcvt would need to duplicate /// if performed in 'Dups'. 
bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, @@ -457,7 +479,7 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, ++Size; } } - if (Size > TLI->getIfCvtDupBlockSizeLimit()) + if (!TII->isProfitableToDupForIfCvt(*TrueBBI.BB, Size)) return false; Dups = Size; } @@ -514,7 +536,27 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, MachineBasicBlock::iterator TI = TrueBBI.BB->begin(); MachineBasicBlock::iterator FI = FalseBBI.BB->begin(); - while (TI != TrueBBI.BB->end() && FI != FalseBBI.BB->end()) { + MachineBasicBlock::iterator TIE = TrueBBI.BB->end(); + MachineBasicBlock::iterator FIE = FalseBBI.BB->end(); + // Skip dbg_value instructions + while (TI != TIE && TI->isDebugValue()) + ++TI; + while (FI != FIE && FI->isDebugValue()) + ++FI; + while (TI != TIE && FI != FIE) { + // Skip dbg_value instructions. These do not count. + if (TI->isDebugValue()) { + while (TI != TIE && TI->isDebugValue()) + ++TI; + if (TI == TIE) + break; + } + if (FI->isDebugValue()) { + while (FI != FIE && FI->isDebugValue()) + ++FI; + if (FI == FIE) + break; + } if (!TI->isIdenticalTo(FI)) break; ++Dups1; @@ -524,7 +566,27 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, TI = firstNonBranchInst(TrueBBI.BB, TII); FI = firstNonBranchInst(FalseBBI.BB, TII); - while (TI != TrueBBI.BB->begin() && FI != FalseBBI.BB->begin()) { + MachineBasicBlock::iterator TIB = TrueBBI.BB->begin(); + MachineBasicBlock::iterator FIB = FalseBBI.BB->begin(); + // Skip dbg_value instructions at end of the bb's. + while (TI != TIB && TI->isDebugValue()) + --TI; + while (FI != FIB && FI->isDebugValue()) + --FI; + while (TI != TIB && FI != FIB) { + // Skip dbg_value instructions. These do not count. + if (TI->isDebugValue()) { + while (TI != TIB && TI->isDebugValue()) + --TI; + if (TI == TIB) + break; + } + if (FI->isDebugValue()) { + while (FI != FIB && FI->isDebugValue()) + --FI; + if (FI == FIB) + break; + } if (!TI->isIdenticalTo(FI)) break; ++Dups2; @@ -556,7 +618,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) { // No false branch. This BB must end with a conditional branch and a // fallthrough. if (!BBI.FalseBB) - BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB); + BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB); if (!BBI.FalseBB) { // Malformed bcc? True and false blocks are the same? 
BBI.IsUnpredicable = true; @@ -569,6 +631,9 @@ void IfConverter::ScanInstructions(BBInfo &BBI) { BBI.ClobbersPred = false; for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end(); I != E; ++I) { + if (I->isDebugValue()) + continue; + const TargetInstrDesc &TID = I->getDesc(); if (TID.isNotDuplicable()) BBI.CannotBeCopied = true; @@ -702,8 +767,8 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, bool FNeedSub = FalseBBI.Predicate.size() > 0; bool Enqueued = false; if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize - (Dups + Dups2)) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize - (Dups + Dups2)) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize - (Dups + Dups2), + *FalseBBI.BB, FalseBBI.NonPredSize - (Dups + Dups2)) && FeasibilityAnalysis(TrueBBI, BBI.BrCond) && FeasibilityAnalysis(FalseBBI, RevCond)) { // Diamond: @@ -720,7 +785,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, } if (ValidTriangle(TrueBBI, FalseBBI, false, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) { // Triangle: // EBB @@ -732,23 +797,23 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, Tokens.push_back(new IfcvtToken(BBI, ICTriangle, TNeedSub, Dups)); Enqueued = true; } - + if (ValidTriangle(TrueBBI, FalseBBI, true, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups)); Enqueued = true; } if (ValidSimple(TrueBBI, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond)) { // Simple (split, no rejoin): // EBB // | \_ // | | // | TBB---> exit - // | + // | // FBB Tokens.push_back(new IfcvtToken(BBI, ICSimple, TNeedSub, Dups)); Enqueued = true; @@ -757,21 +822,21 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, if (CanRevCond) { // Try the other path... if (ValidTriangle(FalseBBI, TrueBBI, false, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups)); Enqueued = true; } if (ValidTriangle(FalseBBI, TrueBBI, true, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond, true, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups)); Enqueued = true; } if (ValidSimple(FalseBBI, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond)) { Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups)); Enqueued = true; @@ -785,11 +850,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, } /// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion -/// candidates. It returns true if any CFG restructuring is done to expose more -/// if-conversion opportunities. -bool IfConverter::AnalyzeBlocks(MachineFunction &MF, +/// candidates. 
+void IfConverter::AnalyzeBlocks(MachineFunction &MF,
 std::vector<IfcvtToken*> &Tokens) {
- bool Change = false;
 std::set<MachineBasicBlock*> Visited;
 for (unsigned i = 0, e = Roots.size(); i != e; ++i) {
 for (idf_ext_iterator<MachineBasicBlock*> I=idf_ext_begin(Roots[i],Visited),
@@ -801,20 +864,23 @@ bool IfConverter::AnalyzeBlocks(MachineFunction &MF,
 // Sort to favor more complex ifcvt scheme.
 std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp);
-
- return Change;
}

/// canFallThroughTo - Returns true either if ToBB is the next block after BB or
/// that all the intervening blocks are empty (given BB can fall through to its
/// next block).
static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) {
- MachineFunction::iterator I = BB;
+ MachineFunction::iterator PI = BB;
+ MachineFunction::iterator I = llvm::next(PI);
 MachineFunction::iterator TI = ToBB;
 MachineFunction::iterator E = BB->getParent()->end();
- while (++I != TI)
- if (I == E || !I->empty())
+ while (I != TI) {
+ // Check isSuccessor to avoid the case where the next block is empty, but
+ // it's not a successor.
+ if (I == E || !I->empty() || !PI->isSuccessor(I))
 return false;
+ PI = I++;
+ }
 return true;
}
@@ -836,8 +902,9 @@ void IfConverter::InvalidatePreds(MachineBasicBlock *BB) {
///
static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB,
 const TargetInstrInfo *TII) {
+ DebugLoc dl; // FIXME: this is nowhere
 SmallVector<MachineOperand, 0> NoCond;
- TII->InsertBranch(*BB, ToBB, NULL, NoCond);
+ TII->InsertBranch(*BB, ToBB, NULL, NoCond, dl);
}

/// RemoveExtraEdges - Remove true / false edges if either / both are no longer
@@ -849,6 +916,66 @@ void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
 BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
}

+/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are
+/// modeled as read + write (sort of like two-address instructions). These
+/// routines track register liveness and add implicit uses to if-converted
+/// instructions to conform to the model.
+static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
+ const TargetRegisterInfo *TRI) {
+ for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
+ E = BB->livein_end(); I != E; ++I) {
+ unsigned Reg = *I;
+ Redefs.insert(Reg);
+ for (const unsigned *Subreg = TRI->getSubRegisters(Reg);
+ *Subreg; ++Subreg)
+ Redefs.insert(*Subreg);
+ }
+}
+
+static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
+ const TargetRegisterInfo *TRI,
+ bool AddImpUse = false) {
+ SmallVector<unsigned, 4> Defs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef())
+ Defs.push_back(Reg);
+ else if (MO.isKill()) {
+ Redefs.erase(Reg);
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+ Redefs.erase(*SR);
+ }
+ }
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ unsigned Reg = Defs[i];
+ if (Redefs.count(Reg)) {
+ if (AddImpUse)
+ // Treat predicated update as read + write.
+ MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
 true/*IsImp*/,false/*IsKill*/));
+ } else {
+ Redefs.insert(Reg);
+ for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+ Redefs.insert(*SR);
+ }
+ }
+}
+
+static void UpdatePredRedefs(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E,
+ SmallSet<unsigned,4> &Redefs,
+ const TargetRegisterInfo *TRI) {
+ while (I != E) {
+ UpdatePredRedefs(I, Redefs, TRI);
+ ++I;
+ }
+}
+
/// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
///
bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
@@ -873,13 +1000,19 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
 if (TII->ReverseBranchCondition(Cond))
 assert(false && "Unable to reverse branch condition!");

+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ SmallSet<unsigned, 4> Redefs;
+ InitPredRedefs(CvtBBI->BB, Redefs, TRI);
+ InitPredRedefs(NextBBI->BB, Redefs, TRI);
+
 if (CvtBBI->BB->pred_size() > 1) {
 BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
 // Copy instructions in the true block, predicate them, and add them to
 // the entry block.
- CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs);
 } else {
- PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);

 // Merge converted block into entry block.
 BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -922,6 +1055,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
 BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
 BBInfo *CvtBBI = &TrueBBI;
 BBInfo *NextBBI = &FalseBBI;
+ DebugLoc dl; // FIXME: this is nowhere

 SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
 if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
@@ -957,21 +1091,26 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
 }
 }

+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ SmallSet<unsigned, 4> Redefs;
+ InitPredRedefs(CvtBBI->BB, Redefs, TRI);
+ InitPredRedefs(NextBBI->BB, Redefs, TRI);
+
 bool HasEarlyExit = CvtBBI->FalseBB != NULL;
- bool DupBB = CvtBBI->BB->pred_size() > 1;
- if (DupBB) {
+ if (CvtBBI->BB->pred_size() > 1) {
 BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
 // Copy instructions in the true block, predicate them, and add them to
 // the entry block.
- CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true);
 } else {
 // Predicate the 'true' block after removing its branch.
 CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
- PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);

 // Now merge the entry of the triangle with the true block.
 BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
- MergeBlocks(BBI, *CvtBBI);
+ MergeBlocks(BBI, *CvtBBI, false);
 }

 // If 'true' block has a 'false' successor, add an exit branch to it.
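// The Redefs set threaded through IfConvertSimple and IfConvertTriangle
// enforces a single rule: once a block is predicated, a def of an
// already-live register may or may not execute, so the prior value has to
// stay live. A condensed sketch of the per-def decision, reusing the calls
// from UpdatePredRedefs above (the helper name is illustrative only):
static void notePredicatedDef(SmallSet<unsigned, 4> &Redefs, MachineInstr *MI,
                              unsigned Reg, const TargetRegisterInfo *TRI) {
  if (Redefs.count(Reg)) {
    // Conditional redefinition: model it as read + write by adding an
    // implicit use, so the old value remains live across this def.
    MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
                                             true/*IsImp*/));
  } else {
    // First def on this path: the register and its subregs become live here.
    Redefs.insert(Reg);
    for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
      Redefs.insert(*SR);
  }
}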
@@ -980,7 +1119,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) assert(false && "Unable to reverse branch condition!"); - TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond); + TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond, dl); BBI.BB->addSuccessor(CvtBBI->FalseBB); } @@ -1009,7 +1148,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { RemoveExtraEdges(BBI); // Update block info. BB can be iteratively if-converted. - if (!IterIfcvt) + if (!IterIfcvt) BBI.IsDone = true; InvalidatePreds(BBI.BB); CvtBBI->IsDone = true; @@ -1044,9 +1183,9 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, return false; } - // Merge the 'true' and 'false' blocks by copying the instructions - // from the 'false' block to the 'true' block. That is, unless the true - // block would clobber the predicate, in that case, do the opposite. + // Put the predicated instructions from the 'true' block before the + // instructions from the 'false' block, unless the true block would clobber + // the predicate, in which case, do the opposite. BBInfo *BBI1 = &TrueBBI; BBInfo *BBI2 = &FalseBBI; SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end()); @@ -1071,39 +1210,72 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // Remove the conditional branch from entry to the blocks. BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); + // Initialize liveins to the first BB. These are potentially redefined by + // predicated instructions. + SmallSet<unsigned, 4> Redefs; + InitPredRedefs(BBI1->BB, Redefs, TRI); + // Remove the duplicated instructions at the beginnings of both paths. MachineBasicBlock::iterator DI1 = BBI1->BB->begin(); MachineBasicBlock::iterator DI2 = BBI2->BB->begin(); + MachineBasicBlock::iterator DIE1 = BBI1->BB->end(); + MachineBasicBlock::iterator DIE2 = BBI2->BB->end(); + // Skip dbg_value instructions + while (DI1 != DIE1 && DI1->isDebugValue()) + ++DI1; + while (DI2 != DIE2 && DI2->isDebugValue()) + ++DI2; BBI1->NonPredSize -= NumDups1; BBI2->NonPredSize -= NumDups1; + + // Skip past the dups on each side separately since there may be + // differing dbg_value entries. + for (unsigned i = 0; i < NumDups1; ++DI1) { + if (!DI1->isDebugValue()) + ++i; + } while (NumDups1 != 0) { - ++DI1; ++DI2; - --NumDups1; + if (!DI2->isDebugValue()) + --NumDups1; } + + UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI); BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1); BBI2->BB->erase(BBI2->BB->begin(), DI2); // Predicate the 'true' block after removing its branch. BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB); DI1 = BBI1->BB->end(); - for (unsigned i = 0; i != NumDups2; ++i) + for (unsigned i = 0; i != NumDups2; ) { + // NumDups2 only counted non-dbg_value instructions, so this won't + // run off the head of the list. + assert (DI1 != BBI1->BB->begin()); --DI1; + // skip dbg_value instructions + if (!DI1->isDebugValue()) + ++i; + } BBI1->BB->erase(DI1, BBI1->BB->end()); - PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1); + PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs); // Predicate the 'false' block. BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB); DI2 = BBI2->BB->end(); while (NumDups2 != 0) { + // NumDups2 only counted non-dbg_value instructions, so this won't + // run off the head of the list. 
+ assert (DI2 != BBI2->BB->begin()); --DI2; - --NumDups2; + // skip dbg_value instructions + if (!DI2->isDebugValue()) + --NumDups2; } - PredicateBlock(*BBI2, DI2, *Cond2); + PredicateBlock(*BBI2, DI2, *Cond2, Redefs); // Merge the true block into the entry of the diamond. - MergeBlocks(BBI, *BBI1); - MergeBlocks(BBI, *BBI2); + MergeBlocks(BBI, *BBI1, TailBB == 0); + MergeBlocks(BBI, *BBI2, TailBB == 0); // If the if-converted block falls through or unconditionally branches into // the tail block, and the tail block does not have other predecessors, then @@ -1111,16 +1283,32 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // tail, add a unconditional branch to it. if (TailBB) { BBInfo TailBBI = BBAnalysis[TailBB->getNumber()]; - if (TailBB->pred_size() == 1 && !TailBBI.HasFallThrough) { - BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); + bool CanMergeTail = !TailBBI.HasFallThrough; + // There may still be a fall-through edge from BBI1 or BBI2 to TailBB; + // check if there are any other predecessors besides those. + unsigned NumPreds = TailBB->pred_size(); + if (NumPreds > 1) + CanMergeTail = false; + else if (NumPreds == 1 && CanMergeTail) { + MachineBasicBlock::pred_iterator PI = TailBB->pred_begin(); + if (*PI != BBI1->BB && *PI != BBI2->BB) + CanMergeTail = false; + } + if (CanMergeTail) { MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { + BBI.BB->addSuccessor(TailBB); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } } + // RemoveExtraEdges won't work if the block has an unanalyzable branch, + // which can happen here if TailBB is unanalyzable and is merged, so + // explicitly remove BBI1 and BBI2 as successors. + BBI.BB->removeSuccessor(BBI1->BB); + BBI.BB->removeSuccessor(BBI2->BB); RemoveExtraEdges(BBI); // Update block info. @@ -1135,9 +1323,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, /// specified end with the specified condition. void IfConverter::PredicateBlock(BBInfo &BBI, MachineBasicBlock::iterator E, - SmallVectorImpl<MachineOperand> &Cond) { + SmallVectorImpl<MachineOperand> &Cond, + SmallSet<unsigned, 4> &Redefs) { for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) { - if (TII->isPredicated(I)) + if (I->isDebugValue() || TII->isPredicated(I)) continue; if (!TII->PredicateInstruction(I, Cond)) { #ifndef NDEBUG @@ -1145,6 +1334,10 @@ void IfConverter::PredicateBlock(BBInfo &BBI, #endif llvm_unreachable(0); } + + // If the predicated instruction now redefines a register as the result of + // if-conversion, add an implicit kill. + UpdatePredRedefs(I, Redefs, TRI, true); } std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate)); @@ -1152,48 +1345,55 @@ void IfConverter::PredicateBlock(BBInfo &BBI, BBI.IsAnalyzed = false; BBI.NonPredSize = 0; - NumIfConvBBs++; + ++NumIfConvBBs; } /// CopyAndPredicateBlock - Copy and predicate instructions from source BB to /// the destination block. Skip end of block branches if IgnoreBr is true. void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, SmallVectorImpl<MachineOperand> &Cond, + SmallSet<unsigned, 4> &Redefs, bool IgnoreBr) { MachineFunction &MF = *ToBBI.BB->getParent(); for (MachineBasicBlock::iterator I = FromBBI.BB->begin(), E = FromBBI.BB->end(); I != E; ++I) { const TargetInstrDesc &TID = I->getDesc(); - bool isPredicated = TII->isPredicated(I); // Do not copy the end of the block branches. 
- if (IgnoreBr && !isPredicated && TID.isBranch()) + if (IgnoreBr && TID.isBranch()) break; MachineInstr *MI = MF.CloneMachineInstr(I); ToBBI.BB->insert(ToBBI.BB->end(), MI); ToBBI.NonPredSize++; - if (!isPredicated) + if (!TII->isPredicated(I) && !MI->isDebugValue()) { if (!TII->PredicateInstruction(MI, Cond)) { #ifndef NDEBUG dbgs() << "Unable to predicate " << *I << "!\n"; #endif llvm_unreachable(0); } + } + + // If the predicated instruction now redefines a register as the result of + // if-conversion, add an implicit kill. + UpdatePredRedefs(MI, Redefs, TRI, true); } - std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(), - FromBBI.BB->succ_end()); - MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); - MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; + if (!IgnoreBr) { + std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(), + FromBBI.BB->succ_end()); + MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); + MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; - for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - MachineBasicBlock *Succ = Succs[i]; - // Fallthrough edge can't be transferred. - if (Succ == FallThrough) - continue; - ToBBI.BB->addSuccessor(Succ); + for (unsigned i = 0, e = Succs.size(); i != e; ++i) { + MachineBasicBlock *Succ = Succs[i]; + // Fallthrough edge can't be transferred. + if (Succ == FallThrough) + continue; + ToBBI.BB->addSuccessor(Succ); + } } std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(), @@ -1203,25 +1403,18 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, ToBBI.ClobbersPred |= FromBBI.ClobbersPred; ToBBI.IsAnalyzed = false; - NumDupBBs++; + ++NumDupBBs; } /// MergeBlocks - Move all instructions from FromBB to the end of ToBB. -/// -void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI) { +/// This will leave FromBB as an empty block, so remove all of its +/// successor edges except for the fall-through edge. If AddEdges is true, +/// i.e., when FromBBI's branch is being moved, add those successor edges to +/// ToBBI. +void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { ToBBI.BB->splice(ToBBI.BB->end(), FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end()); - // Redirect all branches to FromBB to ToBB. - std::vector<MachineBasicBlock *> Preds(FromBBI.BB->pred_begin(), - FromBBI.BB->pred_end()); - for (unsigned i = 0, e = Preds.size(); i != e; ++i) { - MachineBasicBlock *Pred = Preds[i]; - if (Pred == ToBBI.BB) - continue; - Pred->ReplaceUsesOfBlockWith(FromBBI.BB, ToBBI.BB); - } - std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(), FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); @@ -1233,7 +1426,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI) { if (Succ == FallThrough) continue; FromBBI.BB->removeSuccessor(Succ); - ToBBI.BB->addSuccessor(Succ); + if (AddEdges) + ToBBI.BB->addSuccessor(Succ); } // Now FromBBI always falls through to the next block! diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp new file mode 100644 index 0000000..12adcaa --- /dev/null +++ b/lib/CodeGen/InlineSpiller.cpp @@ -0,0 +1,408 @@ +//===-------- InlineSpiller.cpp - Insert spills and restores inline -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// The inline spiller modifies the machine function directly instead of +// inserting spills and restores in VirtRegMap. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "spiller" +#include "Spiller.h" +#include "VirtRegMap.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class InlineSpiller : public Spiller { + MachineFunction &mf_; + LiveIntervals &lis_; + VirtRegMap &vrm_; + MachineFrameInfo &mfi_; + MachineRegisterInfo &mri_; + const TargetInstrInfo &tii_; + const TargetRegisterInfo &tri_; + const BitVector reserved_; + + // Variables that are valid during spill(), but used by multiple methods. + LiveInterval *li_; + std::vector<LiveInterval*> *newIntervals_; + const TargetRegisterClass *rc_; + int stackSlot_; + const SmallVectorImpl<LiveInterval*> *spillIs_; + + // Values of the current interval that can potentially remat. + SmallPtrSet<VNInfo*, 8> reMattable_; + + // Values in reMattable_ that failed to remat at some point. + SmallPtrSet<VNInfo*, 8> usedValues_; + + ~InlineSpiller() {} + +public: + InlineSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) + : mf_(*mf), lis_(*lis), vrm_(*vrm), + mfi_(*mf->getFrameInfo()), + mri_(mf->getRegInfo()), + tii_(*mf->getTarget().getInstrInfo()), + tri_(*mf->getTarget().getRegisterInfo()), + reserved_(tri_.getReservedRegs(mf_)) {} + + void spill(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs, + SlotIndex *earliestIndex); + +private: + bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, + SlotIndex UseIdx); + bool reMaterializeFor(MachineBasicBlock::iterator MI); + void reMaterializeAll(); + + bool foldMemoryOperand(MachineBasicBlock::iterator MI, + const SmallVectorImpl<unsigned> &Ops); + void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI); + void insertSpill(LiveInterval &NewLI, MachineBasicBlock::iterator MI); +}; +} + +namespace llvm { +Spiller *createInlineSpiller(MachineFunction *mf, + LiveIntervals *lis, + const MachineLoopInfo *mli, + VirtRegMap *vrm) { + return new InlineSpiller(mf, lis, vrm); +} +} + +/// allUsesAvailableAt - Return true if all registers used by OrigMI at +/// OrigIdx are also available with the same value at UseIdx. +bool InlineSpiller::allUsesAvailableAt(const MachineInstr *OrigMI, + SlotIndex OrigIdx, + SlotIndex UseIdx) { + OrigIdx = OrigIdx.getUseIndex(); + UseIdx = UseIdx.getUseIndex(); + for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = OrigMI->getOperand(i); + if (!MO.isReg() || !MO.getReg() || MO.getReg() == li_->reg) + continue; + // Reserved registers are OK. + if (MO.isUndef() || !lis_.hasInterval(MO.getReg())) + continue; + // We don't want to move any defs. + if (MO.isDef()) + return false; + // We cannot depend on virtual registers in spillIs_. They will be spilled. 
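// (If a rematerialized def read one of those registers, the remat would
// recreate a use of an interval the allocator has already decided to spill,
// reintroducing the very interference the spill is meant to remove.)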
+ for (unsigned si = 0, se = spillIs_->size(); si != se; ++si)
+ if ((*spillIs_)[si]->reg == MO.getReg())
+ return false;
+
+ LiveInterval &LI = lis_.getInterval(MO.getReg());
+ const VNInfo *OVNI = LI.getVNInfoAt(OrigIdx);
+ if (!OVNI)
+ continue;
+ if (OVNI != LI.getVNInfoAt(UseIdx))
+ return false;
+ }
+ return true;
+}
+
+/// reMaterializeFor - Attempt to rematerialize li_->reg before MI instead of
+/// reloading it.
+bool InlineSpiller::reMaterializeFor(MachineBasicBlock::iterator MI) {
+ SlotIndex UseIdx = lis_.getInstructionIndex(MI).getUseIndex();
+ VNInfo *OrigVNI = li_->getVNInfoAt(UseIdx);
+ if (!OrigVNI) {
+ DEBUG(dbgs() << "\tadding <undef> flags: ");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg)
+ MO.setIsUndef();
+ }
+ DEBUG(dbgs() << UseIdx << '\t' << *MI);
+ return true;
+ }
+ if (!reMattable_.count(OrigVNI)) {
+ DEBUG(dbgs() << "\tusing non-remat valno " << OrigVNI->id << ": "
+ << UseIdx << '\t' << *MI);
+ return false;
+ }
+ MachineInstr *OrigMI = lis_.getInstructionFromIndex(OrigVNI->def);
+ if (!allUsesAvailableAt(OrigMI, OrigVNI->def, UseIdx)) {
+ usedValues_.insert(OrigVNI);
+ DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI);
+ return false;
+ }
+
+ // If the instruction also writes li_->reg, it had better not require the same
+ // register for uses and defs.
+ bool Reads, Writes;
+ SmallVector<unsigned, 8> Ops;
+ tie(Reads, Writes) = MI->readsWritesVirtualRegister(li_->reg, &Ops);
+ if (Writes) {
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(Ops[i]);
+ if (MO.isUse() ? MI->isRegTiedToDefOperand(Ops[i]) : MO.getSubReg()) {
+ usedValues_.insert(OrigVNI);
+ DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI);
+ return false;
+ }
+ }
+ }
+
+ // Allocate a new register for the remat.
+ unsigned NewVReg = mri_.createVirtualRegister(rc_);
+ vrm_.grow();
+ LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg);
+ NewLI.markNotSpillable();
+ newIntervals_->push_back(&NewLI);
+
+ // Finally we can rematerialize OrigMI before MI.
+ MachineBasicBlock &MBB = *MI->getParent();
+ tii_.reMaterialize(MBB, MI, NewLI.reg, 0, OrigMI, tri_);
+ MachineBasicBlock::iterator RematMI = MI;
+ SlotIndex DefIdx = lis_.InsertMachineInstrInMaps(--RematMI).getDefIndex();
+ DEBUG(dbgs() << "\tremat: " << DefIdx << '\t' << *RematMI);
+
+ // Replace operands
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(Ops[i]);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg) {
+ MO.setReg(NewVReg);
+ MO.setIsKill();
+ }
+ }
+ DEBUG(dbgs() << "\t " << UseIdx << '\t' << *MI);
+
+ VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, true,
+ lis_.getVNInfoAllocator());
+ NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI));
+ DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+ return true;
+}
+
+/// reMaterializeAll - Try to rematerialize as many uses of li_ as possible,
+/// and trim the live ranges after.
+void InlineSpiller::reMaterializeAll() {
+ // Do a quick scan of the interval values to find if any are remattable.
+ reMattable_.clear(); + usedValues_.clear(); + for (LiveInterval::const_vni_iterator I = li_->vni_begin(), + E = li_->vni_end(); I != E; ++I) { + VNInfo *VNI = *I; + if (VNI->isUnused() || !VNI->isDefAccurate()) + continue; + MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def); + if (!DefMI || !tii_.isTriviallyReMaterializable(DefMI)) + continue; + reMattable_.insert(VNI); + } + + // Often, no defs are remattable. + if (reMattable_.empty()) + return; + + // Try to remat before all uses of li_->reg. + bool anyRemat = false; + for (MachineRegisterInfo::use_nodbg_iterator + RI = mri_.use_nodbg_begin(li_->reg); + MachineInstr *MI = RI.skipInstruction();) + anyRemat |= reMaterializeFor(MI); + + if (!anyRemat) + return; + + // Remove any values that were completely rematted. + bool anyRemoved = false; + for (SmallPtrSet<VNInfo*, 8>::iterator I = reMattable_.begin(), + E = reMattable_.end(); I != E; ++I) { + VNInfo *VNI = *I; + if (VNI->hasPHIKill() || usedValues_.count(VNI)) + continue; + MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def); + DEBUG(dbgs() << "\tremoving dead def: " << VNI->def << '\t' << *DefMI); + lis_.RemoveMachineInstrFromMaps(DefMI); + vrm_.RemoveMachineInstrFromMaps(DefMI); + DefMI->eraseFromParent(); + li_->removeValNo(VNI); + anyRemoved = true; + } + + if (!anyRemoved) + return; + + // Removing values may cause debug uses where li_ is not live. + for (MachineRegisterInfo::use_iterator RI = mri_.use_begin(li_->reg); + MachineInstr *MI = RI.skipInstruction();) { + if (!MI->isDebugValue()) + continue; + // Try to preserve the debug value if li_ is live immediately after it. + MachineBasicBlock::iterator NextMI = MI; + ++NextMI; + if (NextMI != MI->getParent()->end() && !lis_.isNotInMIMap(NextMI)) { + SlotIndex NearIdx = lis_.getInstructionIndex(NextMI); + if (li_->liveAt(NearIdx)) + continue; + } + DEBUG(dbgs() << "Removing debug info due to remat:" << "\t" << *MI); + MI->eraseFromParent(); + } +} + +/// foldMemoryOperand - Try folding stack slot references in Ops into MI. +/// Return true on success, and MI will be erased. +bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI, + const SmallVectorImpl<unsigned> &Ops) { + // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied + // operands. + SmallVector<unsigned, 8> FoldOps; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + unsigned Idx = Ops[i]; + MachineOperand &MO = MI->getOperand(Idx); + if (MO.isImplicit()) + continue; + // FIXME: Teach targets to deal with subregs. + if (MO.getSubReg()) + return false; + // Tied use operands should not be passed to foldMemoryOperand. + if (!MI->isRegTiedToDefOperand(Idx)) + FoldOps.push_back(Idx); + } + + MachineInstr *FoldMI = tii_.foldMemoryOperand(MI, FoldOps, stackSlot_); + if (!FoldMI) + return false; + lis_.ReplaceMachineInstrInMaps(MI, FoldMI); + vrm_.addSpillSlotUse(stackSlot_, FoldMI); + MI->eraseFromParent(); + DEBUG(dbgs() << "\tfolded: " << *FoldMI); + return true; +} + +/// insertReload - Insert a reload of NewLI.reg before MI. +void InlineSpiller::insertReload(LiveInterval &NewLI, + MachineBasicBlock::iterator MI) { + MachineBasicBlock &MBB = *MI->getParent(); + SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex(); + tii_.loadRegFromStackSlot(MBB, MI, NewLI.reg, stackSlot_, rc_, &tri_); + --MI; // Point to load instruction. 
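// (loadRegFromStackSlot inserts the reload immediately before MI, so
// stepping the iterator back one instruction lands on the new load; that is
// the instruction that must be indexed in the slot maps and recorded as a
// use of the spill slot below.)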
+ SlotIndex LoadIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex(); + vrm_.addSpillSlotUse(stackSlot_, MI); + DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI); + VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0, true, + lis_.getVNInfoAllocator()); + NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); +} + +/// insertSpill - Insert a spill of NewLI.reg after MI. +void InlineSpiller::insertSpill(LiveInterval &NewLI, + MachineBasicBlock::iterator MI) { + MachineBasicBlock &MBB = *MI->getParent(); + SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex(); + tii_.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, stackSlot_, rc_, &tri_); + --MI; // Point to store instruction. + SlotIndex StoreIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex(); + vrm_.addSpillSlotUse(stackSlot_, MI); + DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI); + VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, true, + lis_.getVNInfoAllocator()); + NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI)); +} + +void InlineSpiller::spill(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs, + SlotIndex *earliestIndex) { + DEBUG(dbgs() << "Inline spilling " << *li << "\n"); + assert(li->isSpillable() && "Attempting to spill already spilled value."); + assert(!li->isStackSlot() && "Trying to spill a stack slot."); + + li_ = li; + newIntervals_ = &newIntervals; + rc_ = mri_.getRegClass(li->reg); + spillIs_ = &spillIs; + + reMaterializeAll(); + + // Remat may handle everything. + if (li_->empty()) + return; + + stackSlot_ = vrm_.assignVirt2StackSlot(li->reg); + + // Iterate over instructions using register. + for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(li->reg); + MachineInstr *MI = RI.skipInstruction();) { + + // Debug values are not allowed to affect codegen. + if (MI->isDebugValue()) { + // Modify DBG_VALUE now that the value is in a spill slot. + uint64_t Offset = MI->getOperand(1).getImm(); + const MDNode *MDPtr = MI->getOperand(2).getMetadata(); + DebugLoc DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = tii_.emitFrameIndexDebugValue(mf_, stackSlot_, + Offset, MDPtr, DL)) { + DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insert(MBB->erase(MI), NewDV); + } else { + DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI); + MI->eraseFromParent(); + } + continue; + } + + // Analyze instruction. + bool Reads, Writes; + SmallVector<unsigned, 8> Ops; + tie(Reads, Writes) = MI->readsWritesVirtualRegister(li->reg, &Ops); + + // Attempt to fold memory ops. + if (foldMemoryOperand(MI, Ops)) + continue; + + // Allocate interval around instruction. + // FIXME: Infer regclass from instruction alone. + unsigned NewVReg = mri_.createVirtualRegister(rc_); + vrm_.grow(); + LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg); + NewLI.markNotSpillable(); + + if (Reads) + insertReload(NewLI, MI); + + // Rewrite instruction operands. + bool hasLiveDef = false; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(Ops[i]); + MO.setReg(NewVReg); + if (MO.isUse()) { + if (!MI->isRegTiedToDefOperand(Ops[i])) + MO.setIsKill(); + } else { + if (!MO.isDead()) + hasLiveDef = true; + } + } + + // FIXME: Use a second vreg if instruction has no tied ops. 
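+ // Only store the def back to the stack slot if it is still live after MI;
+ // dead defs need no spill.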
+ if (Writes && hasLiveDef) + insertSpill(NewLI, MI); + + DEBUG(dbgs() << "\tinterval: " << NewLI << '\n'); + newIntervals.push_back(&NewLI); + } +} diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 63bb5f2..03ae214 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -16,6 +16,7 @@ #include "llvm/Module.h" #include "llvm/Type.h" #include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" @@ -314,21 +315,22 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) { static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname, const char *Dname, const char *LDname) { - switch (CI->getOperand(1)->getType()->getTypeID()) { + CallSite CS(CI); + switch (CI->getArgOperand(0)->getType()->getTypeID()) { default: llvm_unreachable("Invalid type in intrinsic"); case Type::FloatTyID: - ReplaceCallWith(Fname, CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith(Fname, CI, CS.arg_begin(), CS.arg_end(), Type::getFloatTy(CI->getContext())); break; case Type::DoubleTyID: - ReplaceCallWith(Dname, CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith(Dname, CI, CS.arg_begin(), CS.arg_end(), Type::getDoubleTy(CI->getContext())); break; case Type::X86_FP80TyID: case Type::FP128TyID: case Type::PPC_FP128TyID: - ReplaceCallWith(LDname, CI, CI->op_begin() + 1, CI->op_end(), - CI->getOperand(1)->getType()); + ReplaceCallWith(LDname, CI, CS.arg_begin(), CS.arg_end(), + CI->getArgOperand(0)->getType()); break; } } @@ -340,6 +342,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { const Function *Callee = CI->getCalledFunction(); assert(Callee && "Cannot lower an indirect call!"); + CallSite CS(CI); switch (Callee->getIntrinsicID()) { case Intrinsic::not_intrinsic: report_fatal_error("Cannot lower a call to a non-intrinsic function '"+ @@ -353,7 +356,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { // by the lowerinvoke pass. In both cases, the right thing to do is to // convert the call to an explicit setjmp or longjmp call. 
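+ // Note: CS.arg_begin()/CS.arg_end() visit only the actual call arguments;
+ // the callee operand is excluded, which is what the old CI->op_begin() + 1
+ // idiom was emulating.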
case Intrinsic::setjmp: { - Value *V = ReplaceCallWith("setjmp", CI, CI->op_begin() + 1, CI->op_end(), + Value *V = ReplaceCallWith("setjmp", CI, CS.arg_begin(), CS.arg_end(), Type::getInt32Ty(Context)); if (!CI->getType()->isVoidTy()) CI->replaceAllUsesWith(V); @@ -365,32 +368,32 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; case Intrinsic::longjmp: { - ReplaceCallWith("longjmp", CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith("longjmp", CI, CS.arg_begin(), CS.arg_end(), Type::getVoidTy(Context)); break; } case Intrinsic::siglongjmp: { // Insert the call to abort - ReplaceCallWith("abort", CI, CI->op_end(), CI->op_end(), + ReplaceCallWith("abort", CI, CS.arg_end(), CS.arg_end(), Type::getVoidTy(Context)); break; } case Intrinsic::ctpop: - CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::bswap: - CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::ctlz: - CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::cttz: { // cttz(x) -> ctpop(~X & (X-1)) - Value *Src = CI->getOperand(1); + Value *Src = CI->getArgOperand(0); Value *NotSrc = Builder.CreateNot(Src); NotSrc->setName(Src->getName() + ".not"); Value *SrcM1 = ConstantInt::get(Src->getType(), 1); @@ -451,37 +454,37 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { case Intrinsic::memcpy: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); - Ops[1] = CI->getOperand(2); + Ops[0] = CI->getArgOperand(0); + Ops[1] = CI->getArgOperand(1); Ops[2] = Size; - ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::memmove: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); - Ops[1] = CI->getOperand(2); + Ops[0] = CI->getArgOperand(0); + Ops[1] = CI->getArgOperand(1); Ops[2] = Size; - ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::memset: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); + Ops[0] = CI->getArgOperand(0); // Extend the amount to i32. 
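+ // The intrinsic's fill value is an i8, but the libc memset prototype takes
+ // an int, hence the widening cast.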
- Ops[1] = Builder.CreateIntCast(CI->getOperand(2), Type::getInt32Ty(Context), + Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1), Type::getInt32Ty(Context), /* isSigned */ false); Ops[2] = Size; - ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::sqrt: { diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index b584704..bf3137e 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -329,12 +329,15 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, if (OptLevel != CodeGenOpt::None) PM.add(createOptimizePHIsPass()); - // Delete dead machine instructions regardless of optimization level. - PM.add(createDeadMachineInstructionElimPass()); - printAndVerify(PM, "After codegen DCE pass", - /* allowDoubleDefs= */ true); - if (OptLevel != CodeGenOpt::None) { + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + PM.add(createDeadMachineInstructionElimPass()); + printAndVerify(PM, "After codegen DCE pass", + /* allowDoubleDefs= */ true); + PM.add(createOptimizeExtsPass()); if (!DisableMachineLICM) PM.add(createMachineLICMPass()); @@ -358,7 +361,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, /* allowDoubleDefs= */ true); // Perform register allocation. - PM.add(createRegisterAllocator()); + PM.add(createRegisterAllocator(OptLevel)); printAndVerify(PM, "After Register Allocation"); // Perform stack slot coloring and post-ra machine LICM. diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp index 03b4eab..b9527fa 100644 --- a/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/lib/CodeGen/LatencyPriorityQueue.cpp @@ -118,7 +118,7 @@ void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) { SUnit *LatencyPriorityQueue::pop() { if (empty()) return NULL; std::vector<SUnit *>::iterator Best = Queue.begin(); - for (std::vector<SUnit *>::iterator I = next(Queue.begin()), + for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 025ad05..21a9b7d 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -68,6 +68,37 @@ bool LiveInterval::liveBeforeAndAt(SlotIndex I) const { return r->end == I; } +/// killedAt - Return true if a live range ends at index. Note that the kill +/// point is not contained in the half-open live range. It is usually the +/// getDefIndex() slot following its last use. +bool LiveInterval::killedAt(SlotIndex I) const { + Ranges::const_iterator r = std::lower_bound(ranges.begin(), ranges.end(), I); + + // Now r points to the first interval with start >= I, or ranges.end(). + if (r == ranges.begin()) + return false; + + --r; + // Now r points to the last interval with end <= I. + // r->end is the kill point. + return r->end == I; +} + +/// killedInRange - Return true if the interval has kills in [Start,End). 
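+/// As with killedAt, a kill point is the end of a live range, so this looks
+/// for range ends falling inside the half-open window.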
+bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const { + Ranges::const_iterator r = + std::lower_bound(ranges.begin(), ranges.end(), End); + + // Now r points to the first interval with start >= End, or ranges.end(). + if (r == ranges.begin()) + return false; + + --r; + // Now r points to the last interval with end <= End. + // r->end is the kill point. + return r->end >= Start && r->end < End; +} + // overlaps - Return true if the intersection of the two live intervals is // not empty. // @@ -149,7 +180,6 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const { void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { assert(I != ranges.end() && "Not a valid interval!"); VNInfo *ValNo = I->valno; - SlotIndex OldEnd = I->end; // Search for the first interval that we can't merge with. Ranges::iterator MergeTo = next(I); @@ -163,9 +193,6 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { // Erase any dead ranges. ranges.erase(next(I), MergeTo); - // Update kill info. - ValNo->removeKills(OldEnd, I->end.getPrevSlot()); - // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. Ranges::iterator Next = next(I); @@ -245,9 +272,6 @@ LiveInterval::addRangeFrom(LiveRange LR, iterator From) { // endpoint as well. if (End > it->end) extendIntervalEndTo(it, End); - else if (End < it->end) - // Overlapping intervals, there might have been a kill here. - it->valno->removeKill(End); return it; } } else { @@ -288,7 +312,6 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, VNInfo *ValNo = I->valno; if (I->start == Start) { if (I->end == End) { - ValNo->removeKills(Start, End); if (RemoveDeadValNo) { // Check if val# is dead. bool isDead = true; @@ -296,7 +319,7 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, if (II != I && II->valno == ValNo) { isDead = false; break; - } + } if (isDead) { // Now that ValNo is dead, remove it. If it is the largest value // number, just nuke it (and any other deleted values neighboring it), @@ -320,7 +343,6 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, // Otherwise if the span we are removing is at the end of the LiveRange, // adjust the other way. if (I->end == End) { - ValNo->removeKills(Start, End); I->end = Start; return; } @@ -529,6 +551,7 @@ void LiveInterval::MergeValueInAsValue( SmallVector<VNInfo*, 4> ReplacedValNos; iterator IP = begin(); for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { + assert(I->valno == RHS.getValNumInfo(I->valno->id) && "Bad VNInfo"); if (I->valno != RHSValNo) continue; SlotIndex Start = I->start, End = I->end; @@ -823,10 +846,12 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { else { OS << " = "; for (LiveInterval::Ranges::const_iterator I = ranges.begin(), - E = ranges.end(); I != E; ++I) - OS << *I; + E = ranges.end(); I != E; ++I) { + OS << *I; + assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo"); + } } - + // Print value number info. 
if (getNumValNums()) { OS << " "; @@ -843,21 +868,6 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { OS << "?"; else OS << vni->def; - unsigned ee = vni->kills.size(); - if (ee || vni->hasPHIKill()) { - OS << "-("; - for (unsigned j = 0; j != ee; ++j) { - OS << vni->kills[j]; - if (j != ee-1) - OS << " "; - } - if (vni->hasPHIKill()) { - if (ee) - OS << " "; - OS << "phi"; - } - OS << ")"; - } } } } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index a6d38ad..194d03d 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -50,9 +50,6 @@ using namespace llvm; static cl::opt<bool> DisableReMat("disable-rematerialization", cl::init(false), cl::Hidden); -static cl::opt<bool> EnableFastSpilling("fast-spill", - cl::init(false), cl::Hidden); - STATISTIC(numIntervals , "Number of original intervals"); STATISTIC(numFolds , "Number of loads/stores folded into instructions"); STATISTIC(numSplits , "Number of intervals split"); @@ -90,8 +87,8 @@ void LiveIntervals::releaseMemory() { r2iMap_.clear(); - // Release VNInfo memroy regions after all VNInfo objects are dtor'd. - VNInfoAllocator.DestroyAll(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. + VNInfoAllocator.Reset(); while (!CloneMIs.empty()) { MachineInstr *MI = CloneMIs.back(); CloneMIs.pop_back(); @@ -195,6 +192,10 @@ bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li, if (tii_->isMoveInstr(MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) if (SrcReg == li.reg || DstReg == li.reg) continue; + if (MI.isCopy()) + if (MI.getOperand(0).getReg() == li.reg || + MI.getOperand(1).getReg() == li.reg) + continue; // Check for operands using reg for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { @@ -218,10 +219,7 @@ bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li, return false; } -/// conflictsWithSubPhysRegRef - Similar to conflictsWithPhysRegRef except -/// it checks for sub-register reference and it can check use as well. 
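+/// conflictsWithAliasRef - Similar to conflictsWithPhysReg, but checks for
+/// references to any physical register overlapping Reg, uses included.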
-bool LiveIntervals::conflictsWithSubPhysRegRef(LiveInterval &li, - unsigned Reg, bool CheckUse, +bool LiveIntervals::conflictsWithAliasRef(LiveInterval &li, unsigned Reg, SmallPtrSet<MachineInstr*,32> &JoinedCopies) { for (LiveInterval::Ranges::const_iterator I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) { @@ -239,12 +237,11 @@ bool LiveIntervals::conflictsWithSubPhysRegRef(LiveInterval &li, MachineOperand& MO = MI->getOperand(i); if (!MO.isReg()) continue; - if (MO.isUse() && !CheckUse) - continue; unsigned PhysReg = MO.getReg(); - if (PhysReg == 0 || TargetRegisterInfo::isVirtualRegister(PhysReg)) + if (PhysReg == 0 || PhysReg == Reg || + TargetRegisterInfo::isVirtualRegister(PhysReg)) continue; - if (tri_->isSubRegister(Reg, PhysReg)) + if (tri_->regsOverlap(Reg, PhysReg)) return true; } } @@ -272,7 +269,7 @@ bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) { if (MO.getReg() == Reg && MO.isDef()) { assert(MI.getOperand(MOIdx).getSubReg() != MO.getSubReg() && MI.getOperand(MOIdx).getSubReg() && - MO.getSubReg()); + (MO.getSubReg() || MO.isImplicit())); return true; } } @@ -328,9 +325,10 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isExtractSubreg() || mi->isInsertSubreg() || mi->isSubregToReg() || - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (mi->isCopyLike() || + tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) { CopyMI = mi; + } VNInfo *ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator); @@ -356,7 +354,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, LiveRange LR(defIndex, killIdx, ValNo); interval.addRange(LR); DEBUG(dbgs() << " +" << LR << "\n"); - ValNo->addKill(killIdx); return; } } @@ -376,7 +373,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // valno in the killing blocks. assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); DEBUG(dbgs() << " phi-join"); - ValNo->addKill(indexes_->getTerminatorGap(mbb)); ValNo->setHasPHIKill(true); } else { // Iterate over all of the blocks that the variable is completely @@ -407,7 +403,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, } LiveRange LR(Start, killIdx, ValNo); interval.addRange(LR); - ValNo->addKill(killIdx); DEBUG(dbgs() << " +" << LR); } @@ -434,11 +429,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // are actually two values in the live interval. Because of this we // need to take the LiveRegion that defines this register and split it // into two values. - // Two-address vregs should always only be redefined once. This means - // that at this point, there should be exactly one value number in it. - assert((PartReDef || interval.containsOneValue()) && - "Unexpected 2-addr liveint!"); - SlotIndex DefIndex = interval.getValNumInfo(0)->def.getDefIndex(); SlotIndex RedefIndex = MIIdx.getDefIndex(); if (MO.isEarlyClobber()) RedefIndex = MIIdx.getUseIndex(); @@ -446,8 +436,9 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, const LiveRange *OldLR = interval.getLiveRangeContaining(RedefIndex.getUseIndex()); VNInfo *OldValNo = OldLR->valno; + SlotIndex DefIndex = OldValNo->def.getDefIndex(); - // Delete the initial value, which should be short and continuous, + // Delete the previous value, which should be short and continuous, // because the 2-addr copy must be in the same MBB as the redef. 
interval.removeRange(DefIndex, RedefIndex); @@ -464,15 +455,14 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // A re-def may be a copy. e.g. %reg1030:6<def> = VMOVD %reg1026, ... unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (PartReDef && - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (PartReDef && (mi->isCopyLike() || + tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))) OldValNo->setCopy(&*mi); // Add the new live interval which replaces the range for the input copy. LiveRange LR(DefIndex, RedefIndex, ValNo); DEBUG(dbgs() << " replace range with " << LR); interval.addRange(LR); - ValNo->addKill(RedefIndex); // If this redefinition is dead, we need to add a dummy unit live // range covering the def slot. @@ -496,7 +486,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, VNInfo *ValNo; MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isExtractSubreg() || mi->isInsertSubreg() || mi->isSubregToReg()|| + if (mi->isCopyLike() || tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) CopyMI = mi; ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator); @@ -504,7 +494,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, SlotIndex killIndex = getMBBEndIdx(mbb); LiveRange LR(defIndex, killIndex, ValNo); interval.addRange(LR); - ValNo->addKill(indexes_->getTerminatorGap(mbb)); ValNo->setHasPHIKill(true); DEBUG(dbgs() << " phi-join +" << LR); } else { @@ -600,7 +589,6 @@ exit: ValNo->setHasRedefByEC(true); LiveRange LR(start, end, ValNo); interval.addRange(LR); - LR.valno->addKill(end); DEBUG(dbgs() << " +" << LR << '\n'); } @@ -615,7 +603,7 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, else if (allocatableRegs_[MO.getReg()]) { MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (MI->isExtractSubreg() || MI->isInsertSubreg() || MI->isSubregToReg() || + if (MI->isCopyLike() || tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) CopyMI = MI; handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, @@ -701,7 +689,6 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, LiveRange LR(start, end, vni); interval.addRange(LR); - LR.valno->addKill(end); DEBUG(dbgs() << " +" << LR << '\n'); } @@ -787,37 +774,6 @@ LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) { return NewLI; } -/// getVNInfoSourceReg - Helper function that parses the specified VNInfo -/// copy field and returns the source register that defines it. -unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const { - if (!VNI->getCopy()) - return 0; - - if (VNI->getCopy()->isExtractSubreg()) { - // If it's extracting out of a physical register, return the sub-register. - unsigned Reg = VNI->getCopy()->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - unsigned SrcSubReg = VNI->getCopy()->getOperand(2).getImm(); - unsigned DstSubReg = VNI->getCopy()->getOperand(0).getSubReg(); - if (SrcSubReg == DstSubReg) - // %reg1034:3<def> = EXTRACT_SUBREG %EDX, 3 - // reg1034 can still be coalesced to EDX. 
- return Reg; - assert(DstSubReg == 0); - Reg = tri_->getSubReg(Reg, VNI->getCopy()->getOperand(2).getImm()); - } - return Reg; - } else if (VNI->getCopy()->isInsertSubreg() || - VNI->getCopy()->isSubregToReg()) - return VNI->getCopy()->getOperand(2).getReg(); - - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*VNI->getCopy(), SrcReg, DstReg, SrcSubReg, DstSubReg)) - return SrcReg; - llvm_unreachable("Unrecognized copy instruction!"); - return 0; -} - //===----------------------------------------------------------------------===// // Register allocator hooks. // @@ -991,22 +947,22 @@ bool LiveIntervals::tryFoldMemoryOperand(MachineInstr* &MI, if (DefMI && (MRInfo & VirtRegMap::isMod)) return false; - MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(*mf_, MI, FoldOps, Slot) - : tii_->foldMemoryOperand(*mf_, MI, FoldOps, DefMI); + MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(MI, FoldOps, Slot) + : tii_->foldMemoryOperand(MI, FoldOps, DefMI); if (fmi) { // Remember this instruction uses the spill slot. if (isSS) vrm.addSpillSlotUse(Slot, fmi); // Attempt to fold the memory reference into the instruction. If // we can do this, we don't need to insert spill code. - MachineBasicBlock &MBB = *MI->getParent(); if (isSS && !mf_->getFrameInfo()->isImmutableObjectIndex(Slot)) vrm.virtFolded(Reg, MI, fmi, (VirtRegMap::ModRef)MRInfo); vrm.transferSpillPts(MI, fmi); vrm.transferRestorePts(MI, fmi); vrm.transferEmergencySpills(MI, fmi); ReplaceMachineInstrInMaps(MI, fmi); - MI = MBB.insert(MBB.erase(MI), fmi); + MI->eraseFromParent(); + MI = fmi; ++numFolds; return true; } @@ -1098,7 +1054,6 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, if (!mop.isReg()) continue; unsigned Reg = mop.getReg(); - unsigned RegI = Reg; if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; if (Reg != li.reg) @@ -1140,26 +1095,8 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, // // Keep track of whether we replace a use and/or def so that we can // create the spill interval with the appropriate range. - - HasUse = mop.isUse(); - HasDef = mop.isDef(); SmallVector<unsigned, 2> Ops; - Ops.push_back(i); - for (unsigned j = i+1, e = MI->getNumOperands(); j != e; ++j) { - const MachineOperand &MOj = MI->getOperand(j); - if (!MOj.isReg()) - continue; - unsigned RegJ = MOj.getReg(); - if (RegJ == 0 || TargetRegisterInfo::isPhysicalRegister(RegJ)) - continue; - if (RegJ == RegI) { - Ops.push_back(j); - if (!MOj.isUndef()) { - HasUse |= MOj.isUse(); - HasDef |= MOj.isDef(); - } - } - } + tie(HasUse, HasDef) = MI->readsWritesVirtualRegister(Reg, &Ops); // Create a new virtual register for the spill interval. 
 // Create the new register now so we can map the fold instruction
@@ -1294,16 +1231,7 @@ bool LiveIntervals::anyKillInMBBAfterIdx(const LiveInterval &li,
 const VNInfo *VNI,
 MachineBasicBlock *MBB,
 SlotIndex Idx) const {
- SlotIndex End = getMBBEndIdx(MBB);
- for (unsigned j = 0, ee = VNI->kills.size(); j != ee; ++j) {
- if (VNI->kills[j].isPHI())
- continue;
-
- SlotIndex KillIdx = VNI->kills[j];
- if (KillIdx > Idx && KillIdx <= End)
- return true;
- }
- return false;
+ return li.killedInRange(Idx.getNextSlot(), getMBBEndIdx(MBB));
 }
 
 /// RewriteInfo - Keep track of machine instrs that will be rewritten
@@ -1312,10 +1240,7 @@ namespace {
 struct RewriteInfo {
 SlotIndex Index;
 MachineInstr *MI;
- bool HasUse;
- bool HasDef;
- RewriteInfo(SlotIndex i, MachineInstr *mi, bool u, bool d)
- : Index(i), MI(mi), HasUse(u), HasDef(d) {}
+ RewriteInfo(SlotIndex i, MachineInstr *mi) : Index(i), MI(mi) {}
 };
 
 struct RewriteInfoCompare {
@@ -1394,7 +1319,7 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
 // easily see a situation where both registers are reloaded before
 // the INSERT_SUBREG and both target registers that would overlap.
 continue;
- RewriteMIs.push_back(RewriteInfo(index, MI, O.isUse(), O.isDef()));
+ RewriteMIs.push_back(RewriteInfo(index, MI));
 }
 std::sort(RewriteMIs.begin(), RewriteMIs.end(), RewriteInfoCompare());
 
@@ -1404,18 +1329,11 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
 RewriteInfo &rwi = RewriteMIs[i];
 ++i;
 SlotIndex index = rwi.Index;
- bool MIHasUse = rwi.HasUse;
- bool MIHasDef = rwi.HasDef;
 MachineInstr *MI = rwi.MI;
 // If MI defines and/or uses the same register multiple times, then there
 // are multiple entries.
- unsigned NumUses = MIHasUse;
 while (i != e && RewriteMIs[i].MI == MI) {
 assert(RewriteMIs[i].Index == index);
- bool isUse = RewriteMIs[i].HasUse;
- if (isUse) ++NumUses;
- MIHasUse |= isUse;
- MIHasDef |= RewriteMIs[i].HasDef;
 ++i;
 }
 MachineBasicBlock *MBB = MI->getParent();
@@ -1440,7 +1358,8 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
 // = use
 // It's better to start a new interval to avoid artificially
 // extending the new interval.
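+ // readsWritesVirtualRegister returns a (reads, writes) pair, so this
+ // matches instructions that only define li.reg without reading it.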
- if (MIHasDef && !MIHasUse) { + if (MI->readsWritesVirtualRegister(li.reg) == + std::make_pair(false,true)) { MBBVRegsMap.erase(MBB->getNumber()); ThisVReg = 0; } @@ -1652,103 +1571,9 @@ LiveIntervals::normalizeSpillWeights(std::vector<LiveInterval*> &NewLIs) { } std::vector<LiveInterval*> LiveIntervals:: -addIntervalsForSpillsFast(const LiveInterval &li, - const MachineLoopInfo *loopInfo, - VirtRegMap &vrm) { - unsigned slot = vrm.assignVirt2StackSlot(li.reg); - - std::vector<LiveInterval*> added; - - assert(li.isSpillable() && "attempt to spill already spilled interval!"); - - DEBUG({ - dbgs() << "\t\t\t\tadding intervals for spills for interval: "; - li.dump(); - dbgs() << '\n'; - }); - - const TargetRegisterClass* rc = mri_->getRegClass(li.reg); - - MachineRegisterInfo::reg_iterator RI = mri_->reg_begin(li.reg); - while (RI != mri_->reg_end()) { - MachineInstr* MI = &*RI; - - SmallVector<unsigned, 2> Indices; - bool HasUse = false; - bool HasDef = false; - - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& mop = MI->getOperand(i); - if (!mop.isReg() || mop.getReg() != li.reg) continue; - - HasUse |= MI->getOperand(i).isUse(); - HasDef |= MI->getOperand(i).isDef(); - - Indices.push_back(i); - } - - if (!tryFoldMemoryOperand(MI, vrm, NULL, getInstructionIndex(MI), - Indices, true, slot, li.reg)) { - unsigned NewVReg = mri_->createVirtualRegister(rc); - vrm.grow(); - vrm.assignVirt2StackSlot(NewVReg, slot); - - // create a new register for this spill - LiveInterval &nI = getOrCreateInterval(NewVReg); - nI.markNotSpillable(); - - // Rewrite register operands to use the new vreg. - for (SmallVectorImpl<unsigned>::iterator I = Indices.begin(), - E = Indices.end(); I != E; ++I) { - MI->getOperand(*I).setReg(NewVReg); - - if (MI->getOperand(*I).isUse()) - MI->getOperand(*I).setIsKill(true); - } - - // Fill in the new live interval. 
- SlotIndex index = getInstructionIndex(MI); - if (HasUse) { - LiveRange LR(index.getLoadIndex(), index.getUseIndex(), - nI.getNextValue(SlotIndex(), 0, false, - getVNInfoAllocator())); - DEBUG(dbgs() << " +" << LR); - nI.addRange(LR); - vrm.addRestorePoint(NewVReg, MI); - } - if (HasDef) { - LiveRange LR(index.getDefIndex(), index.getStoreIndex(), - nI.getNextValue(SlotIndex(), 0, false, - getVNInfoAllocator())); - DEBUG(dbgs() << " +" << LR); - nI.addRange(LR); - vrm.addSpillPoint(NewVReg, true, MI); - } - - added.push_back(&nI); - - DEBUG({ - dbgs() << "\t\t\t\tadded new interval: "; - nI.dump(); - dbgs() << '\n'; - }); - } - - - RI = mri_->reg_begin(li.reg); - } - - return added; -} - -std::vector<LiveInterval*> LiveIntervals:: addIntervalsForSpills(const LiveInterval &li, SmallVectorImpl<LiveInterval*> &SpillIs, const MachineLoopInfo *loopInfo, VirtRegMap &vrm) { - - if (EnableFastSpilling) - return addIntervalsForSpillsFast(li, loopInfo, vrm); - assert(li.isSpillable() && "attempt to spill already spilled interval!"); DEBUG({ @@ -2184,7 +2009,6 @@ LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg, SlotIndex(getInstructionIndex(startInst).getDefIndex()), startInst, true, getVNInfoAllocator()); VN->setHasPHIKill(true); - VN->kills.push_back(indexes_->getTerminatorGap(startInst->getParent())); LiveRange LR( SlotIndex(getInstructionIndex(startInst).getDefIndex()), getMBBEndIdx(startInst->getParent()), VN); diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index 798b9b9..709e2c6 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -35,8 +35,8 @@ void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const { } void LiveStacks::releaseMemory() { - // Release VNInfo memroy regions after all VNInfo objects are dtor'd. - VNInfoAllocator.DestroyAll(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. + VNInfoAllocator.Reset(); S2IMap.clear(); S2RCMap.clear(); } diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 079684e..41b891d 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -286,7 +286,7 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) { MachineInstr *LastDef = PhysRegDef[Reg]; MachineInstr *LastUse = PhysRegUse[Reg]; if (!LastDef && !LastUse) - return false; + return 0; MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; @@ -609,7 +609,12 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // Finally, if the last instruction in the block is a return, make sure to // mark it as using all of the live-out values in the function. - if (!MBB->empty() && MBB->back().getDesc().isReturn()) { + // Things marked both call and return are tail calls; do not do this for + // them. The tail callee need not take the same registers as input + // that it produces as output, and there are dependencies for its input + // registers elsewhere. 
+ if (!MBB->empty() && MBB->back().getDesc().isReturn() + && !MBB->back().getDesc().isCall()) { MachineInstr *Ret = &MBB->back(); for (MachineRegisterInfo::liveout_iterator diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp index b0348a5..dfd4eae 100644 --- a/lib/CodeGen/LowerSubregs.cpp +++ b/lib/CodeGen/LowerSubregs.cpp @@ -53,15 +53,15 @@ namespace { bool runOnMachineFunction(MachineFunction&); private: - bool LowerExtract(MachineInstr *MI); - bool LowerInsert(MachineInstr *MI); bool LowerSubregToReg(MachineInstr *MI); + bool LowerCopy(MachineInstr *MI); void TransferDeadFlag(MachineInstr *MI, unsigned DstReg, const TargetRegisterInfo *TRI); void TransferKillFlag(MachineInstr *MI, unsigned SrcReg, const TargetRegisterInfo *TRI, bool AddIfNotFound = false); + void TransferImplicitDefs(MachineInstr *MI); }; char LowerSubregsInstructionPass::ID = 0; @@ -83,7 +83,7 @@ LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI, if (MII->addRegisterDead(DstReg, TRI)) break; assert(MII != MI->getParent()->begin() && - "copyRegToReg output doesn't reference destination register!"); + "copyPhysReg output doesn't reference destination register!"); } } @@ -100,64 +100,24 @@ LowerSubregsInstructionPass::TransferKillFlag(MachineInstr *MI, if (MII->addRegisterKilled(SrcReg, TRI, AddIfNotFound)) break; assert(MII != MI->getParent()->begin() && - "copyRegToReg output doesn't reference source register!"); + "copyPhysReg output doesn't reference source register!"); } } -bool LowerSubregsInstructionPass::LowerExtract(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - - assert(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() && - MI->getOperand(1).isReg() && MI->getOperand(1).isUse() && - MI->getOperand(2).isImm() && "Malformed extract_subreg"); - - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SuperReg = MI->getOperand(1).getReg(); - unsigned SubIdx = MI->getOperand(2).getImm(); - unsigned SrcReg = TRI->getSubReg(SuperReg, SubIdx); - - assert(TargetRegisterInfo::isPhysicalRegister(SuperReg) && - "Extract supperg source must be a physical register"); - assert(TargetRegisterInfo::isPhysicalRegister(DstReg) && - "Extract destination must be in a physical register"); - assert(SrcReg && "invalid subregister index for register"); - - DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); - - if (SrcReg == DstReg) { - // No need to insert an identity copy instruction. - if (MI->getOperand(1).isKill()) { - // We must make sure the super-register gets killed. Replace the - // instruction with KILL. - MI->setDesc(TII->get(TargetOpcode::KILL)); - MI->RemoveOperand(2); // SubIdx - DEBUG(dbgs() << "subreg: replace by: " << *MI); - return true; - } - - DEBUG(dbgs() << "subreg: eliminated!"); - } else { - // Insert copy - const TargetRegisterClass *TRCS = TRI->getPhysicalRegisterRegClass(DstReg); - const TargetRegisterClass *TRCD = TRI->getPhysicalRegisterRegClass(SrcReg); - bool Emitted = TII->copyRegToReg(*MBB, MI, DstReg, SrcReg, TRCD, TRCS, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Subreg and Dst must be of compatible register class"); - // Transfer the kill/dead flags, if needed. - if (MI->getOperand(0).isDead()) - TransferDeadFlag(MI, DstReg, TRI); - if (MI->getOperand(1).isKill()) - TransferKillFlag(MI, SuperReg, TRI, true); - DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "subreg: " << *(--dMI); - }); +/// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered +/// replacement instructions immediately precede it. 
Copy any implicit-def
+/// operands from MI to the replacement instruction.
+void
+LowerSubregsInstructionPass::TransferImplicitDefs(MachineInstr *MI) {
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isImplicit() || MO.isUse())
+ continue;
+ CopyMI->addOperand(MachineOperand::CreateReg(MO.getReg(), true, true));
 }
-
- DEBUG(dbgs() << '\n');
- MBB->erase(MI);
- return true;
 }
 
 bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
@@ -166,10 +126,10 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
 MI->getOperand(1).isImm() &&
 (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
 MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
-
+
 unsigned DstReg = MI->getOperand(0).getReg();
 unsigned InsReg = MI->getOperand(2).getReg();
- unsigned InsSIdx = MI->getOperand(2).getSubReg();
+ assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
 unsigned SubIdx = MI->getOperand(3).getImm();
 
 assert(SubIdx != 0 && "Invalid index for insert_subreg");
@@ -182,27 +142,25 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
 DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
 
- if (DstSubReg == InsReg && InsSIdx == 0) {
+ if (DstSubReg == InsReg) {
 // No need to insert an identity copy instruction.
 // Watch out for a case like this:
- // %RAX<def> = ...
- // %RAX<def> = SUBREG_TO_REG 0, %EAX:3<kill>, 3
- // The first def is defining RAX, not EAX so the top bits were not
- // zero extended.
+ // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
+ // We must leave %RAX live.
+ if (DstReg != InsReg) {
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ MI->RemoveOperand(3); // SubIdx
+ MI->RemoveOperand(1); // Imm
+ DEBUG(dbgs() << "subreg: replace by: " << *MI);
+ return true;
+ }
 DEBUG(dbgs() << "subreg: eliminated!");
 } else {
- // Insert sub-register copy
- const TargetRegisterClass *TRC0= TRI->getPhysicalRegisterRegClass(DstSubReg);
- const TargetRegisterClass *TRC1= TRI->getPhysicalRegisterRegClass(InsReg);
- bool Emitted = TII->copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1,
- MI->getDebugLoc());
- (void)Emitted;
- assert(Emitted && "Subreg and Dst must be of compatible register class");
+ TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
+ MI->getOperand(2).isKill());
 // Transfer the kill/dead flags, if needed.
if (MI->getOperand(0).isDead()) TransferDeadFlag(MI, DstSubReg, TRI); - if (MI->getOperand(2).isKill()) - TransferKillFlag(MI, InsReg, TRI); DEBUG({ MachineBasicBlock::iterator dMI = MI; dbgs() << "subreg: " << *(--dMI); @@ -214,87 +172,39 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) { return true; } -bool LowerSubregsInstructionPass::LowerInsert(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) && - (MI->getOperand(1).isReg() && MI->getOperand(1).isUse()) && - (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) && - MI->getOperand(3).isImm() && "Invalid insert_subreg"); - - unsigned DstReg = MI->getOperand(0).getReg(); -#ifndef NDEBUG - unsigned SrcReg = MI->getOperand(1).getReg(); -#endif - unsigned InsReg = MI->getOperand(2).getReg(); - unsigned SubIdx = MI->getOperand(3).getImm(); +bool LowerSubregsInstructionPass::LowerCopy(MachineInstr *MI) { + MachineOperand &DstMO = MI->getOperand(0); + MachineOperand &SrcMO = MI->getOperand(1); - assert(DstReg == SrcReg && "insert_subreg not a two-address instruction?"); - assert(SubIdx != 0 && "Invalid index for insert_subreg"); - unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx); - assert(DstSubReg && "invalid subregister index for register"); - assert(TargetRegisterInfo::isPhysicalRegister(SrcReg) && - "Insert superreg source must be in a physical register"); - assert(TargetRegisterInfo::isPhysicalRegister(InsReg) && - "Inserted value must be in a physical register"); - - DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); - - if (DstSubReg == InsReg) { - // No need to insert an identity copy instruction. If the SrcReg was - // <undef>, we need to make sure it is alive by inserting a KILL - if (MI->getOperand(1).isUndef() && !MI->getOperand(0).isDead()) { - MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstReg); - if (MI->getOperand(2).isUndef()) - MIB.addReg(InsReg, RegState::Undef); - else - MIB.addReg(InsReg, RegState::Kill); - } else { - DEBUG(dbgs() << "subreg: eliminated!\n"); - MBB->erase(MI); + if (SrcMO.getReg() == DstMO.getReg()) { + DEBUG(dbgs() << "identity copy: " << *MI); + // No need to insert an identity copy instruction, but replace with a KILL + // if liveness is changed. + if (DstMO.isDead() || SrcMO.isUndef() || MI->getNumOperands() > 2) { + // We must make sure the super-register gets killed. Replace the + // instruction with KILL. + MI->setDesc(TII->get(TargetOpcode::KILL)); + DEBUG(dbgs() << "replaced by: " << *MI); return true; } - } else { - // Insert sub-register copy - const TargetRegisterClass *TRC0= TRI->getPhysicalRegisterRegClass(DstSubReg); - const TargetRegisterClass *TRC1= TRI->getPhysicalRegisterRegClass(InsReg); - if (MI->getOperand(2).isUndef()) - // If the source register being inserted is undef, then this becomes a - // KILL. - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstSubReg); - else { - bool Emitted = TII->copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Subreg and Dst must be of compatible register class"); - } - MachineBasicBlock::iterator CopyMI = MI; - --CopyMI; - - // INSERT_SUBREG is a two-address instruction so it implicitly kills SrcReg. - if (!MI->getOperand(1).isUndef()) - CopyMI->addOperand(MachineOperand::CreateReg(DstReg, false, true, true)); - - // Transfer the kill/dead flags, if needed. 
- if (MI->getOperand(0).isDead()) { - TransferDeadFlag(MI, DstSubReg, TRI); - } else { - // Make sure the full DstReg is live after this replacement. - CopyMI->addOperand(MachineOperand::CreateReg(DstReg, true, true)); - } - - // Make sure the inserted register gets killed - if (MI->getOperand(2).isKill() && !MI->getOperand(2).isUndef()) - TransferKillFlag(MI, InsReg, TRI); + // Vanilla identity copy. + MI->eraseFromParent(); + return true; } - DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "subreg: " << *(--dMI) << "\n"; - }); + DEBUG(dbgs() << "real copy: " << *MI); + TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), + DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill()); - MBB->erase(MI); + if (DstMO.isDead()) + TransferDeadFlag(MI, DstMO.getReg(), TRI); + if (MI->getNumOperands() > 2) + TransferImplicitDefs(MI); + DEBUG({ + MachineBasicBlock::iterator dMI = MI; + dbgs() << "replaced by: " << *(--dMI); + }); + MI->eraseFromParent(); return true; } @@ -317,12 +227,13 @@ bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) { mi != me;) { MachineBasicBlock::iterator nmi = llvm::next(mi); MachineInstr *MI = mi; - if (MI->isExtractSubreg()) { - MadeChange |= LowerExtract(MI); - } else if (MI->isInsertSubreg()) { - MadeChange |= LowerInsert(MI); - } else if (MI->isSubregToReg()) { + assert(!MI->isInsertSubreg() && "INSERT_SUBREG should no longer appear"); + assert(MI->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + "EXTRACT_SUBREG should no longer appear"); + if (MI->isSubregToReg()) { MadeChange |= LowerSubregToReg(MI); + } else if (MI->isCopy()) { + MadeChange |= LowerCopy(MI); } mi = nmi; } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index eaaa1f8..a27ee47 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -13,7 +13,10 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/BasicBlock.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -136,6 +139,13 @@ void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) { Parent->getParent()->DeleteMachineInstr(MI); } +MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { + iterator I = begin(); + while (I != end() && I->isPHI()) + ++I; + return I; +} + MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() { iterator I = end(); while (I != begin() && (--I)->getDesc().isTerminator()) @@ -245,6 +255,7 @@ void MachineBasicBlock::updateTerminator() { MachineBasicBlock *TBB = 0, *FBB = 0; SmallVector<MachineOperand, 4> Cond; + DebugLoc dl; // FIXME: this is nowhere bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond); (void) B; assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -259,7 +270,7 @@ void MachineBasicBlock::updateTerminator() { // its layout successor, insert a branch. 
TBB = *succ_begin(); if (!isLayoutSuccessor(TBB)) - TII->InsertBranch(*this, TBB, 0, Cond); + TII->InsertBranch(*this, TBB, 0, Cond, dl); } } else { if (FBB) { @@ -270,10 +281,10 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) return; TII->RemoveBranch(*this); - TII->InsertBranch(*this, FBB, 0, Cond); + TII->InsertBranch(*this, FBB, 0, Cond, dl); } else if (isLayoutSuccessor(FBB)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, 0, Cond); + TII->InsertBranch(*this, TBB, 0, Cond, dl); } } else { // The block has a fallthrough conditional branch. @@ -284,14 +295,14 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) { // We can't reverse the condition, add an unconditional branch. Cond.clear(); - TII->InsertBranch(*this, MBBA, 0, Cond); + TII->InsertBranch(*this, MBBA, 0, Cond, dl); return; } TII->RemoveBranch(*this); - TII->InsertBranch(*this, MBBA, 0, Cond); + TII->InsertBranch(*this, MBBA, 0, Cond, dl); } else if (!isLayoutSuccessor(MBBA)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, MBBA, Cond); + TII->InsertBranch(*this, TBB, MBBA, Cond, dl); } } } @@ -331,12 +342,32 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { if (this == fromMBB) return; - for (MachineBasicBlock::succ_iterator I = fromMBB->succ_begin(), - E = fromMBB->succ_end(); I != E; ++I) - addSuccessor(*I); + while (!fromMBB->succ_empty()) { + MachineBasicBlock *Succ = *fromMBB->succ_begin(); + addSuccessor(Succ); + fromMBB->removeSuccessor(Succ); + } +} + +void +MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { + if (this == fromMBB) + return; - while (!fromMBB->succ_empty()) - fromMBB->removeSuccessor(fromMBB->succ_begin()); + while (!fromMBB->succ_empty()) { + MachineBasicBlock *Succ = *fromMBB->succ_begin(); + addSuccessor(Succ); + fromMBB->removeSuccessor(Succ); + + // Fix up any PHI nodes in the successor. + for (MachineBasicBlock::iterator MI = Succ->begin(), ME = Succ->end(); + MI != ME && MI->isPHI(); ++MI) + for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) { + MachineOperand &MO = MI->getOperand(i); + if (MO.getMBB() == fromMBB) + MO.setMBB(this); + } + } } bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const { @@ -395,6 +426,82 @@ bool MachineBasicBlock::canFallThrough() { return FBB == 0; } +MachineBasicBlock * +MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { + MachineFunction *MF = getParent(); + DebugLoc dl; // FIXME: this is nowhere + + // We may need to update this's terminator, but we can't do that if AnalyzeBranch + // fails. If this uses a jump table, we won't touch it. + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) + return NULL; + + MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); + MF->insert(llvm::next(MachineFunction::iterator(this)), NMBB); + DEBUG(dbgs() << "PHIElimination splitting critical edge:" + " BB#" << getNumber() + << " -- BB#" << NMBB->getNumber() + << " -- BB#" << Succ->getNumber() << '\n'); + + ReplaceUsesOfBlockWith(Succ, NMBB); + updateTerminator(); + + // Insert unconditional "jump Succ" instruction in NMBB if necessary. 
+ NMBB->addSuccessor(Succ);
+ if (!NMBB->isLayoutSuccessor(Succ)) {
+ Cond.clear();
+ MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl);
+ }
+
+ // Fix PHI nodes in Succ so they refer to NMBB instead of this
+ for (MachineBasicBlock::iterator i = Succ->begin(), e = Succ->end();
+ i != e && i->isPHI(); ++i)
+ for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
+ if (i->getOperand(ni+1).getMBB() == this)
+ i->getOperand(ni+1).setMBB(NMBB);
+
+ if (LiveVariables *LV =
+ P->getAnalysisIfAvailable<LiveVariables>())
+ LV->addNewBlock(NMBB, this, Succ);
+
+ if (MachineDominatorTree *MDT =
+ P->getAnalysisIfAvailable<MachineDominatorTree>())
+ MDT->addNewBlock(NMBB, this);
+
+ if (MachineLoopInfo *MLI =
+ P->getAnalysisIfAvailable<MachineLoopInfo>())
+ if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+ // If one or the other block is not in a loop, the new block is not in
+ // a loop either, and thus LI doesn't need to be updated.
+ if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+ if (TIL == DestLoop) {
+ // Both are in the same loop, so NMBB joins that loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == Succ &&
+ "Should not create irreducible loops!");
+ if (MachineLoop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NMBB, MLI->getBase());
+ }
+ }
+ }
+
+ return NMBB;
+}
+
 /// removeFromParent - This method unlinks 'this' from the containing function,
 /// and returns it, but does not delete it.
 MachineBasicBlock *MachineBasicBlock::removeFromParent() {
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 6f4f7a8..833cc00 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -30,9 +30,7 @@ using namespace llvm;
 STATISTIC(NumCoalesces, "Number of copies coalesced");
 STATISTIC(NumCSEs, "Number of common subexpressions eliminated");
-
-static cl::opt<bool> CSEPhysDef("machine-cse-phys-defs",
- cl::init(false), cl::Hidden);
+STATISTIC(NumPhysCSEs, "Number of physreg-defining common subexprs eliminated");
 
 namespace {
 class MachineCSE : public MachineFunctionPass {
@@ -128,6 +126,28 @@ bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
 ++NumCoalesces;
 Changed = true;
 }
+
+ if (!DefMI->isCopy())
+ continue;
+ SrcReg = DefMI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ continue;
+ if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg())
+ continue;
+ const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ const TargetRegisterClass *NewRC = getCommonSubClass(RC, SRC);
+ if (!NewRC)
+ continue;
+ DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ DEBUG(dbgs() << "*** to: " << *MI);
+ MO.setReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ if (NewRC != SRC)
+ MRI->setRegClass(SrcReg, NewRC);
+ DefMI->eraseFromParent();
+ ++NumCoalesces;
+ Changed = true;
 }
 
 return Changed;
@@ -172,7 +192,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
 /// hasLivePhysRegDefUse - Return true if the specified instruction reads / writes
 /// physical registers (except for dead defs of physical registers). It also
-/// returns the physical register def by reference if it's the only one.
+/// returns the physical register def by reference if it's the only one and the
+/// instruction does not use a physical register.
 bool MachineCSE::hasLivePhysRegDefUse(const MachineInstr *MI,
 const MachineBasicBlock *MBB,
 unsigned &PhysDef) const {
@@ -186,9 +207,11 @@ bool MachineCSE::hasLivePhysRegDefUse(const MachineInstr *MI,
 continue;
 if (TargetRegisterInfo::isVirtualRegister(Reg))
 continue;
- if (MO.isUse())
+ if (MO.isUse()) {
 // Can't touch anything that reads a physical register.
+ PhysDef = 0;
 return true;
+ }
 if (MO.isDead())
 // If the def is dead, it's ok.
 continue;
@@ -240,8 +263,8 @@ bool MachineCSE::PhysRegDefReaches(MachineInstr *CSMI, MachineInstr *MI,
 
 static bool isCopy(const MachineInstr *MI, const TargetInstrInfo *TII) {
 unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
- return TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) ||
- MI->isExtractSubreg() || MI->isInsertSubreg() || MI->isSubregToReg();
+ return MI->isCopyLike() ||
+ TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
 }
 
 bool MachineCSE::isCSECandidate(MachineInstr *MI) {
@@ -356,6 +379,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
 if (!isCSECandidate(MI))
 continue;
 
+ bool DefPhys = false;
 bool FoundCSE = VNT.count(MI);
 if (!FoundCSE) {
 // Look for trivial copy coalescing opportunities.
@@ -376,11 +400,13 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
 // ... Unless the CS is local and it also defines the physical register
 // which is not clobbered in between.
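+ // PhysRegDefReaches checks that the candidate's def of PhysDef reaches MI
+ // with no intervening clobber of the physical register.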
- if (PhysDef && CSEPhysDef) { + if (PhysDef) { unsigned CSVN = VNT.lookup(MI); MachineInstr *CSMI = Exps[CSVN]; - if (PhysRegDefReaches(CSMI, MI, PhysDef)) + if (PhysRegDefReaches(CSMI, MI, PhysDef)) { FoundCSE = true; + DefPhys = true; + } } } @@ -426,6 +452,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { } MI->eraseFromParent(); ++NumCSEs; + if (DefPhys) + ++NumPhysCSEs; } else { DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 4088739..b5f8fbb 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -46,7 +46,6 @@ MachineDominatorTree::MachineDominatorTree() } MachineDominatorTree::~MachineDominatorTree() { - DT->releaseMemory(); delete DT; } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index a38c881..666120f 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -378,7 +378,7 @@ void MachineFunction::viewCFG() const #ifndef NDEBUG ViewGraph(this, "mf" + getFunction()->getNameStr()); #else - errs() << "SelectionDAG::viewGraph is only available in debug builds on " + errs() << "MachineFunction::viewCFG is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } @@ -388,7 +388,7 @@ void MachineFunction::viewCFGOnly() const #ifndef NDEBUG ViewGraph(this, "mf" + getFunction()->getNameStr(), true); #else - errs() << "SelectionDAG::viewGraph is only available in debug builds on " + errs() << "MachineFunction::viewCFGOnly is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } @@ -438,10 +438,16 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, /// index with a negative value. /// int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable, bool isSS) { + bool Immutable) { assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); - Objects.insert(Objects.begin(), StackObject(Size, 1, SPOffset, Immutable, - isSS)); + // The alignment of the frame index can be determined from its offset from + // the incoming frame position. If the frame object is at offset 32 and + // the stack is guaranteed to be 16-byte aligned, then we know that the + // object is 16-byte aligned. 
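+ // For example, MinAlign(20, 16) == 4: an object 20 bytes into a 16-byte
+ // aligned frame can only be assumed to be 4-byte aligned.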
+ unsigned StackAlign = TFI.getStackAlignment(); + unsigned Align = MinAlign(SPOffset, StackAlign); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/false)); return -++NumFixedObjects; } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e54cd5c..6b2e985 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -111,6 +111,26 @@ void MachineOperand::setReg(unsigned Reg) { Contents.Reg.RegNo = Reg; } +void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx, + const TargetRegisterInfo &TRI) { + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + if (SubIdx && getSubReg()) + SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg()); + setReg(Reg); + if (SubIdx) + setSubReg(SubIdx); +} + +void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) { + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + if (getSubReg()) { + Reg = TRI.getSubReg(Reg, getSubReg()); + assert(Reg && "Invalid SubReg for physical register"); + setSubReg(0); + } + setReg(Reg); +} + /// ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. If an operand is known to be an immediate already, /// the setImm method should be used. @@ -861,14 +881,14 @@ int MachineInstr::findFirstPredOperandIdx() const { bool MachineInstr:: isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const { if (isInlineAsm()) { - assert(DefOpIdx >= 2); + assert(DefOpIdx >= 3); const MachineOperand &MO = getOperand(DefOpIdx); if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0) return false; // Determine the actual operand index that corresponds to this index. unsigned DefNo = 0; unsigned DefPart = 0; - for (unsigned i = 1, e = getNumOperands(); i < e; ) { + for (unsigned i = 2, e = getNumOperands(); i < e; ) { const MachineOperand &FMO = getOperand(i); // After the normal asm operands there may be additional imp-def regs. if (!FMO.isImm()) @@ -883,7 +903,7 @@ isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const { } ++DefNo; } - for (unsigned i = 1, e = getNumOperands(); i != e; ++i) { + for (unsigned i = 2, e = getNumOperands(); i != e; ++i) { const MachineOperand &FMO = getOperand(i); if (!FMO.isImm()) continue; @@ -926,7 +946,7 @@ isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const { // Find the flag operand corresponding to UseOpIdx unsigned FlagIdx, NumOps=0; - for (FlagIdx = 1; FlagIdx < UseOpIdx; FlagIdx += NumOps+1) { + for (FlagIdx = 2; FlagIdx < UseOpIdx; FlagIdx += NumOps+1) { const MachineOperand &UFMO = getOperand(FlagIdx); // After the normal asm operands there may be additional imp-def regs. if (!UFMO.isImm()) @@ -944,9 +964,9 @@ isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const { if (!DefOpIdx) return true; - unsigned DefIdx = 1; - // Remember to adjust the index. First operand is asm string, then there - // is a flag for each. + unsigned DefIdx = 2; + // Remember to adjust the index. First operand is asm string, second is + // the AlignStack bit, then there is a flag for each. 
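// A sketch of the operand-group walk performed here, under the layout
// just described. The flag decoding below is an assumption for
// illustration only; the real encoding lives in the inline asm flag word.
static unsigned skipAsmGroupsSketch(const MachineInstr &MI, unsigned Groups) {
  unsigned Idx = 2; // operand 0 is the asm string, operand 1 the AlignStack bit
  while (Groups--) {
    // Hypothetical decode: the flag immediate carries the group's size.
    unsigned NumOps = MI.getOperand(Idx).getImm() >> 3;
    Idx += NumOps + 1; // skip the flag itself plus its register operands
  }
  return Idx; // now at the flag operand of the requested group
}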
while (DefNo) { const MachineOperand &FMO = getOperand(DefIdx); assert(FMO.isImm()); @@ -1017,6 +1037,29 @@ void MachineInstr::copyPredicates(const MachineInstr *MI) { } } +void MachineInstr::substituteRegister(unsigned FromReg, + unsigned ToReg, + unsigned SubIdx, + const TargetRegisterInfo &RegInfo) { + if (TargetRegisterInfo::isPhysicalRegister(ToReg)) { + if (SubIdx) + ToReg = RegInfo.getSubReg(ToReg, SubIdx); + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || MO.getReg() != FromReg) + continue; + MO.substPhysReg(ToReg, RegInfo); + } + } else { + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || MO.getReg() != FromReg) + continue; + MO.substVirtReg(ToReg, SubIdx, RegInfo); + } + } +} + /// isSafeToMove - Return true if it is safe to move this instruction. If /// SawStore is set to true, it means that there is a store (or call) between /// the instruction's location and its intended destination. @@ -1168,6 +1211,28 @@ void MachineInstr::dump() const { dbgs() << " " << *this; } +static void printDebugLoc(DebugLoc DL, const MachineFunction *MF, + raw_ostream &CommentOS) { + const LLVMContext &Ctx = MF->getFunction()->getContext(); + if (!DL.isUnknown()) { // Print source line info. + DIScope Scope(DL.getScope(Ctx)); + // Omit the directory, because it's likely to be long and uninteresting. + if (Scope.Verify()) + CommentOS << Scope.getFilename(); + else + CommentOS << "<unknown>"; + CommentOS << ':' << DL.getLine(); + if (DL.getCol() != 0) + CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << " @[ "; + printDebugLoc(InlinedAtDL, MF, CommentOS); + CommentOS << " ]"; + } + } +} + void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { // We can be a bit tidier if we know the TargetMachine and/or MachineFunction. const MachineFunction *MF = 0; @@ -1240,6 +1305,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "!\"" << MDS->getString() << '\"'; else MO.print(OS, TM); + } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) { + OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm()); } else MO.print(OS, TM); } @@ -1265,19 +1332,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { if (!debugLoc.isUnknown() && MF) { if (!HaveSemi) OS << ";"; - - // TODO: print InlinedAtLoc information - - DIScope Scope(debugLoc.getScope(MF->getFunction()->getContext())); OS << " dbg:"; - // Omit the directory, since it's usually long and uninteresting. 
- if (Scope.Verify()) - OS << Scope.getFilename(); - else - OS << "<unknown>"; - OS << ':' << debugLoc.getLine(); - if (debugLoc.getCol() != 0) - OS << ':' << debugLoc.getCol(); + printDebugLoc(debugLoc, MF, OS); } OS << "\n"; @@ -1418,6 +1474,25 @@ void MachineInstr::addRegisterDefined(unsigned IncomingReg, true /*IsImp*/)); } +void MachineInstr::setPhysRegsDeadExcept(const SmallVectorImpl<unsigned> &UsedRegs, + const TargetRegisterInfo &TRI) { + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + bool Dead = true; + for (SmallVectorImpl<unsigned>::const_iterator I = UsedRegs.begin(), + E = UsedRegs.end(); I != E; ++I) + if (TRI.regsOverlap(*I, Reg)) { + Dead = false; + break; + } + // If there are no uses, including partial uses, the def is dead. + if (Dead) MO.setIsDead(); + } +} + unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { unsigned Hash = MI->getOpcode() * 37; diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 6120617..956d21c 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -62,6 +62,7 @@ namespace { // State that is updated as we process loops bool Changed; // True if a loop is changed. + bool FirstInLoop; // True if it's the first LICM in the loop. MachineLoop *CurLoop; // The current loop we are working on. MachineBasicBlock *CurPreheader; // The preheader for CurLoop. @@ -82,7 +83,6 @@ namespace { const char *getPassName() const { return "Machine Instruction LICM"; } - // FIXME: Loop preheaders? virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<MachineLoopInfo>(); @@ -127,8 +127,8 @@ namespace { void AddToLiveIns(unsigned Reg); /// IsLICMCandidate - Returns true if the instruction may be a suitable - /// candidate for LICM. e.g. If the instruction is a call, then it's obviously - /// not safe to hoist it. + /// candidate for LICM. e.g. If the instruction is a call, then it's + /// obviously not safe to hoist it. bool IsLICMCandidate(MachineInstr &I); /// IsLoopInvariantInst - Returns true if the instruction is loop @@ -181,6 +181,10 @@ namespace { /// current loop preheader that may become duplicates of instructions that /// are hoisted out of the loop. void InitCSEMap(MachineBasicBlock *BB); + + /// getCurPreheader - Get the preheader for the current loop, splitting + /// a critical edge if needed. + MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace @@ -192,12 +196,17 @@ FunctionPass *llvm::createMachineLICMPass(bool PreRegAlloc) { return new MachineLICM(PreRegAlloc); } -/// LoopIsOuterMostWithPreheader - Test if the given loop is the outer-most -/// loop that has a preheader. -static bool LoopIsOuterMostWithPreheader(MachineLoop *CurLoop) { +/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most +/// loop that has a unique predecessor. +static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { + // Check whether this loop even has a unique predecessor. + if (!CurLoop->getLoopPredecessor()) + return false; + // Ok, now check to see if any of its outer loops do. for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop()) - if (L->getLoopPreheader()) + if (L->getLoopPredecessor()) return false; + // None of them did, so this is the outermost with a unique predecessor. 
return true; } @@ -207,7 +216,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { else DEBUG(dbgs() << "******** Post-regalloc Machine LICM ********\n"); - Changed = false; + Changed = FirstInLoop = false; TM = &MF.getTarget(); TII = TM->getInstrInfo(); TRI = TM->getRegisterInfo(); @@ -220,23 +229,17 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { DT = &getAnalysis<MachineDominatorTree>(); AA = &getAnalysis<AliasAnalysis>(); - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I){ - CurLoop = *I; + SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end()); + while (!Worklist.empty()) { + CurLoop = Worklist.pop_back_val(); + CurPreheader = 0; // If this is done before regalloc, only visit outer-most loops with // a unique predecessor. - if (PreRegAlloc && !LoopIsOuterMostWithPreheader(CurLoop)) - continue; - - // Determine the block to which to hoist instructions. If we can't find a - // suitable loop preheader, we can't do any hoisting. - // - // FIXME: We are only hoisting if the basic block coming into this loop - // has only one successor. This isn't the case in general because we haven't - // broken critical edges or added preheaders. - CurPreheader = CurLoop->getLoopPreheader(); - if (!CurPreheader) + if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) { + Worklist.append(CurLoop->begin(), CurLoop->end()); continue; + } if (!PreRegAlloc) HoistRegionPostRA(); @@ -244,6 +247,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { // CSEMap is initialized for loop header when the first instruction is // being hoisted. MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader()); + FirstInLoop = true; HoistRegion(N); CSEMap.clear(); } @@ -436,13 +440,16 @@ void MachineLICM::AddToLiveIns(unsigned Reg) { /// operands that is safe to hoist, this instruction is called to do the /// dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { + MachineBasicBlock *Preheader = getCurPreheader(); + if (!Preheader) return; + // Now move the instructions to the predecessor, inserting it before any // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; - if (CurPreheader->getBasicBlock()) + if (Preheader->getBasicBlock()) dbgs() << " to MachineBasicBlock " - << CurPreheader->getName(); + << Preheader->getName(); if (MI->getParent()->getBasicBlock()) dbgs() << " from MachineBasicBlock " << MI->getParent()->getName(); @@ -451,7 +458,7 @@ void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { // Splice the instruction to the preheader. MachineBasicBlock *MBB = MI->getParent(); - CurPreheader->splice(CurPreheader->getFirstTerminator(), MBB, MI); + Preheader->splice(Preheader->getFirstTerminator(), MBB, MI); // Add register to livein list to all the BBs in the current loop since a // loop invariant must be kept live throughout the whole loop. This is @@ -490,26 +497,16 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N) { /// candidate for LICM. e.g. If the instruction is a call, then it's obviously /// not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { + // It is not profitable to hoist implicit defs. FIXME: Why not? What if they + // are an argument to some other otherwise-hoistable instruction? if (I.isImplicitDef()) return false; - - const TargetInstrDesc &TID = I.getDesc(); - // Ignore stuff that we obviously can't hoist. - if (TID.mayStore() || TID.isCall() || TID.isTerminator() || - TID.hasUnmodeledSideEffects()) + // Check if it's safe to move the instruction.
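// Roughly, the conservative test being delegated to has this shape (a
// sketch built from the checks being deleted here; the real isSafeToMove
// additionally reasons about loads and memory operands via alias analysis):
static bool roughlySafeToHoistSketch(const MachineInstr &MI) {
  const TargetInstrDesc &TID = MI.getDesc();
  if (TID.mayStore() || TID.isCall() || TID.isTerminator() ||
      TID.hasUnmodeledSideEffects())
    return false;        // ordering or side effects could change
  return !TID.mayLoad(); // loads need an aliasing argument to be hoisted
}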
+ bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(TII, AA, DontMoveAcrossStore)) return false; - - if (TID.mayLoad()) { - // Okay, this instruction does a load. As a refinement, we allow the target - // to decide whether the loaded value is actually a constant. If so, we can - // actually use it as a load. - if (!I.isInvariantLoad(AA)) - // FIXME: we should be able to hoist loads with no other side effects if - // there are no other instructions which can change memory in this loop. - // This is a trivial form of alias analysis. - return false; - } + return true; } @@ -754,6 +751,9 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, /// that are safe to hoist, this instruction is called to do the dirty work. /// void MachineLICM::Hoist(MachineInstr *MI) { + MachineBasicBlock *Preheader = getCurPreheader(); + if (!Preheader) return; + // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { // If not, try unfolding a hoistable load. @@ -765,9 +765,9 @@ void MachineLICM::Hoist(MachineInstr *MI) { // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; - if (CurPreheader->getBasicBlock()) + if (Preheader->getBasicBlock()) dbgs() << " to MachineBasicBlock " - << CurPreheader->getName(); + << Preheader->getName(); if (MI->getParent()->getBasicBlock()) dbgs() << " from MachineBasicBlock " << MI->getParent()->getName(); @@ -776,7 +776,10 @@ void MachineLICM::Hoist(MachineInstr *MI) { // If this is the first instruction being hoisted to the preheader, // initialize the CSE map with potential common expressions. - InitCSEMap(CurPreheader); + if (FirstInLoop) { + InitCSEMap(Preheader); + FirstInLoop = false; + } // Look for opportunity to CSE the hoisted instruction. unsigned Opcode = MI->getOpcode(); @@ -784,7 +787,7 @@ void MachineLICM::Hoist(MachineInstr *MI) { CI = CSEMap.find(Opcode); if (!EliminateCSE(MI, CI)) { // Otherwise, splice the instruction to the preheader. - CurPreheader->splice(CurPreheader->getFirstTerminator(),MI->getParent(),MI); + Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop @@ -808,3 +811,30 @@ void MachineLICM::Hoist(MachineInstr *MI) { ++NumHoisted; Changed = true; } + +MachineBasicBlock *MachineLICM::getCurPreheader() { + // Determine the block to which to hoist instructions. If we can't find a + // suitable loop predecessor, we can't do any hoisting. + + // If we've tried to get a preheader and failed, don't try again. 
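// The caching idiom that follows, sketched generically: one pointer
// encodes three states - null (not computed yet), a reserved sentinel
// address (computed, and known to have failed), or a usable block. The
// helper below is illustrative only, not part of the pass.
static MachineBasicBlock *
cachedPreheaderSketch(MachineBasicBlock *&Cache,
                      MachineBasicBlock *(*Compute)()) {
  MachineBasicBlock *Failed = reinterpret_cast<MachineBasicBlock *>(-1);
  if (Cache == Failed)
    return 0;            // a previous attempt failed; don't retry
  if (!Cache) {
    Cache = Compute();   // may legitimately return null
    if (!Cache) {
      Cache = Failed;    // remember the failure for later queries
      return 0;
    }
  }
  return Cache;
}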
+ if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1)) + return 0; + + if (!CurPreheader) { + CurPreheader = CurLoop->getLoopPreheader(); + if (!CurPreheader) { + MachineBasicBlock *Pred = CurLoop->getLoopPredecessor(); + if (!Pred) { + CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1); + return 0; + } + + CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this); + if (!CurPreheader) { + CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1); + return 0; + } + } + } + return CurPreheader; +} diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 70bf7e5..5d852f2 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -20,7 +20,7 @@ using namespace llvm; MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) { VRegInfo.reserve(256); RegAllocHints.reserve(256); - RegClass2VRegMap.resize(TRI.getNumRegClasses()+1); // RC ID starts at 1. + RegClass2VRegMap = new std::vector<unsigned>[TRI.getNumRegClasses()]; UsedPhysRegs.resize(TRI.getNumRegs()); // Create the physreg use/def lists. @@ -37,6 +37,7 @@ MachineRegisterInfo::~MachineRegisterInfo() { "PhysRegUseDefLists has entries after all instructions are deleted"); #endif delete [] PhysRegUseDefLists; + delete [] RegClass2VRegMap; } /// setRegClass - Set the register class of the specified virtual register. @@ -52,7 +53,7 @@ MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) { // Remove from old register class's vregs list. This may be slow but // fortunately this operation is rarely needed. std::vector<unsigned> &VRegs = RegClass2VRegMap[OldRC->getID()]; - std::vector<unsigned>::iterator I=std::find(VRegs.begin(), VRegs.end(), VR); + std::vector<unsigned>::iterator I = std::find(VRegs.begin(), VRegs.end(), VR); VRegs.erase(I); // Add to new register class's vregs list. @@ -174,115 +175,36 @@ unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const { return 0; } -static cl::opt<bool> -SchedLiveInCopies("schedule-livein-copies", cl::Hidden, - cl::desc("Schedule copies of livein registers"), - cl::init(false)); - -/// EmitLiveInCopy - Emit a copy for a live in physical register. If the -/// physical register has only a single copy use, then coalesced the copy -/// if possible. -static void EmitLiveInCopy(MachineBasicBlock *MBB, - MachineBasicBlock::iterator &InsertPos, - unsigned VirtReg, unsigned PhysReg, - const TargetRegisterClass *RC, - DenseMap<MachineInstr*, unsigned> &CopyRegMap, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII) { - unsigned NumUses = 0; - MachineInstr *UseMI = NULL; - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(VirtReg), - UE = MRI.use_end(); UI != UE; ++UI) { - UseMI = &*UI; - if (++NumUses > 1) - break; - } - - // If the number of uses is not one, or the use is not a move instruction, - // don't coalesce. Also, only coalesce away a virtual register to virtual - // register copy. - bool Coalesced = false; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (NumUses == 1 && - TII.isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - TargetRegisterInfo::isVirtualRegister(DstReg)) { - VirtReg = DstReg; - Coalesced = true; - } - - // Now find an ideal location to insert the copy. 
- MachineBasicBlock::iterator Pos = InsertPos; - while (Pos != MBB->begin()) { - MachineInstr *PrevMI = prior(Pos); - DenseMap<MachineInstr*, unsigned>::iterator RI = CopyRegMap.find(PrevMI); - // copyRegToReg might emit multiple instructions to do a copy. - unsigned CopyDstReg = (RI == CopyRegMap.end()) ? 0 : RI->second; - if (CopyDstReg && !TRI.regsOverlap(CopyDstReg, PhysReg)) - // This is what the BB looks like right now: - // r1024 = mov r0 - // ... - // r1 = mov r1024 - // - // We want to insert "r1025 = mov r1". Inserting this copy below the - // move to r1024 makes it impossible for that move to be coalesced. - // - // r1025 = mov r1 - // r1024 = mov r0 - // ... - // r1 = mov 1024 - // r2 = mov 1025 - break; // Woot! Found a good location. - --Pos; - } - - bool Emitted = TII.copyRegToReg(*MBB, Pos, VirtReg, PhysReg, RC, RC, - DebugLoc()); - assert(Emitted && "Unable to issue a live-in copy instruction!\n"); - (void) Emitted; - - CopyRegMap.insert(std::make_pair(prior(Pos), VirtReg)); - if (Coalesced) { - if (&*InsertPos == UseMI) ++InsertPos; - MBB->erase(UseMI); - } -} - /// EmitLiveInCopies - Emit copies to initialize livein virtual registers /// into the given entry block. void MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, const TargetRegisterInfo &TRI, const TargetInstrInfo &TII) { - if (SchedLiveInCopies) { - // Emit the copies at a heuristically-determined location in the block. - DenseMap<MachineInstr*, unsigned> CopyRegMap; - MachineBasicBlock::iterator InsertPos = EntryMBB->begin(); - for (MachineRegisterInfo::livein_iterator LI = livein_begin(), - E = livein_end(); LI != E; ++LI) - if (LI->second) { - const TargetRegisterClass *RC = getRegClass(LI->second); - EmitLiveInCopy(EntryMBB, InsertPos, LI->second, LI->first, - RC, CopyRegMap, *this, TRI, TII); + // Emit the copies into the top of the block. + for (unsigned i = 0, e = LiveIns.size(); i != e; ++i) + if (LiveIns[i].second) { + if (use_empty(LiveIns[i].second)) { + // The livein has no uses. Drop it. + // + // It would be preferable to have isel avoid creating live-in + // records for unused arguments in the first place, but it's + // complicated by the debug info code for arguments. + LiveIns.erase(LiveIns.begin() + i); + --i; --e; + } else { + // Emit a copy. + BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(), + TII.get(TargetOpcode::COPY), LiveIns[i].second) + .addReg(LiveIns[i].first); + + // Add the register to the entry block live-in set. + EntryMBB->addLiveIn(LiveIns[i].first); } - } else { - // Emit the copies into the top of the block. - for (MachineRegisterInfo::livein_iterator LI = livein_begin(), - E = livein_end(); LI != E; ++LI) - if (LI->second) { - const TargetRegisterClass *RC = getRegClass(LI->second); - bool Emitted = TII.copyRegToReg(*EntryMBB, EntryMBB->begin(), - LI->second, LI->first, RC, RC, - DebugLoc()); - assert(Emitted && "Unable to issue a live-in copy instruction!\n"); - (void) Emitted; - } - } - - // Add function live-ins to entry block live-in set. - for (MachineRegisterInfo::livein_iterator I = livein_begin(), - E = livein_end(); I != E; ++I) - EntryMBB->addLiveIn(I->first); + } else { + // Add the register to the entry block live-in set. 
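// For reference, the emission pattern used above, as a minimal sketch: one
// target-independent COPY from the incoming physical register into the
// argument's virtual register at the top of the entry block, using the
// same APIs as the hunk it accompanies.
static void emitLiveInCopySketch(MachineBasicBlock &Entry,
                                 const TargetInstrInfo &TII,
                                 unsigned VirtReg, unsigned PhysReg) {
  BuildMI(Entry, Entry.begin(), DebugLoc(),
          TII.get(TargetOpcode::COPY), VirtReg).addReg(PhysReg);
  Entry.addLiveIn(PhysReg); // the physreg must be live into the block
}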
+ EntryMBB->addLiveIn(LiveIns[i].first); + } } void MachineRegisterInfo::closePhysRegsUsed(const TargetRegisterInfo &TRI) { diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 1610e6c..61334fc 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This pass moves instructions into successor blocks, when possible, so that +// This pass moves instructions into successor blocks when possible, so that // they aren't executed on paths where their results aren't needed. // // This pass is not intended to be a replacement or a complete alternative @@ -45,9 +45,9 @@ namespace { public: static char ID; // Pass identification MachineSinking() : MachineFunctionPass(&ID) {} - + virtual bool runOnMachineFunction(MachineFunction &MF); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); @@ -63,7 +63,7 @@ namespace { bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const; }; } // end anonymous namespace - + char MachineSinking::ID = 0; static RegisterPass<MachineSinking> X("machine-sink", "Machine code sinking"); @@ -72,7 +72,7 @@ FunctionPass *llvm::createMachineSinkingPass() { return new MachineSinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified register /// occur in blocks dominated by the specified block. -bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, +bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const { assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Only makes sense for vregs"); @@ -80,27 +80,30 @@ bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, // This may leave a referencing dbg_value in the original block, before // the definition of the vreg. Dwarf generator handles this although the // user might not get the right info at runtime. - for (MachineRegisterInfo::use_nodbg_iterator I = - RegInfo->use_nodbg_begin(Reg), - E = RegInfo->use_nodbg_end(); I != E; ++I) { + for (MachineRegisterInfo::use_nodbg_iterator + I = RegInfo->use_nodbg_begin(Reg), E = RegInfo->use_nodbg_end(); + I != E; ++I) { // Determine the block of the use. MachineInstr *UseInst = &*I; MachineBasicBlock *UseBlock = UseInst->getParent(); + if (UseInst->isPHI()) { // PHI nodes use the operand in the predecessor block, not the block with // the PHI. UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB(); } + // Check that it dominates. if (!DT->dominates(MBB, UseBlock)) return false; } + return true; } bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "******** Machine Sinking ********\n"); - + const TargetMachine &TM = MF.getTarget(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); @@ -111,19 +114,19 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { AllocatableSet = TRI->getAllocatableSet(MF); bool EverMadeChange = false; - + while (1) { bool MadeChange = false; // Process all basic blocks. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); - + // If this iteration over the code changed anything, keep iterating. 
if (!MadeChange) break; EverMadeChange = true; - } + } return EverMadeChange; } @@ -132,8 +135,8 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (MBB.succ_size() <= 1 || MBB.empty()) return false; // Don't bother sinking code out of unreachable blocks. In addition to being - // unprofitable, it can also lead to infinite looping, because in an unreachable - // loop there may be nowhere to stop. + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. if (!DT->isReachableFromEntry(&MBB)) return false; bool MadeChange = false; @@ -144,7 +147,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { bool ProcessedBegin, SawStore = false; do { MachineInstr *MI = I; // The instruction to sink. - + // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. ProcessedBegin = I == MBB.begin(); @@ -156,10 +159,10 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (SinkInstruction(MI, SawStore)) ++NumSunk, MadeChange = true; - + // If we just processed the first instruction in the block, we're done. } while (!ProcessedBegin); - + return MadeChange; } @@ -169,7 +172,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // Check if it's safe to move the instruction. if (!MI->isSafeToMove(TII, AA, SawStore)) return false; - + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. We often get // instructions sunk into the top of a large block, but it would be better to @@ -177,22 +180,22 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // be careful not to *increase* register pressure though, e.g. sinking // "x = y + z" down if it kills y and z would increase the live ranges of y // and z and only shrink the live range of x. - + // Loop over all the operands of the specified instruction. If there is // anything we can't handle, bail out. MachineBasicBlock *ParentBlock = MI->getParent(); - + // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. MachineBasicBlock *SuccToSinkTo = 0; - + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; // Ignore non-register operands. - + unsigned Reg = MO.getReg(); if (Reg == 0) continue; - + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register @@ -200,13 +203,16 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // it could get allocated to something with a def during allocation. if (!RegInfo->def_empty(Reg)) return false; + if (AllocatableSet.test(Reg)) return false; + // Check for a def among the register's aliases too. for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { unsigned AliasReg = *Alias; if (!RegInfo->def_empty(AliasReg)) return false; + if (AllocatableSet.test(AliasReg)) return false; } @@ -221,28 +227,31 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // If it's not safe to move defs of the register class, then abort. if (!TII->isSafeToMoveRegClassDefs(RegInfo->getRegClass(Reg))) return false; - + // FIXME: This picks a successor to sink into based on having one // successor that dominates all the uses. However, there are cases where // sinking can happen but where the sink point isn't a successor. 
For // example: + // // x = computation // if () {} else {} // use x - // the instruction could be sunk over the whole diamond for the + // + // the instruction could be sunk over the whole diamond for the // if/then/else (or loop, etc), allowing it to be sunk into other blocks // after that. - + // Virtual register defs can only be sunk if all their uses are in blocks // dominated by one of the successors. if (SuccToSinkTo) { // If a previous operand picked a block to sink to, then this operand // must be sinkable to the same block. - if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo)) + if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo)) return false; + continue; } - + // Otherwise, we should look at all the successors and decide which one // we should sink to. for (MachineBasicBlock::succ_iterator SI = ParentBlock->succ_begin(), @@ -252,13 +261,13 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { break; } } - + // If we couldn't find a block to sink to, ignore this instruction. if (SuccToSinkTo == 0) return false; } } - + // If there are no outputs, it must have side-effects. if (SuccToSinkTo == 0) return false; @@ -267,15 +276,26 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // landing pad is implicitly defined. if (SuccToSinkTo->isLandingPad()) return false; - + // It is not possible to sink an instruction into its own block. This can // happen with loops. if (MI->getParent() == SuccToSinkTo) return false; - - DEBUG(dbgs() << "Sink instr " << *MI); - DEBUG(dbgs() << "to block " << *SuccToSinkTo); - + + // If the instruction to move defines a dead physical register which is live + // when leaving the basic block, don't move it because it could turn into a + // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>) + for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + if (SuccToSinkTo->isLiveIn(Reg)) + return false; + } + + DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo); + // If the block has multiple predecessors, this would introduce computation on // a path where it doesn't already exist. We could split the critical edge, // but for now we just punt. @@ -305,18 +325,18 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // Otherwise we are OK with sinking along a critical edge. DEBUG(dbgs() << "Sinking along critical edge.\n"); } - - // Determine where to insert into. Skip phi nodes. + + // Determine where to insert into. Skip phi nodes. MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin(); while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI()) ++InsertPos; - + // Move the instruction. SuccToSinkTo->splice(InsertPos, ParentBlock, MI, ++MachineBasicBlock::iterator(MI)); - // Conservatively, clear any kill flags, since it's possible that - // they are no longer correct. + // Conservatively, clear any kill flags, since it's possible that they are no + // longer correct.
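// What the clearing below buys, sketched: once the instruction moves, a
// register it uses may stay live past its old kill point, so any stale
// kill markers must be dropped. A hand-rolled equivalent might look like:
static void clearUseKillsSketch(MachineInstr &MI) {
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI.getOperand(i);
    if (MO.isReg() && MO.isUse() && MO.isKill())
      MO.setIsKill(false); // liveness may now extend beyond this point
  }
}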
MI->clearKillInfo(); return true; diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 8baf01c..2297c90 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -390,7 +390,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } - if (!MBB->empty() && MBB->back().getDesc().isBarrier()) { + if (!MBB->empty() && MBB->back().getDesc().isBarrier() && + !TII->isPredicated(&MBB->back())) { report("MBB exits via unconditional fall-through but ends with a " "barrier instruction!", MBB); } diff --git a/lib/CodeGen/OptimizeExts.cpp b/lib/CodeGen/OptimizeExts.cpp index 41fc204..dcdc243 100644 --- a/lib/CodeGen/OptimizeExts.cpp +++ b/lib/CodeGen/OptimizeExts.cpp @@ -118,6 +118,26 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; } + // It's an error to translate this: + // + // %reg1025 = <sext> %reg1024 + // ... + // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4 + // + // into this: + // + // %reg1025 = <sext> %reg1024 + // ... + // %reg1027 = COPY %reg1025:4 + // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4 + // + // The problem here is that SUBREG_TO_REG is there to assert that an + // implicit zext occurs. It doesn't insert a zext instruction. If we allow + // the COPY here, it will give us the value after the <sext>, + // not the original value of %reg1024 before <sext>. + if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) + continue; + MachineBasicBlock *UseMBB = UseMI->getParent(); if (UseMBB == MBB) { // Local uses that come after the extension. @@ -165,8 +185,8 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; unsigned NewVR = MRI->createVirtualRegister(RC); BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), - TII->get(TargetOpcode::EXTRACT_SUBREG), NewVR) - .addReg(DstReg).addImm(SubIdx); + TII->get(TargetOpcode::COPY), NewVR) + .addReg(DstReg, 0, SubIdx); UseMO->setReg(NewVR); ++NumReuse; Changed = true; diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 2717d4d..1613fe2 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -107,6 +107,11 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, SrcSubIdx == 0 && DstSubIdx == 0 && TargetRegisterInfo::isVirtualRegister(MvSrcReg)) SrcMI = MRI->getVRegDef(MvSrcReg); + else if (SrcMI && SrcMI->isCopy() && + !SrcMI->getOperand(0).getSubReg() && + !SrcMI->getOperand(1).getSubReg() && + TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) + SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); if (!SrcMI) return false; diff --git a/lib/CodeGen/PBQP/HeuristicSolver.h b/lib/CodeGen/PBQP/HeuristicSolver.h index bd18b52..02938df 100644 --- a/lib/CodeGen/PBQP/HeuristicSolver.h +++ b/lib/CodeGen/PBQP/HeuristicSolver.h @@ -406,7 +406,7 @@ namespace PBQP { // Create node data objects. 
for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd(); - nItr != nEnd; ++nItr) { + nItr != nEnd; ++nItr) { nodeDataList.push_back(NodeData()); g.setNodeData(nItr, &nodeDataList.back()); } diff --git a/lib/CodeGen/PBQP/Heuristics/Briggs.h b/lib/CodeGen/PBQP/Heuristics/Briggs.h index 30d34d9..4c1ce11 100644 --- a/lib/CodeGen/PBQP/Heuristics/Briggs.h +++ b/lib/CodeGen/PBQP/Heuristics/Briggs.h @@ -18,7 +18,6 @@ #ifndef LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H #define LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H -#include "llvm/Support/Compiler.h" #include "../HeuristicSolver.h" #include "../HeuristicBase.h" @@ -267,8 +266,8 @@ namespace PBQP { if (!nd.isHeuristic) return; - EdgeData &ed ATTRIBUTE_UNUSED = getHeuristicEdgeData(eItr); - + EdgeData &ed = getHeuristicEdgeData(eItr); + (void)ed; assert(ed.isUpToDate && "Edge data is not up to date."); // Update node. diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index edbc13f..ea6b094 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,6 @@ using namespace llvm; STATISTIC(NumAtomic, "Number of atomic phis lowered"); -STATISTIC(NumSplits, "Number of critical edges split on demand"); STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; @@ -184,7 +183,6 @@ void llvm::PHIElimination::LowerAtomicPHINode( // Create a new register for the incoming PHI arguments. MachineFunction &MF = *MBB.getParent(); - const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); unsigned IncomingReg = 0; bool reusedIncoming = false; // Is IncomingReg reused from an earlier PHI? @@ -208,10 +206,12 @@ void llvm::PHIElimination::LowerAtomicPHINode( ++NumReused; DEBUG(dbgs() << "Reusing %reg" << IncomingReg << " for " << *MPhi); } else { + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC); } - TII->copyRegToReg(MBB, AfterPHIsIt, DestReg, IncomingReg, RC, RC, - MPhi->getDebugLoc()); + BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), DestReg) + .addReg(IncomingReg); } // Update live variable information if there is any. @@ -293,8 +293,8 @@ void llvm::PHIElimination::LowerAtomicPHINode( // Insert the copy. if (!reusedIncoming && IncomingReg) - TII->copyRegToReg(opBlock, InsertPos, IncomingReg, SrcReg, RC, RC, - MPhi->getDebugLoc()); + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg); // Now update live variable information if we have it. Otherwise we're done if (!LV) continue; @@ -391,57 +391,8 @@ bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF, // (not considering PHI nodes). If the register is live in to this block // anyway, we would gain nothing from splitting. if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) - SplitCriticalEdge(PreMBB, &MBB); + PreMBB->SplitCriticalEdge(&MBB, this); } } return true; } - -MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A, - MachineBasicBlock *B) { - assert(A && B && "Missing MBB end point"); - - MachineFunction *MF = A->getParent(); - - // We may need to update A's terminator, but we can't do that if AnalyzeBranch - // fails. If A uses a jump table, we won't touch it. 
- const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - MachineBasicBlock *TBB = 0, *FBB = 0; - SmallVector<MachineOperand, 4> Cond; - if (TII->AnalyzeBranch(*A, TBB, FBB, Cond)) - return NULL; - - ++NumSplits; - - MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); - MF->insert(llvm::next(MachineFunction::iterator(A)), NMBB); - DEBUG(dbgs() << "PHIElimination splitting critical edge:" - " BB#" << A->getNumber() - << " -- BB#" << NMBB->getNumber() - << " -- BB#" << B->getNumber() << '\n'); - - A->ReplaceUsesOfBlockWith(B, NMBB); - A->updateTerminator(); - - // Insert unconditional "jump B" instruction in NMBB if necessary. - NMBB->addSuccessor(B); - if (!NMBB->isLayoutSuccessor(B)) { - Cond.clear(); - MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, B, NULL, Cond); - } - - // Fix PHI nodes in B so they refer to NMBB instead of A - for (MachineBasicBlock::iterator i = B->begin(), e = B->end(); - i != e && i->isPHI(); ++i) - for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2) - if (i->getOperand(ni+1).getMBB() == A) - i->getOperand(ni+1).setMBB(NMBB); - - if (LiveVariables *LV=getAnalysisIfAvailable<LiveVariables>()) - LV->addNewBlock(NMBB, A, B); - - if (MachineDominatorTree *MDT=getAnalysisIfAvailable<MachineDominatorTree>()) - MDT->addNewBlock(NMBB, A); - - return NMBB; -} diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 5ea2941..3489db2 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -24,6 +24,11 @@ using namespace llvm; //===---------------------------------------------------------------------===// MachinePassRegistry RegisterRegAlloc::Registry; +static FunctionPass *createDefaultRegisterAllocator() { return 0; } +static RegisterRegAlloc +defaultRegAlloc("default", + "pick register allocator based on -O option", + createDefaultRegisterAllocator); //===---------------------------------------------------------------------===// /// @@ -33,8 +38,8 @@ MachinePassRegistry RegisterRegAlloc::Registry; static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, RegisterPassParser<RegisterRegAlloc> > RegAlloc("regalloc", - cl::init(&createLinearScanRegisterAllocator), - cl::desc("Register allocator to use (default=linearscan)")); + cl::init(&createDefaultRegisterAllocator), + cl::desc("Register allocator to use")); //===---------------------------------------------------------------------===// @@ -42,13 +47,22 @@ RegAlloc("regalloc", /// createRegisterAllocator - choose the appropriate register allocator. /// //===---------------------------------------------------------------------===// -FunctionPass *llvm::createRegisterAllocator() { +FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) { RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault(); - + if (!Ctor) { Ctor = RegAlloc; RegisterRegAlloc::setDefault(RegAlloc); } - - return Ctor(); + + if (Ctor != createDefaultRegisterAllocator) + return Ctor(); + + // When the 'default' allocator is requested, pick one based on OptLevel. 
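// For context before the OptLevel dispatch below: a new allocator would be
// exposed through the same registry as the "default" entry above. The name
// and factory here are hypothetical, for illustration only.
static FunctionPass *createSketchRegisterAllocator() {
  return createFastRegisterAllocator(); // stand-in body
}
static RegisterRegAlloc
sketchRegAlloc("sketch", "illustrative register allocator registration",
               createSketchRegisterAllocator);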
+ switch (OptLevel) { + case CodeGenOpt::None: + return createFastRegisterAllocator(); + default: + return createLinearScanRegisterAllocator(); + } } diff --git a/lib/CodeGen/ExactHazardRecognizer.cpp b/lib/CodeGen/PostRAHazardRecognizer.cpp index af5f289..cbde2b0 100644 --- a/lib/CodeGen/ExactHazardRecognizer.cpp +++ b/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -1,4 +1,4 @@ -//===----- ExactHazardRecognizer.cpp - hazard recognizer -------- ---------===// +//===----- PostRAHazardRecognizer.cpp - hazard recognizer -------- ---------===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "post-RA-sched" -#include "ExactHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/PostRAHazardRecognizer.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -22,10 +22,9 @@ using namespace llvm; -ExactHazardRecognizer:: -ExactHazardRecognizer(const InstrItineraryData &LItinData) : - ScheduleHazardRecognizer(), ItinData(LItinData) -{ +PostRAHazardRecognizer:: +PostRAHazardRecognizer(const InstrItineraryData &LItinData) : + ScheduleHazardRecognizer(), ItinData(LItinData) { // Determine the maximum depth of any itinerary. This determines the // depth of the scoreboard. We always make the scoreboard at least 1 // cycle deep to avoid dealing with the boundary condition. @@ -48,16 +47,16 @@ ExactHazardRecognizer(const InstrItineraryData &LItinData) : ReservedScoreboard.reset(ScoreboardDepth); RequiredScoreboard.reset(ScoreboardDepth); - DEBUG(dbgs() << "Using exact hazard recognizer: ScoreboardDepth = " + DEBUG(dbgs() << "Using post-ra hazard recognizer: ScoreboardDepth = " << ScoreboardDepth << '\n'); } -void ExactHazardRecognizer::Reset() { +void PostRAHazardRecognizer::Reset() { RequiredScoreboard.reset(); ReservedScoreboard.reset(); } -void ExactHazardRecognizer::ScoreBoard::dump() const { +void PostRAHazardRecognizer::ScoreBoard::dump() const { dbgs() << "Scoreboard:\n"; unsigned last = Depth - 1; @@ -73,7 +72,8 @@ void ExactHazardRecognizer::ScoreBoard::dump() const { } } -ExactHazardRecognizer::HazardType ExactHazardRecognizer::getHazardType(SUnit *SU) { +ScheduleHazardRecognizer::HazardType +PostRAHazardRecognizer::getHazardType(SUnit *SU) { if (ItinData.isEmpty()) return NoHazard; @@ -120,7 +120,7 @@ ExactHazardRecognizer::HazardType ExactHazardRecognizer::getHazardType(SUnit *SU return NoHazard; } -void ExactHazardRecognizer::EmitInstruction(SUnit *SU) { +void PostRAHazardRecognizer::EmitInstruction(SUnit *SU) { if (ItinData.isEmpty()) return; @@ -174,7 +174,7 @@ void ExactHazardRecognizer::EmitInstruction(SUnit *SU) { DEBUG(RequiredScoreboard.dump()); } -void ExactHazardRecognizer::AdvanceCycle() { +void PostRAHazardRecognizer::AdvanceCycle() { ReservedScoreboard[0] = 0; ReservedScoreboard.advance(); RequiredScoreboard[0] = 0; RequiredScoreboard.advance(); } diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 9714ea6..4af8e07 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -22,8 +22,6 @@ #include "AntiDepBreaker.h" #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" -#include "ExactHazardRecognizer.h" -#include "SimpleHazardRecognizer.h" #include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/Passes.h" #include 
"llvm/CodeGen/LatencyPriorityQueue.h" @@ -65,10 +63,6 @@ EnableAntiDepBreaking("break-anti-dependencies", cl::desc("Break post-RA scheduling anti-dependencies: " "\"critical\", \"all\", or \"none\""), cl::init("none"), cl::Hidden); -static cl::opt<bool> -EnablePostRAHazardAvoidance("avoid-hazards", - cl::desc("Enable exact hazard avoidance"), - cl::init(true), cl::Hidden); // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod static cl::opt<int> @@ -85,6 +79,7 @@ AntiDepBreaker::~AntiDepBreaker() { } namespace { class PostRAScheduler : public MachineFunctionPass { AliasAnalysis *AA; + const TargetInstrInfo *TII; CodeGenOpt::Level OptLevel; public: @@ -187,30 +182,9 @@ namespace { }; } -/// isSchedulingBoundary - Test if the given instruction should be -/// considered a scheduling boundary. This primarily includes labels -/// and terminators. -/// -static bool isSchedulingBoundary(const MachineInstr *MI, - const MachineFunction &MF) { - // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isLabel()) - return true; - - // Don't attempt to schedule around any instruction that defines - // a stack-oriented pointer, as it's unlikely to be profitable. This - // saves compile time, because it doesn't require every single - // stack slot reference to depend on the instruction that does the - // modification. - const TargetLowering &TLI = *MF.getTarget().getTargetLowering(); - if (MI->definesRegister(TLI.getStackPointerRegisterToSaveRestore())) - return true; - - return false; -} - bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { AA = &getAnalysis<AliasAnalysis>(); + TII = Fn.getTarget().getInstrInfo(); // Check for explicit enable/disable of post-ra scheduling. TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE; @@ -237,10 +211,10 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { const MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); const MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); - const InstrItineraryData &InstrItins = Fn.getTarget().getInstrItineraryData(); - ScheduleHazardRecognizer *HR = EnablePostRAHazardAvoidance ? - (ScheduleHazardRecognizer *)new ExactHazardRecognizer(InstrItins) : - (ScheduleHazardRecognizer *)new SimpleHazardRecognizer(); + const TargetMachine &TM = Fn.getTarget(); + const InstrItineraryData &InstrItins = TM.getInstrItineraryData(); + ScheduleHazardRecognizer *HR = + TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins); AntiDepBreaker *ADB = ((AntiDepMode == TargetSubtarget::ANTIDEP_ALL) ? (AntiDepBreaker *)new AggressiveAntiDepBreaker(Fn, CriticalPathRCs) : @@ -271,8 +245,8 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::iterator Current = MBB->end(); unsigned Count = MBB->size(), CurrentCount = Count; for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) { - MachineInstr *MI = prior(I); - if (isSchedulingBoundary(MI, Fn)) { + MachineInstr *MI = llvm::prior(I); + if (TII->isSchedulingBoundary(MI, MBB, Fn)) { Scheduler.Run(MBB, I, Current, CurrentCount); Scheduler.EmitSchedule(); Current = MI; @@ -680,15 +654,6 @@ void SchedulePostRATDList::ListScheduleTopDown() { ScheduleNodeTopDown(FoundSUnit, CurCycle); HazardRec->EmitInstruction(FoundSUnit); CycleHasInsts = true; - - // If we are using the target-specific hazards, then don't - // advance the cycle time just because we schedule a node. 
If - // the target allows it we can schedule multiple nodes in the - // same cycle. - if (!EnablePostRAHazardAvoidance) { - if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops! - ++CurCycle; - } } else { if (CycleHasInsts) { DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n'); diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp index 96e7327..fb2f909 100644 --- a/lib/CodeGen/PreAllocSplitting.cpp +++ b/lib/CodeGen/PreAllocSplitting.cpp @@ -512,9 +512,6 @@ PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI, LI->addRange(LiveRange(UseIndex, EndIndex, RetVNI)); // FIXME: Need to set kills properly for inter-block stuff. - if (RetVNI->isKill(UseIndex)) RetVNI->removeKill(UseIndex); - if (IsIntraBlock) - RetVNI->addKill(EndIndex); } else if (ContainsDefs && ContainsUses) { SmallPtrSet<MachineInstr*, 2>& BlockDefs = Defs[MBB]; SmallPtrSet<MachineInstr*, 2>& BlockUses = Uses[MBB]; @@ -556,12 +553,6 @@ PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI, NewVNs, LiveOut, Phis, false, true); LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI)); - - if (foundUse && RetVNI->isKill(StartIndex)) - RetVNI->removeKill(StartIndex); - if (IsIntraBlock) { - RetVNI->addKill(EndIndex); - } } // Memoize results so we don't have to recompute them. @@ -636,9 +627,6 @@ PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator Us for (DenseMap<MachineBasicBlock*, VNInfo*>::iterator I = IncomingVNs.begin(), E = IncomingVNs.end(); I != E; ++I) { I->second->setHasPHIKill(true); - SlotIndex KillIndex(LIs->getMBBEndIdx(I->first), true); - if (!I->second->isKill(KillIndex)) - I->second->addKill(KillIndex); } } @@ -648,8 +636,6 @@ PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator Us } else EndIndex = LIs->getMBBEndIdx(MBB); LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI)); - if (IsIntraBlock) - RetVNI->addKill(EndIndex); // Memoize results so we don't have to recompute them. if (!IsIntraBlock) @@ -691,10 +677,12 @@ void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { // If the def is a move, set the copy field. unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) + if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { if (DstReg == LI->reg) NewVN->setCopy(&*DI); - + } else if (DI->isCopyLike() && DI->getOperand(0).getReg() == LI->reg) + NewVN->setCopy(&*DI); + NewVNs[&*DI] = NewVN; } @@ -725,25 +713,6 @@ void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { VNInfo* DeadVN = NewVNs[&*DI]; LI->addRange(LiveRange(DefIdx, DefIdx.getNextSlot(), DeadVN)); - DeadVN->addKill(DefIdx); - } - - // Update kill markers. - for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); - VI != VE; ++VI) { - VNInfo* VNI = *VI; - for (unsigned i = 0, e = VNI->kills.size(); i != e; ++i) { - SlotIndex KillIdx = VNI->kills[i]; - if (KillIdx.isPHI()) - continue; - MachineInstr *KillMI = LIs->getInstructionFromIndex(KillIdx); - if (KillMI) { - MachineOperand *KillMO = KillMI->findRegisterUseOperand(CurrLI->reg); - if (KillMO) - // It could be a dead def. 
- KillMO->setIsKill(); - } - } } } @@ -773,19 +742,14 @@ void PreAllocSplitting::RenumberValno(VNInfo* VN) { VNsToCopy.push_back(OldVN); // Locate two-address redefinitions - for (VNInfo::KillSet::iterator KI = OldVN->kills.begin(), - KE = OldVN->kills.end(); KI != KE; ++KI) { - assert(!KI->isPHI() && - "VN previously reported having no PHI kills."); - MachineInstr* MI = LIs->getInstructionFromIndex(*KI); - unsigned DefIdx = MI->findRegisterDefOperandIdx(CurrLI->reg); - if (DefIdx == ~0U) continue; - if (MI->isRegTiedToUseOperand(DefIdx)) { - VNInfo* NextVN = - CurrLI->findDefinedVNInfoForRegInt(KI->getDefIndex()); - if (NextVN == OldVN) continue; + for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(CurrLI->reg), + DE = MRI->def_end(); DI != DE; ++DI) { + if (!DI->isRegTiedToUseOperand(DI.getOperandNo())) continue; + SlotIndex DefIdx = LIs->getInstructionIndex(&*DI).getDefIndex(); + VNInfo* NextVN = CurrLI->findDefinedVNInfoForRegInt(DefIdx); + if (std::find(VNsToCopy.begin(), VNsToCopy.end(), NextVN) != + VNsToCopy.end()) Stack.push_back(NextVN); - } } } @@ -836,7 +800,7 @@ void PreAllocSplitting::RenumberValno(VNInfo* VN) { if (IntervalSSMap.count(CurrLI->reg)) IntervalSSMap[NewVReg] = IntervalSSMap[CurrLI->reg]; - NumRenumbers++; + ++NumRenumbers; } bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo, @@ -854,7 +818,7 @@ bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo, if (KillPt == DefMI->getParent()->end()) return false; - TII->reMaterialize(MBB, RestorePt, VReg, 0, DefMI, TRI); + TII->reMaterialize(MBB, RestorePt, VReg, 0, DefMI, *TRI); SlotIndex RematIdx = LIs->InsertMachineInstrInMaps(prior(RestorePt)); ReconstructLiveInterval(CurrLI); @@ -899,12 +863,11 @@ MachineInstr* PreAllocSplitting::FoldSpill(unsigned vreg, SS = MFI->CreateSpillStackObject(RC->getSize(), RC->getAlignment()); } - MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(), - FoldPt, Ops, SS); + MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS); if (FMI) { LIs->ReplaceMachineInstrInMaps(FoldPt, FMI); - FMI = MBB->insert(MBB->erase(FoldPt), FMI); + FoldPt->eraseFromParent(); ++NumFolds; IntervalSSMap[vreg] = SS; @@ -980,12 +943,11 @@ MachineInstr* PreAllocSplitting::FoldRestore(unsigned vreg, if (!TII->canFoldMemoryOperand(FoldPt, Ops)) return 0; - MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(), - FoldPt, Ops, SS); + MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS); if (FMI) { LIs->ReplaceMachineInstrInMaps(FoldPt, FMI); - FMI = MBB->insert(MBB->erase(FoldPt), FMI); + FoldPt->eraseFromParent(); ++NumRestoreFolds; } @@ -1192,7 +1154,7 @@ unsigned PreAllocSplitting::getNumberOfNonSpills( int StoreFrameIndex; unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex); if (StoreVReg != Reg || StoreFrameIndex != FrameIndex) - NonSpills++; + ++NonSpills; int DefIdx = (*UI)->findRegisterDefOperandIdx(Reg); if (DefIdx != -1 && (*UI)->isRegTiedToUseOperand(DefIdx)) @@ -1255,7 +1217,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) { (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); VNUseCount.erase(CurrVN); - NumDeadSpills++; + ++NumDeadSpills; changed = true; continue; } @@ -1291,9 +1253,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) { Ops.push_back(OpIdx); if (!TII->canFoldMemoryOperand(use, Ops)) continue; - MachineInstr* NewMI = - TII->foldMemoryOperand(*use->getParent()->getParent(), - use, Ops, FrameIndex); + MachineInstr* NewMI = 
TII->foldMemoryOperand(use, Ops, FrameIndex); if (!NewMI) continue; @@ -1303,10 +1263,9 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) { (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); - MachineBasicBlock* MBB = use->getParent(); - NewMI = MBB->insert(MBB->erase(use), NewMI); + use->eraseFromParent(); VNUseCount[CurrVN].erase(use); - + // Remove deleted instructions. Note that we need to remove them from // the VNInfo->use map as well, just to be safe. for (SmallPtrSet<MachineInstr*, 4>::iterator II = @@ -1328,7 +1287,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) { if (VI->second.erase(use)) VI->second.insert(NewMI); - NumDeadSpills++; + ++NumDeadSpills; changed = true; continue; } @@ -1350,7 +1309,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet<LiveInterval*, 8>& split) { LIs->RemoveMachineInstrFromMaps(DefMI); (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); - NumDeadSpills++; + ++NumDeadSpills; changed = true; } } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 62f525f..ca4c477 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -46,14 +46,14 @@ bool ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI, const TargetInstrInfo *tii_) { unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - Reg == SrcReg && SrcSubReg == 0 && DstSubReg == 0) + Reg == SrcReg && DstSubReg == 0) return true; - if (OpIdx == 2 && MI->isSubregToReg()) - return true; - if (OpIdx == 1 && MI->isExtractSubreg()) - return true; - return false; + switch(OpIdx) { + case 1: return MI->isCopy() && MI->getOperand(0).getSubReg() == 0; + case 2: return MI->isSubregToReg() && MI->getOperand(0).getSubReg() == 0; + default: return false; + } } /// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure @@ -101,11 +101,10 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { continue; } - if (MI->isInsertSubreg()) { - MachineOperand &MO = MI->getOperand(2); + // Eliminate %reg1032:sub<def> = COPY undef. + if (MI->isCopy() && MI->getOperand(0).getSubReg()) { + MachineOperand &MO = MI->getOperand(1); if (ImpDefRegs.count(MO.getReg())) { - // %reg1032<def> = INSERT_SUBREG %reg1032, undef, 2 - // This is an identity copy, eliminate it now. if (MO.isKill()) { LiveVariables::VarInfo& vi = lv_->getVarInfo(MO.getReg()); vi.removeKill(MI); @@ -119,7 +118,7 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { bool ChangedToImpDef = false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) + if (!MO.isReg() || (MO.isDef() && !MO.getSubReg()) || MO.isUndef()) continue; unsigned Reg = MO.getReg(); if (!Reg) @@ -144,6 +143,12 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { Changed = true; MO.setIsUndef(); + // This is a partial register redef of an implicit def. + // Make sure the whole register is defined by the instruction. + if (MO.isDef()) { + MI->addRegisterDefined(Reg); + continue; + } if (MO.isKill() || MI->isRegTiedToDefOperand(i)) { // Make sure other uses of for (unsigned j = i+1; j != e; ++j) { @@ -219,8 +224,10 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { // Turn a copy use into an implicit_def. 
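// The test used below, restated as a self-contained sketch: a use of Reg
// counts as a plain full-register copy either via the new COPY form or via
// the target's legacy move hook.
static bool isPlainCopyOfSketch(const MachineInstr *MI, unsigned Reg,
                                const TargetInstrInfo *TII) {
  if (MI->isCopy())
    return MI->getOperand(1).getReg() == Reg &&
           MI->getOperand(0).getSubReg() == 0;
  unsigned Src, Dst, SrcSub, DstSub;
  return TII->isMoveInstr(*MI, Src, Dst, SrcSub, DstSub) &&
         Src == Reg && DstSub == 0;
}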
unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*RMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - Reg == SrcReg && SrcSubReg == 0 && DstSubReg == 0) { + if ((RMI->isCopy() && RMI->getOperand(1).getReg() == Reg && + RMI->getOperand(0).getSubReg() == 0) || + (tii_->isMoveInstr(*RMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && + Reg == SrcReg && DstSubReg == 0)) { RMI->setDesc(tii_->get(TargetOpcode::IMPLICIT_DEF)); bool isKill = false; diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index e778024..3843b25 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -158,9 +158,9 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) { AdjustsStack = true; FrameSDOps.push_back(I); } else if (I->isInlineAsm()) { - // An InlineAsm might be a call; assume it is to get the stack frame - // aligned correctly for calls. - AdjustsStack = true; + // Some inline asm's need a stack frame, as indicated by operand 1. + if (I->getOperand(1).getImm()) + AdjustsStack = true; } MFI->setAdjustsStack(AdjustsStack); @@ -202,22 +202,17 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { if (Fn.getFunction()->hasFnAttr(Attribute::Naked)) return; - // Figure out which *callee saved* registers are modified by the current - // function, thus needing to be saved and restored in the prolog/epilog. - const TargetRegisterClass * const *CSRegClasses = - RegInfo->getCalleeSavedRegClasses(&Fn); - std::vector<CalleeSavedInfo> CSI; for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; if (Fn.getRegInfo().isPhysRegUsed(Reg)) { // If the reg is modified, save it! - CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i])); + CSI.push_back(CalleeSavedInfo(Reg)); } else { for (const unsigned *AliasSet = RegInfo->getAliasSet(Reg); *AliasSet; ++AliasSet) { // Check alias registers too. if (Fn.getRegInfo().isPhysRegUsed(*AliasSet)) { - CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i])); + CSI.push_back(CalleeSavedInfo(Reg)); break; } } @@ -236,7 +231,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { unsigned Reg = I->getReg(); - const TargetRegisterClass *RC = I->getRegClass(); + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); int FrameIdx; if (RegInfo->hasReservedSpillSlot(Fn, Reg, FrameIdx)) { @@ -265,8 +260,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. - FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, - true, false); + FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true); } I->setFrameIdx(FrameIdx); @@ -303,8 +297,10 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { EntryBlock->addLiveIn(CSI[i].getReg()); // Insert the spill to the stack frame. - TII.storeRegToStackSlot(*EntryBlock, I, CSI[i].getReg(), true, - CSI[i].getFrameIdx(), CSI[i].getRegClass(),TRI); + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, + CSI[i].getFrameIdx(), RC, TRI); } } @@ -328,9 +324,11 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { // terminators that preceed it. 
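//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// CalleeSavedInfo no longer carries a TargetRegisterClass, so each spill and
// restore site below recomputes one from the physical register itself.
// getMinimalPhysRegClass(Reg) returns the smallest register class containing
// Reg, which is always legal for a plain spill or reload of that register:
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(CSI[i].getReg());
assert(RC->contains(CSI[i].getReg()) && "minimal class contains its register");
// (Reuses the surrounding TRI/CSI names; the assert is only a sanity sketch.)
//===----------------------------------------------------------------------===//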
if (!TII.restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - TII.loadRegFromStackSlot(*MBB, I, CSI[i].getReg(), + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), - CSI[i].getRegClass(), TRI); + RC, TRI); assert(I != MBB->begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert @@ -374,10 +372,12 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { MBB->addLiveIn(blockCSI[i].getReg()); // Insert the spill to the stack frame. - TII.storeRegToStackSlot(*MBB, I, blockCSI[i].getReg(), + unsigned Reg = blockCSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*MBB, I, Reg, true, blockCSI[i].getFrameIdx(), - blockCSI[i].getRegClass(), TRI); + RC, TRI); } } @@ -423,9 +423,11 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { // Restore all registers immediately before the return and any // terminators that preceed it. for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) { - TII.loadRegFromStackSlot(*MBB, I, blockCSI[i].getReg(), + unsigned Reg = blockCSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, blockCSI[i].getFrameIdx(), - blockCSI[i].getRegClass(), TRI); + RC, TRI); assert(I != MBB->begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert @@ -639,6 +641,9 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { +#ifndef NDEBUG + int SPAdjCount = 0; // frame setup / destroy count. +#endif int SPAdj = 0; // SP offset due to call frame setup / destroy. if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); @@ -646,6 +651,10 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { if (I->getOpcode() == FrameSetupOpcode || I->getOpcode() == FrameDestroyOpcode) { +#ifndef NDEBUG + // Track whether we see even pairs of them + SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1; +#endif // Remember how much SP has been adjusted to create the call // frame. int Size = I->getOperand(0).getImm(); @@ -712,7 +721,13 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI); } - assert(SPAdj == 0 && "Unbalanced call frame setup / destroy pairs?"); + // If we have evenly matched pairs of frame setup / destroy instructions, + // make sure the adjustments come out to zero. If we don't have matched + // pairs, we can't be sure the missing bit isn't in another basic block + // due to a custom inserter playing tricks, so just asserting SPAdj==0 + // isn't sufficient. See tMOVCC on Thumb1, for example. + assert((SPAdjCount || SPAdj == 0) && + "Unbalanced call frame setup / destroy pairs?"); } } @@ -870,11 +885,7 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Scavenge a new scratch register CurrentVirtReg = Reg; const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); - CurrentScratchReg = RS->FindUnusedReg(RC); - if (CurrentScratchReg == 0) - // No register is "free". Scavenge a register. 
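//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// RegScavenger::scavengeRegister already returns a register that is simply
// unused at the insertion point when one exists, and only spills as a last
// resort, so the FindUnusedReg pre-check deleted here was redundant:
unsigned Scratch = RS->scavengeRegister(RC, I, SPAdj); // spills only if needed
//===----------------------------------------------------------------------===//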
- CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj); - + CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj); PrevValue = Value; } // replace this reference to the virtual register with the diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index b3b5760..f44478e 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -110,6 +110,11 @@ namespace { // Allocatable - vector of allocatable physical registers. BitVector Allocatable; + // SkippedInstrs - Descriptors of instructions whose clobber list was ignored + // because all registers were spilled. It is still necessary to mark all the + // clobbered registers as used by the function. + SmallPtrSet<const TargetInstrDesc*, 4> SkippedInstrs; + // isBulkSpilling - This flag is set when LiveRegMap will be cleared // completely after spilling all live registers. LiveRegMap entries should // not be erased. @@ -135,6 +140,8 @@ namespace { private: bool runOnMachineFunction(MachineFunction &Fn); void AllocateBasicBlock(); + void handleThroughOperands(MachineInstr *MI, + SmallVectorImpl<unsigned> &VirtDead); int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC); bool isLastUseOfLocalReg(MachineOperand&); @@ -508,27 +515,20 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum, bool New; tie(LRI, New) = LiveVirtRegs.insert(std::make_pair(VirtReg, LiveReg())); LiveReg &LR = LRI->second; - bool PartialRedef = MI->getOperand(OpNum).getSubReg(); if (New) { // If there is no hint, peek at the only use of this register. if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) && MRI->hasOneNonDBGUse(VirtReg)) { + const MachineInstr &UseMI = *MRI->use_nodbg_begin(VirtReg); unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; // It's a copy, use the destination register as a hint. - if (TII->isMoveInstr(*MRI->use_nodbg_begin(VirtReg), - SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (UseMI.isCopyLike()) + Hint = UseMI.getOperand(0).getReg(); + else if (TII->isMoveInstr(UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) Hint = DstReg; } allocVirtReg(MI, *LRI, Hint); - // If this is only a partial redefinition, we must reload the other parts. - if (PartialRedef && MI->readsVirtualRegister(VirtReg)) { - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); - int FI = getStackSpaceFor(VirtReg, RC); - DEBUG(dbgs() << "Reloading for partial redef: %reg" << VirtReg << "\n"); - TII->loadRegFromStackSlot(*MBB, MI, LR.PhysReg, FI, RC, TRI); - ++NumLoads; - } - } else if (LR.LastUse && !PartialRedef) { + } else if (LR.LastUse) { // Redefining a live register - kill at the last use, unless it is this // instruction defining VirtReg multiple times. if (LR.LastUse != MI || LR.LastUse->getOperand(LR.LastOpNum).isUse()) @@ -564,10 +564,16 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, } else if (LR.Dirty) { if (isLastUseOfLocalReg(MO)) { DEBUG(dbgs() << "Killing last use: " << MO << "\n"); - MO.setIsKill(); + if (MO.isUse()) + MO.setIsKill(); + else + MO.setIsDead(); } else if (MO.isKill()) { DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n"); MO.setIsKill(false); + } else if (MO.isDead()) { + DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n"); + MO.setIsDead(false); } } else if (MO.isKill()) { // We must remove kill flags from uses of reloaded registers because the @@ -576,6 +582,9 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, // This would cause a second reload of %x into a different register. 
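//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// The dead-flag handling added below mirrors the existing kill-flag rule: a
// stale <kill> on a use would free the physreg while a second use of the same
// value in this instruction still needs it, and a stale <dead> on a def would
// discard a value the reload path still tracks. Both clears are guarded,
// since kill flags only appear on uses and dead flags only on defs
// (clearStaleFlags is an assumed helper name, not an LLVM API):
static void clearStaleFlags(MachineOperand &MO) {
  if (MO.isKill()) MO.setIsKill(false); // use no longer ends the live range
  if (MO.isDead()) MO.setIsDead(false); // def's value is still wanted
}
//===----------------------------------------------------------------------===//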
DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n"); MO.setIsKill(false); + } else if (MO.isDead()) { + DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n"); + MO.setIsDead(false); } assert(LR.PhysReg && "Register not assigned"); LR.LastUse = MI; @@ -607,6 +616,91 @@ bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { return MO.isDead(); } +// Handle special instruction operand like early clobbers and tied ops when +// there are additional physreg defines. +void RAFast::handleThroughOperands(MachineInstr *MI, + SmallVectorImpl<unsigned> &VirtDead) { + DEBUG(dbgs() << "Scanning for through registers:"); + SmallSet<unsigned, 8> ThroughRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) || + (MO.getSubReg() && MI->readsVirtualRegister(Reg))) { + if (ThroughRegs.insert(Reg)) + DEBUG(dbgs() << " %reg" << Reg); + } + } + + // If any physreg defines collide with preallocated through registers, + // we must spill and reallocate. + DEBUG(dbgs() << "\nChecking for physdef collisions.\n"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + if (ThroughRegs.count(PhysRegState[Reg])) + definePhysReg(MI, Reg, regFree); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) { + UsedInInstr.set(*AS); + if (ThroughRegs.count(PhysRegState[*AS])) + definePhysReg(MI, *AS, regFree); + } + } + + SmallVector<unsigned, 8> PartialDefs; + DEBUG(dbgs() << "Allocating tied uses and early clobbers.\n"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + if (MO.isUse()) { + unsigned DefIdx = 0; + if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue; + DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand " + << DefIdx << ".\n"); + LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0); + unsigned PhysReg = LRI->second.PhysReg; + setPhysReg(MI, i, PhysReg); + // Note: we don't update the def operand yet. That would cause the normal + // def-scan to attempt spilling. + } else if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) { + DEBUG(dbgs() << "Partial redefine: " << MO << "\n"); + // Reload the register, but don't assign to the operand just yet. + // That would confuse the later phys-def processing pass. + LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0); + PartialDefs.push_back(LRI->second.PhysReg); + } else if (MO.isEarlyClobber()) { + // Note: defineVirtReg may invalidate MO. + LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); + unsigned PhysReg = LRI->second.PhysReg; + if (setPhysReg(MI, i, PhysReg)) + VirtDead.push_back(Reg); + } + } + + // Restore UsedInInstr to a state usable for allocating normal virtual uses. 
+ UsedInInstr.reset(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + UsedInInstr.set(*AS); + } + + // Also mark PartialDefs as used to avoid reallocation. + for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i) + UsedInInstr.set(PartialDefs[i]); +} + void RAFast::AllocateBasicBlock() { DEBUG(dbgs() << "\nAllocating " << *MBB); @@ -620,7 +714,7 @@ void RAFast::AllocateBasicBlock() { E = MBB->livein_end(); I != E; ++I) definePhysReg(MII, *I, regReserved); - SmallVector<unsigned, 8> PhysECs, VirtDead; + SmallVector<unsigned, 8> VirtDead; SmallVector<MachineInstr*, 32> Coalesced; // Otherwise, sequentially allocate each instruction in the MBB. @@ -670,8 +764,25 @@ void RAFast::AllocateBasicBlock() { LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg); if (LRI != LiveVirtRegs.end()) setPhysReg(MI, i, LRI->second.PhysReg); - else - MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + else { + int SS = StackSlotForVirtReg[Reg]; + if (SS == -1) + MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + else { + // Modify DBG_VALUE now that the value is in a spill slot. + uint64_t Offset = MI->getOperand(1).getImm(); + const MDNode *MDPtr = + MI->getOperand(MI->getNumOperands()-1).getMetadata(); + DebugLoc DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = + TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) { + DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insert(MBB->erase(MI), NewDV); + } else + MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + } + } } // Next instruction. continue; @@ -679,17 +790,25 @@ void RAFast::AllocateBasicBlock() { // If this is a copy, we may be able to coalesce. unsigned CopySrc, CopyDst, CopySrcSub, CopyDstSub; - if (!TII->isMoveInstr(*MI, CopySrc, CopyDst, CopySrcSub, CopyDstSub)) + if (MI->isCopy()) { + CopyDst = MI->getOperand(0).getReg(); + CopySrc = MI->getOperand(1).getReg(); + CopyDstSub = MI->getOperand(0).getSubReg(); + CopySrcSub = MI->getOperand(1).getSubReg(); + } else if (!TII->isMoveInstr(*MI, CopySrc, CopyDst, CopySrcSub, CopyDstSub)) CopySrc = CopyDst = 0; // Track registers used by instruction. UsedInInstr.reset(); - PhysECs.clear(); // First scan. // Mark physreg uses and early clobbers as used. // Find the end of the virtreg operands unsigned VirtOpEnd = 0; + bool hasTiedOps = false; + bool hasEarlyClobbers = false; + bool hasPartialRedefs = false; + bool hasPhysDefs = false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -697,20 +816,44 @@ void RAFast::AllocateBasicBlock() { if (!Reg) continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) { VirtOpEnd = i+1; + if (MO.isUse()) { + hasTiedOps = hasTiedOps || + TID.getOperandConstraint(i, TOI::TIED_TO) != -1; + } else { + if (MO.isEarlyClobber()) + hasEarlyClobbers = true; + if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) + hasPartialRedefs = true; + } continue; } if (!Allocatable.test(Reg)) continue; if (MO.isUse()) { usePhysReg(MO); } else if (MO.isEarlyClobber()) { - definePhysReg(MI, Reg, MO.isDead() ? 
regFree : regReserved); - PhysECs.push_back(Reg); - } + definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ? + regFree : regReserved); + hasEarlyClobbers = true; + } else + hasPhysDefs = true; + } + + // The instruction may have virtual register operands that must be allocated + // the same register at use-time and def-time: early clobbers and tied + // operands. If there are also physical defs, these registers must avoid + // both physical defs and uses, making them more constrained than normal + // operands. + // We didn't detect inline asm tied operands above, so just make this extra + // pass for all inline asm. + if (MI->isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || + (hasTiedOps && hasPhysDefs)) { + handleThroughOperands(MI, VirtDead); + // Don't attempt coalescing when we have funny stuff going on. + CopyDst = 0; } // Second scan. - // Allocate virtreg uses and early clobbers. - // Collect VirtKills + // Allocate virtreg uses. for (unsigned i = 0; i != VirtOpEnd; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -722,12 +865,6 @@ void RAFast::AllocateBasicBlock() { CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0; if (setPhysReg(MI, i, PhysReg)) killVirtReg(LRI); - } else if (MO.isEarlyClobber()) { - // Note: defineVirtReg may invalidate MO. - LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); - unsigned PhysReg = LRI->second.PhysReg; - setPhysReg(MI, i, PhysReg); - PhysECs.push_back(PhysReg); } } @@ -735,12 +872,16 @@ void RAFast::AllocateBasicBlock() { // Track registers defined by instruction - early clobbers at this point. UsedInInstr.reset(); - for (unsigned i = 0, e = PhysECs.size(); i != e; ++i) { - unsigned PhysReg = PhysECs[i]; - UsedInInstr.set(PhysReg); - for (const unsigned *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) - UsedInInstr.set(Alias); + if (hasEarlyClobbers) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + UsedInInstr.set(*AS); + } } unsigned DefOpEnd = MI->getNumOperands(); @@ -752,13 +893,18 @@ void RAFast::AllocateBasicBlock() { DefOpEnd = VirtOpEnd; DEBUG(dbgs() << " Spilling remaining registers before call.\n"); spillAll(MI); + + // The imp-defs are skipped below, but we still need to mark those + // registers as used by the function. + SkippedInstrs.insert(&TID); } // Third scan. // Allocate defs and collect dead defs. for (unsigned i = 0; i != DefOpEnd; ++i) { MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.getReg()) continue; + if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) + continue; unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { @@ -837,6 +983,14 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { // Make sure the set of used physregs is closed under subreg operations. MRI->closePhysRegsUsed(*TRI); + // Add the clobber lists for all the instructions we skipped earlier. 
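//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// When spillAll() handled a call above, the def scan was cut short, so the
// call's implicit defs (its clobber list) never reached setPhysRegUsed. They
// are re-added from the stashed TargetInstrDescs because PrologEpilogInserter
// later decides callee-saved spills by querying, in effect:
//   bool mustSave = Fn.getRegInfo().isPhysRegUsed(SomeCalleeSavedReg);
// (SomeCalleeSavedReg is a placeholder name, not an LLVM API.)
//===----------------------------------------------------------------------===//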
+ for (SmallPtrSet<const TargetInstrDesc*, 4>::const_iterator + I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I) + if (const unsigned *Defs = (*I)->getImplicitDefs()) + while (*Defs) + MRI->setPhysRegUsed(*Defs++); + + SkippedInstrs.clear(); StackSlotForVirtReg.clear(); return true; } diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp index bc331f0..044672d 100644 --- a/lib/CodeGen/RegAllocLinearScan.cpp +++ b/lib/CodeGen/RegAllocLinearScan.cpp @@ -83,7 +83,8 @@ namespace { // pressure, it can caused fewer GPRs to be held in the queue. static cl::opt<unsigned> NumRecentlyUsedRegs("linearscan-skip-count", - cl::desc("Number of registers for linearscan to remember to skip."), + cl::desc("Number of registers for linearscan to remember" + "to skip."), cl::init(0), cl::Hidden); @@ -421,9 +422,10 @@ unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) { unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; if (vni->def != SlotIndex() && vni->isDefAccurate() && (CopyMI = li_->getInstructionFromIndex(vni->def)) && - tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) + (CopyMI->isCopy() || + tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))) // Defined by a copy, try to extend SrcReg forward - CandReg = SrcReg; + CandReg = CopyMI->isCopy() ? CopyMI->getOperand(1).getReg() : SrcReg; else if (TrivCoalesceEnds && (CopyMI = li_->getInstructionFromIndex(range.end.getBaseIndex())) && @@ -992,6 +994,24 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if (Reg && allocatableRegs_[Reg] && RC->contains(Reg)) mri_->setRegAllocationHint(cur->reg, 0, Reg); } + } else if (CopyMI && CopyMI->isCopy()) { + DstReg = CopyMI->getOperand(0).getReg(); + DstSubReg = CopyMI->getOperand(0).getSubReg(); + SrcReg = CopyMI->getOperand(1).getReg(); + SrcSubReg = CopyMI->getOperand(1).getSubReg(); + unsigned Reg = 0; + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) + Reg = SrcReg; + else if (vrm_->isAssignedReg(SrcReg)) + Reg = vrm_->getPhys(SrcReg); + if (Reg) { + if (SrcSubReg) + Reg = tri_->getSubReg(Reg, SrcSubReg); + if (DstSubReg) + Reg = tri_->getMatchingSuperReg(Reg, DstSubReg, RC); + if (Reg && allocatableRegs_[Reg] && RC->contains(Reg)) + mri_->setRegAllocationHint(cur->reg, 0, Reg); + } } } } @@ -1206,8 +1226,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { DEBUG(dbgs() << "\t\t\tspilling(c): " << *cur << '\n'); SmallVector<LiveInterval*, 8> spillIs; std::vector<LiveInterval*> added; - - added = spiller_->spill(cur, spillIs); + spiller_->spill(cur, added, spillIs); std::sort(added.begin(), added.end(), LISorter()); addStackInterval(cur, ls_, li_, mri_, *vrm_); @@ -1285,10 +1304,8 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if (sli->beginIndex() < earliestStart) earliestStart = sli->beginIndex(); - std::vector<LiveInterval*> newIs; - newIs = spiller_->spill(sli, spillIs, &earliestStart); + spiller_->spill(sli, added, spillIs, &earliestStart); addStackInterval(sli, ls_, li_, mri_, *vrm_); - std::copy(newIs.begin(), newIs.end(), std::back_inserter(added)); spilled.insert(sli->reg); } diff --git a/lib/CodeGen/RegAllocLocal.cpp b/lib/CodeGen/RegAllocLocal.cpp deleted file mode 100644 index 321ae12..0000000 --- a/lib/CodeGen/RegAllocLocal.cpp +++ /dev/null @@ -1,1254 +0,0 @@ -//===-- RegAllocLocal.cpp - A BasicBlock generic register allocator -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University 
of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This register allocator allocates registers to a basic block at a time, -// attempting to keep values in registers and reusing registers as appropriate. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "regalloc" -#include "llvm/BasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/IndexedMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" -#include <algorithm> -using namespace llvm; - -STATISTIC(NumStores, "Number of stores added"); -STATISTIC(NumLoads , "Number of loads added"); -STATISTIC(NumCopies, "Number of copies coalesced"); - -static RegisterRegAlloc - localRegAlloc("local", "local register allocator", - createLocalRegisterAllocator); - -namespace { - class RALocal : public MachineFunctionPass { - public: - static char ID; - RALocal() : MachineFunctionPass(&ID), StackSlotForVirtReg(-1) {} - private: - const TargetMachine *TM; - MachineFunction *MF; - MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - - // StackSlotForVirtReg - Maps virtual regs to the frame index where these - // values are spilled. - IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg; - - // Virt2PhysRegMap - This map contains entries for each virtual register - // that is currently available in a physical register. - IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2PhysRegMap; - - unsigned &getVirt2PhysRegMapSlot(unsigned VirtReg) { - return Virt2PhysRegMap[VirtReg]; - } - - // PhysRegsUsed - This array is effectively a map, containing entries for - // each physical register that currently has a value (ie, it is in - // Virt2PhysRegMap). The value mapped to is the virtual register - // corresponding to the physical register (the inverse of the - // Virt2PhysRegMap), or 0. The value is set to 0 if this register is pinned - // because it is used by a future instruction, and to -2 if it is not - // allocatable. If the entry for a physical register is -1, then the - // physical register is "not in the map". - // - std::vector<int> PhysRegsUsed; - - // PhysRegsUseOrder - This contains a list of the physical registers that - // currently have a virtual register value in them. This list provides an - // ordering of registers, imposing a reallocation order. This list is only - // used if all registers are allocated and we have to spill one, in which - // case we spill the least recently used register. Entries at the front of - // the list are the least recently used registers, entries at the back are - // the most recently used. - // - std::vector<unsigned> PhysRegsUseOrder; - - // Virt2LastUseMap - This maps each virtual register to its last use - // (MachineInstr*, operand index pair). 
- IndexedMap<std::pair<MachineInstr*, unsigned>, VirtReg2IndexFunctor> - Virt2LastUseMap; - - std::pair<MachineInstr*,unsigned>& getVirtRegLastUse(unsigned Reg) { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - return Virt2LastUseMap[Reg]; - } - - // VirtRegModified - This bitset contains information about which virtual - // registers need to be spilled back to memory when their registers are - // scavenged. If a virtual register has simply been rematerialized, there - // is no reason to spill it to memory when we need the register back. - // - BitVector VirtRegModified; - - // UsedInMultipleBlocks - Tracks whether a particular register is used in - // more than one block. - BitVector UsedInMultipleBlocks; - - void markVirtRegModified(unsigned Reg, bool Val = true) { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - Reg -= TargetRegisterInfo::FirstVirtualRegister; - if (Val) - VirtRegModified.set(Reg); - else - VirtRegModified.reset(Reg); - } - - bool isVirtRegModified(unsigned Reg) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - assert(Reg - TargetRegisterInfo::FirstVirtualRegister < - VirtRegModified.size() && "Illegal virtual register!"); - return VirtRegModified[Reg - TargetRegisterInfo::FirstVirtualRegister]; - } - - void AddToPhysRegsUseOrder(unsigned Reg) { - std::vector<unsigned>::iterator It = - std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), Reg); - if (It != PhysRegsUseOrder.end()) - PhysRegsUseOrder.erase(It); - PhysRegsUseOrder.push_back(Reg); - } - - void MarkPhysRegRecentlyUsed(unsigned Reg) { - if (PhysRegsUseOrder.empty() || - PhysRegsUseOrder.back() == Reg) return; // Already most recently used - - for (unsigned i = PhysRegsUseOrder.size(); i != 0; --i) { - unsigned RegMatch = PhysRegsUseOrder[i-1]; // remove from middle - if (!areRegsEqual(Reg, RegMatch)) continue; - - PhysRegsUseOrder.erase(PhysRegsUseOrder.begin()+i-1); - // Add it to the end of the list - PhysRegsUseOrder.push_back(RegMatch); - if (RegMatch == Reg) - return; // Found an exact match, exit early - } - } - - public: - virtual const char *getPassName() const { - return "Local Register Allocator"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequiredID(PHIEliminationID); - AU.addRequiredID(TwoAddressInstructionPassID); - MachineFunctionPass::getAnalysisUsage(AU); - } - - private: - /// runOnMachineFunction - Register allocate the whole function - bool runOnMachineFunction(MachineFunction &Fn); - - /// AllocateBasicBlock - Register allocate the specified basic block. - void AllocateBasicBlock(MachineBasicBlock &MBB); - - - /// areRegsEqual - This method returns true if the specified registers are - /// related to each other. To do this, it checks to see if they are equal - /// or if the first register is in the alias set of the second register. - /// - bool areRegsEqual(unsigned R1, unsigned R2) const { - if (R1 == R2) return true; - for (const unsigned *AliasSet = TRI->getAliasSet(R2); - *AliasSet; ++AliasSet) { - if (*AliasSet == R1) return true; - } - return false; - } - - /// getStackSpaceFor - This returns the frame index of the specified virtual - /// register on the stack, allocating space if necessary. - int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC); - - /// removePhysReg - This method marks the specified physical register as no - /// longer being in use. 
- /// - void removePhysReg(unsigned PhysReg); - - void storeVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg, bool isKill); - - /// spillVirtReg - This method spills the value specified by PhysReg into - /// the virtual register slot specified by VirtReg. It then updates the RA - /// data structures to indicate the fact that PhysReg is now available. - /// - void spillVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned VirtReg, unsigned PhysReg); - - /// spillPhysReg - This method spills the specified physical register into - /// the virtual register slot associated with it. If OnlyVirtRegs is set to - /// true, then the request is ignored if the physical register does not - /// contain a virtual register. - /// - void spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned PhysReg, bool OnlyVirtRegs = false); - - /// assignVirtToPhysReg - This method updates local state so that we know - /// that PhysReg is the proper container for VirtReg now. The physical - /// register must not be used for anything else when this is called. - /// - void assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg); - - /// isPhysRegAvailable - Return true if the specified physical register is - /// free and available for use. This also includes checking to see if - /// aliased registers are all free... - /// - bool isPhysRegAvailable(unsigned PhysReg) const; - - /// getFreeReg - Look to see if there is a free register available in the - /// specified register class. If not, return 0. - /// - unsigned getFreeReg(const TargetRegisterClass *RC); - - /// getReg - Find a physical register to hold the specified virtual - /// register. If all compatible physical registers are used, this method - /// spills the last used virtual register to the stack, and uses that - /// register. If NoFree is true, that means the caller knows there isn't - /// a free register, do not call getFreeReg(). - unsigned getReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned VirtReg, bool NoFree = false); - - /// reloadVirtReg - This method transforms the specified virtual - /// register use to refer to a physical register. This method may do this - /// in one of several ways: if the register is available in a physical - /// register already, it uses that physical register. If the value is not - /// in a physical register, and if there are physical registers available, - /// it loads it into a register: PhysReg if that is an available physical - /// register, otherwise any physical register of the right class. - /// If register pressure is high, and it is possible, it tries to fold the - /// load of the virtual register into the instruction itself. It avoids - /// doing this if register pressure is low to improve the chance that - /// subsequent instructions can use the reloaded value. This method - /// returns the modified instruction. - /// - MachineInstr *reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned OpNum, SmallSet<unsigned, 4> &RRegs, - unsigned PhysReg); - - /// ComputeLocalLiveness - Computes liveness of registers within a basic - /// block, setting the killed/dead flags as appropriate. - void ComputeLocalLiveness(MachineBasicBlock& MBB); - - void reloadPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I, - unsigned PhysReg); - }; - char RALocal::ID = 0; -} - -/// getStackSpaceFor - This allocates space for the specified virtual register -/// to be held on the stack. 
-int RALocal::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { - // Find the location Reg would belong... - int SS = StackSlotForVirtReg[VirtReg]; - if (SS != -1) - return SS; // Already has space allocated? - - // Allocate a new stack object for this spill location... - int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); - - // Assign the slot. - StackSlotForVirtReg[VirtReg] = FrameIdx; - return FrameIdx; -} - - -/// removePhysReg - This method marks the specified physical register as no -/// longer being in use. -/// -void RALocal::removePhysReg(unsigned PhysReg) { - PhysRegsUsed[PhysReg] = -1; // PhyReg no longer used - - std::vector<unsigned>::iterator It = - std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), PhysReg); - if (It != PhysRegsUseOrder.end()) - PhysRegsUseOrder.erase(It); -} - -/// storeVirtReg - Store a virtual register to its assigned stack slot. -void RALocal::storeVirtReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg, - bool isKill) { - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - int FrameIndex = getStackSpaceFor(VirtReg, RC); - DEBUG(dbgs() << " to stack slot #" << FrameIndex); - TII->storeRegToStackSlot(MBB, I, PhysReg, isKill, FrameIndex, RC, TRI); - ++NumStores; // Update statistics - - // Mark the spill instruction as last use if we're not killing the register. - if (!isKill) { - MachineInstr *Spill = llvm::prior(I); - int OpNum = Spill->findRegisterUseOperandIdx(PhysReg); - if (OpNum < 0) - getVirtRegLastUse(VirtReg) = std::make_pair((MachineInstr*)0, 0); - else - getVirtRegLastUse(VirtReg) = std::make_pair(Spill, OpNum); - } -} - -/// spillVirtReg - This method spills the value specified by PhysReg into the -/// virtual register slot specified by VirtReg. It then updates the RA data -/// structures to indicate the fact that PhysReg is now available. -/// -void RALocal::spillVirtReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg) { - assert(VirtReg && "Spilling a physical register is illegal!" - " Must not have appropriate kill for the register or use exists beyond" - " the intended one."); - DEBUG(dbgs() << " Spilling register " << TRI->getName(PhysReg) - << " containing %reg" << VirtReg); - - if (!isVirtRegModified(VirtReg)) { - DEBUG(dbgs() << " which has not been modified, so no store necessary!"); - std::pair<MachineInstr*, unsigned> &LastUse = getVirtRegLastUse(VirtReg); - if (LastUse.first) - LastUse.first->getOperand(LastUse.second).setIsKill(); - } else { - // Otherwise, there is a virtual register corresponding to this physical - // register. We only need to spill it into its stack slot if it has been - // modified. - // If the instruction reads the register that's spilled, (e.g. this can - // happen if it is a move to a physical register), then the spill - // instruction is not a kill. - bool isKill = !(I != MBB.end() && I->readsRegister(PhysReg)); - storeVirtReg(MBB, I, VirtReg, PhysReg, isKill); - } - - getVirt2PhysRegMapSlot(VirtReg) = 0; // VirtReg no longer available - - DEBUG(dbgs() << '\n'); - removePhysReg(PhysReg); -} - - -/// spillPhysReg - This method spills the specified physical register into the -/// virtual register slot associated with it. If OnlyVirtRegs is set to true, -/// then the request is ignored if the physical register does not contain a -/// virtual register. 
-/// -void RALocal::spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned PhysReg, bool OnlyVirtRegs) { - if (PhysRegsUsed[PhysReg] != -1) { // Only spill it if it's used! - assert(PhysRegsUsed[PhysReg] != -2 && "Non allocable reg used!"); - if (PhysRegsUsed[PhysReg] || !OnlyVirtRegs) - spillVirtReg(MBB, I, PhysRegsUsed[PhysReg], PhysReg); - return; - } - - // If the selected register aliases any other registers, we must make - // sure that one of the aliases isn't alive. - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) { - if (PhysRegsUsed[*AliasSet] == -1 || // Spill aliased register. - PhysRegsUsed[*AliasSet] == -2) // If allocatable. - continue; - - if (PhysRegsUsed[*AliasSet]) - spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet); - } -} - - -/// assignVirtToPhysReg - This method updates local state so that we know -/// that PhysReg is the proper container for VirtReg now. The physical -/// register must not be used for anything else when this is called. -/// -void RALocal::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) { - assert(PhysRegsUsed[PhysReg] == -1 && "Phys reg already assigned!"); - // Update information to note the fact that this register was just used, and - // it holds VirtReg. - PhysRegsUsed[PhysReg] = VirtReg; - getVirt2PhysRegMapSlot(VirtReg) = PhysReg; - AddToPhysRegsUseOrder(PhysReg); // New use of PhysReg -} - - -/// isPhysRegAvailable - Return true if the specified physical register is free -/// and available for use. This also includes checking to see if aliased -/// registers are all free... -/// -bool RALocal::isPhysRegAvailable(unsigned PhysReg) const { - if (PhysRegsUsed[PhysReg] != -1) return false; - - // If the selected register aliases any other allocated registers, it is - // not free! - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) - if (PhysRegsUsed[*AliasSet] >= 0) // Aliased register in use? - return false; // Can't use this reg then. - return true; -} - - -/// getFreeReg - Look to see if there is a free register available in the -/// specified register class. If not, return 0. -/// -unsigned RALocal::getFreeReg(const TargetRegisterClass *RC) { - // Get iterators defining the range of registers that are valid to allocate in - // this class, which also specifies the preferred allocation order. - TargetRegisterClass::iterator RI = RC->allocation_order_begin(*MF); - TargetRegisterClass::iterator RE = RC->allocation_order_end(*MF); - - for (; RI != RE; ++RI) - if (isPhysRegAvailable(*RI)) { // Is reg unused? - assert(*RI != 0 && "Cannot use register!"); - return *RI; // Found an unused register! - } - return 0; -} - - -/// getReg - Find a physical register to hold the specified virtual -/// register. If all compatible physical registers are used, this method spills -/// the last used virtual register to the stack, and uses that register. -/// -unsigned RALocal::getReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned VirtReg, bool NoFree) { - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - - // First check to see if we have a free register of the requested type... - unsigned PhysReg = NoFree ? 0 : getFreeReg(RC); - - if (PhysReg != 0) { - // Assign the register. - assignVirtToPhysReg(VirtReg, PhysReg); - return PhysReg; - } - - // If we didn't find an unused register, scavenge one now! 
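//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// The allocator being deleted here chose spill victims by least-recently-used
// order: PhysRegsUseOrder keeps the least recently used register at the
// front, so the loop below scans from index 0. The policy in miniature
// (pickVictimLRU is a hypothetical name, not part of this file):
static unsigned pickVictimLRU(const std::vector<unsigned> &UseOrder,
                              const TargetRegisterClass *RC) {
  for (unsigned i = 0, e = UseOrder.size(); i != e; ++i)
    if (RC->contains(UseOrder[i])) // first compatible = least recently used
      return UseOrder[i];
  return 0; // caller asserts a compatible register always exists
}
//===----------------------------------------------------------------------===//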
- assert(!PhysRegsUseOrder.empty() && "No allocated registers??"); - - // Loop over all of the preallocated registers from the least recently used - // to the most recently used. When we find one that is capable of holding - // our register, use it. - for (unsigned i = 0; PhysReg == 0; ++i) { - assert(i != PhysRegsUseOrder.size() && - "Couldn't find a register of the appropriate class!"); - - unsigned R = PhysRegsUseOrder[i]; - - // We can only use this register if it holds a virtual register (ie, it - // can be spilled). Do not use it if it is an explicitly allocated - // physical register! - assert(PhysRegsUsed[R] != -1 && - "PhysReg in PhysRegsUseOrder, but is not allocated?"); - if (PhysRegsUsed[R] && PhysRegsUsed[R] != -2) { - // If the current register is compatible, use it. - if (RC->contains(R)) { - PhysReg = R; - break; - } - - // If one of the registers aliased to the current register is - // compatible, use it. - for (const unsigned *AliasIt = TRI->getAliasSet(R); - *AliasIt; ++AliasIt) { - if (!RC->contains(*AliasIt)) continue; - - // If this is pinned down for some reason, don't use it. For - // example, if CL is pinned, and we run across CH, don't use - // CH as justification for using scavenging ECX (which will - // fail). - if (PhysRegsUsed[*AliasIt] == 0) continue; - - // Make sure the register is allocatable. Don't allocate SIL on - // x86-32. - if (PhysRegsUsed[*AliasIt] == -2) continue; - - PhysReg = *AliasIt; // Take an aliased register - break; - } - } - } - - assert(PhysReg && "Physical register not assigned!?!?"); - - // At this point PhysRegsUseOrder[i] is the least recently used register of - // compatible register class. Spill it to memory and reap its remains. - spillPhysReg(MBB, I, PhysReg); - - // Now that we know which register we need to assign this to, do it now! - assignVirtToPhysReg(VirtReg, PhysReg); - return PhysReg; -} - - -/// reloadVirtReg - This method transforms the specified virtual -/// register use to refer to a physical register. This method may do this in -/// one of several ways: if the register is available in a physical register -/// already, it uses that physical register. If the value is not in a physical -/// register, and if there are physical registers available, it loads it into a -/// register: PhysReg if that is an available physical register, otherwise any -/// register. If register pressure is high, and it is possible, it tries to -/// fold the load of the virtual register into the instruction itself. It -/// avoids doing this if register pressure is low to improve the chance that -/// subsequent instructions can use the reloaded value. This method returns -/// the modified instruction. -/// -MachineInstr *RALocal::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned OpNum, - SmallSet<unsigned, 4> &ReloadedRegs, - unsigned PhysReg) { - unsigned VirtReg = MI->getOperand(OpNum).getReg(); - unsigned SubIdx = MI->getOperand(OpNum).getSubReg(); - - // If the virtual register is already available, just update the instruction - // and return. - if (unsigned PR = getVirt2PhysRegMapSlot(VirtReg)) { - if (SubIdx) { - PR = TRI->getSubReg(PR, SubIdx); - MI->getOperand(OpNum).setSubReg(0); - } - MI->getOperand(OpNum).setReg(PR); // Assign the input register - if (!MI->isDebugValue()) { - // Do not do these for DBG_VALUE as they can affect codegen. - MarkPhysRegRecentlyUsed(PR); // Already have this value available! 
- getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum); - } - return MI; - } - - // Otherwise, we need to fold it into the current instruction, or reload it. - // If we have registers available to hold the value, use them. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - // If we already have a PhysReg (this happens when the instruction is a - // reg-to-reg copy with a PhysReg destination) use that. - if (!PhysReg || !TargetRegisterInfo::isPhysicalRegister(PhysReg) || - !isPhysRegAvailable(PhysReg)) - PhysReg = getFreeReg(RC); - int FrameIndex = getStackSpaceFor(VirtReg, RC); - - if (PhysReg) { // Register is available, allocate it! - assignVirtToPhysReg(VirtReg, PhysReg); - } else { // No registers available. - // Force some poor hapless value out of the register file to - // make room for the new register, and reload it. - PhysReg = getReg(MBB, MI, VirtReg, true); - } - - markVirtRegModified(VirtReg, false); // Note that this reg was just reloaded - - DEBUG(dbgs() << " Reloading %reg" << VirtReg << " into " - << TRI->getName(PhysReg) << "\n"); - - // Add move instruction(s) - TII->loadRegFromStackSlot(MBB, MI, PhysReg, FrameIndex, RC, TRI); - ++NumLoads; // Update statistics - - MF->getRegInfo().setPhysRegUsed(PhysReg); - // Assign the input register. - if (SubIdx) { - MI->getOperand(OpNum).setSubReg(0); - MI->getOperand(OpNum).setReg(TRI->getSubReg(PhysReg, SubIdx)); - } else - MI->getOperand(OpNum).setReg(PhysReg); // Assign the input register - getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum); - - if (!ReloadedRegs.insert(PhysReg)) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Ran out of registers during register allocation!"; - if (MI->isInlineAsm()) { - Msg << "\nPlease check your inline asm statement for invalid " - << "constraints:\n"; - MI->print(Msg, TM); - } - report_fatal_error(Msg.str()); - } - for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); - *SubRegs; ++SubRegs) { - if (ReloadedRegs.insert(*SubRegs)) continue; - - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Ran out of registers during register allocation!"; - if (MI->isInlineAsm()) { - Msg << "\nPlease check your inline asm statement for invalid " - << "constraints:\n"; - MI->print(Msg, TM); - } - report_fatal_error(Msg.str()); - } - - return MI; -} - -/// isReadModWriteImplicitKill - True if this is an implicit kill for a -/// read/mod/write register, i.e. update partial register. -static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && - MO.isDef() && !MO.isDead()) - return true; - } - return false; -} - -/// isReadModWriteImplicitDef - True if this is an implicit def for a -/// read/mod/write register, i.e. update partial register. -static bool isReadModWriteImplicitDef(MachineInstr *MI, unsigned Reg) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && - !MO.isDef() && MO.isKill()) - return true; - } - return false; -} - -// precedes - Helper function to determine with MachineInstr A -// precedes MachineInstr B within the same MBB. 
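//===-- Editor's aside (illustrative sketch, not part of the patch) ------===//
// precedes() below rescans the block from the top on every query, making the
// local liveness computation quadratic in block size. Numbering instructions
// once per block answers the same question in O(1) per query (sketch with
// assumed local names; not how any in-tree pass spells it):
DenseMap<const MachineInstr*, unsigned> Order;
unsigned N = 0;
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
     I != E; ++I)
  Order[&*I] = N++;
// then: A precedes B  <=>  Order[A] < Order[B]
//===----------------------------------------------------------------------===//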
-static bool precedes(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B) { - if (A == B) - return false; - - MachineBasicBlock::iterator I = A->getParent()->begin(); - while (I != A->getParent()->end()) { - if (I == A) - return true; - else if (I == B) - return false; - - ++I; - } - - return false; -} - -/// ComputeLocalLiveness - Computes liveness of registers within a basic -/// block, setting the killed/dead flags as appropriate. -void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { - // Keep track of the most recently seen previous use or def of each reg, - // so that we can update them with dead/kill markers. - DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > LastUseDef; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (I->isDebugValue()) - continue; - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); - // Uses don't trigger any flags, but we need to save - // them for later. Also, we have to process these - // _before_ processing the defs, since an instr - // uses regs before it defs them. - if (!MO.isReg() || !MO.getReg() || !MO.isUse()) - continue; - - // Ignore helpful kill flags from earlier passes. - MO.setIsKill(false); - - LastUseDef[MO.getReg()] = std::make_pair(I, i); - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; - - const unsigned *Aliases = TRI->getAliasSet(MO.getReg()); - if (Aliases == 0) - continue; - - while (*Aliases) { - DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator - alias = LastUseDef.find(*Aliases); - - if (alias != LastUseDef.end() && alias->second.first != I) - LastUseDef[*Aliases] = std::make_pair(I, i); - - ++Aliases; - } - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); - // Defs others than 2-addr redefs _do_ trigger flag changes: - // - A def followed by a def is dead - // - A use followed by a def is a kill - if (!MO.isReg() || !MO.getReg() || !MO.isDef()) continue; - - unsigned SubIdx = MO.getSubReg(); - DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator - last = LastUseDef.find(MO.getReg()); - if (last != LastUseDef.end()) { - // Check if this is a two address instruction. If so, then - // the def does not kill the use. - if (last->second.first == I && I->isRegTiedToUseOperand(i)) - continue; - - MachineOperand &lastUD = - last->second.first->getOperand(last->second.second); - if (SubIdx && lastUD.getSubReg() != SubIdx) - // Partial re-def, the last def is not dead. - // %reg1024:5<def> = - // %reg1024:6<def> = - // or - // %reg1024:5<def> = op %reg1024, 5 - continue; - - if (lastUD.isDef()) - lastUD.setIsDead(true); - else - lastUD.setIsKill(true); - } - - LastUseDef[MO.getReg()] = std::make_pair(I, i); - } - } - - // Live-out (of the function) registers contain return values of the function, - // so we need to make sure they are alive at return time. - MachineBasicBlock::iterator Ret = MBB.getFirstTerminator(); - bool BBEndsInReturn = (Ret != MBB.end() && Ret->getDesc().isReturn()); - - if (BBEndsInReturn) - for (MachineRegisterInfo::liveout_iterator - I = MF->getRegInfo().liveout_begin(), - E = MF->getRegInfo().liveout_end(); I != E; ++I) - if (!Ret->readsRegister(*I)) { - Ret->addOperand(MachineOperand::CreateReg(*I, false, true)); - LastUseDef[*I] = std::make_pair(Ret, Ret->getNumOperands()-1); - } - - // Finally, loop over the final use/def of each reg - // in the block and determine if it is dead. 
- for (DenseMap<unsigned, std::pair<MachineInstr*, unsigned> >::iterator - I = LastUseDef.begin(), E = LastUseDef.end(); I != E; ++I) { - MachineInstr *MI = I->second.first; - unsigned idx = I->second.second; - MachineOperand &MO = MI->getOperand(idx); - - bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(MO.getReg()); - - // A crude approximation of "live-out" calculation - bool usedOutsideBlock = isPhysReg ? false : - UsedInMultipleBlocks.test(MO.getReg() - - TargetRegisterInfo::FirstVirtualRegister); - - // If the machine BB ends in a return instruction, then the value isn't used - // outside of the BB. - if (!isPhysReg && (!usedOutsideBlock || BBEndsInReturn)) { - // DBG_VALUE complicates this: if the only refs of a register outside - // this block are DBG_VALUE, we can't keep the reg live just for that, - // as it will cause the reg to be spilled at the end of this block when - // it wouldn't have been otherwise. Nullify the DBG_VALUEs when that - // happens. - bool UsedByDebugValueOnly = false; - for (MachineRegisterInfo::reg_iterator UI = MRI->reg_begin(MO.getReg()), - UE = MRI->reg_end(); UI != UE; ++UI) { - // Two cases: - // - used in another block - // - used in the same block before it is defined (loop) - if (UI->getParent() == &MBB && - !(MO.isDef() && UI.getOperand().isUse() && precedes(&*UI, MI))) - continue; - - if (UI->isDebugValue()) { - UsedByDebugValueOnly = true; - continue; - } - - // A non-DBG_VALUE use means we can leave DBG_VALUE uses alone. - UsedInMultipleBlocks.set(MO.getReg() - - TargetRegisterInfo::FirstVirtualRegister); - usedOutsideBlock = true; - UsedByDebugValueOnly = false; - break; - } - - if (UsedByDebugValueOnly) - for (MachineRegisterInfo::reg_iterator UI = MRI->reg_begin(MO.getReg()), - UE = MRI->reg_end(); UI != UE; ++UI) - if (UI->isDebugValue() && - (UI->getParent() != &MBB || - (MO.isDef() && precedes(&*UI, MI)))) - UI.getOperand().setReg(0U); - } - - // Physical registers and those that are not live-out of the block are - // killed/dead at their last use/def within this block. - if (isPhysReg || !usedOutsideBlock || BBEndsInReturn) { - if (MO.isUse()) { - // Don't mark uses that are tied to defs as kills. - if (!MI->isRegTiedToDefOperand(idx)) - MO.setIsKill(true); - } else { - MO.setIsDead(true); - } - } - } -} - -void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { - // loop over each instruction - MachineBasicBlock::iterator MII = MBB.begin(); - - DEBUG({ - const BasicBlock *LBB = MBB.getBasicBlock(); - if (LBB) - dbgs() << "\nStarting RegAlloc of BB: " << LBB->getName(); - }); - - // Add live-in registers as active. - for (MachineBasicBlock::livein_iterator I = MBB.livein_begin(), - E = MBB.livein_end(); I != E; ++I) { - unsigned Reg = *I; - MF->getRegInfo().setPhysRegUsed(Reg); - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } - } - - ComputeLocalLiveness(MBB); - - // Otherwise, sequentially allocate each instruction in the MBB. 
- while (MII != MBB.end()) { - MachineInstr *MI = MII++; - const TargetInstrDesc &TID = MI->getDesc(); - DEBUG({ - dbgs() << "\nStarting RegAlloc of: " << *MI; - dbgs() << " Regs have values: "; - for (unsigned i = 0; i != TRI->getNumRegs(); ++i) - if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) { - if (PhysRegsUsed[i] && isVirtRegModified(PhysRegsUsed[i])) - dbgs() << "*"; - dbgs() << "[" << TRI->getName(i) - << ",%reg" << PhysRegsUsed[i] << "] "; - } - dbgs() << '\n'; - }); - - // Determine whether this is a copy instruction. The cases where the - // source or destination are phys regs are handled specially. - unsigned SrcCopyReg, DstCopyReg, SrcCopySubReg, DstCopySubReg; - unsigned SrcCopyPhysReg = 0U; - bool isCopy = TII->isMoveInstr(*MI, SrcCopyReg, DstCopyReg, - SrcCopySubReg, DstCopySubReg) && - SrcCopySubReg == DstCopySubReg; - if (isCopy && TargetRegisterInfo::isVirtualRegister(SrcCopyReg)) - SrcCopyPhysReg = getVirt2PhysRegMapSlot(SrcCopyReg); - - // Loop over the implicit uses, making sure that they are at the head of the - // use order list, so they don't get reallocated. - if (TID.ImplicitUses) { - for (const unsigned *ImplicitUses = TID.ImplicitUses; - *ImplicitUses; ++ImplicitUses) - MarkPhysRegRecentlyUsed(*ImplicitUses); - } - - SmallVector<unsigned, 8> Kills; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isKill()) continue; - - if (!MO.isImplicit()) - Kills.push_back(MO.getReg()); - else if (!isReadModWriteImplicitKill(MI, MO.getReg())) - // These are extra physical register kills when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - Kills.push_back(MO.getReg()); - } - - // If any physical regs are earlyclobber, spill any value they might - // have in them, then mark them unallocatable. - // If any virtual regs are earlyclobber, allocate them now (before - // freeing inputs that are killed). - if (MI->isInlineAsm()) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber() || - !MO.getReg()) - continue; - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; - - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) - DestPhysReg = getReg(MBB, MI, DestVirtReg); - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = - std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - if (unsigned DestSubIdx = MO.getSubReg()) { - MO.setSubReg(0); - DestPhysReg = TRI->getSubReg(DestPhysReg, DestSubIdx); - } - MO.setReg(DestPhysReg); // Assign the earlyclobber register - } else { - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. 
- if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } - } - } - - // If a DBG_VALUE says something is located in a spilled register, - // change the DBG_VALUE to be undef, which prevents the register - // from being reloaded here. Doing that would change the generated - // code, unless another use immediately follows this instruction. - if (MI->isDebugValue() && - MI->getNumOperands()==3 && MI->getOperand(0).isReg()) { - unsigned VirtReg = MI->getOperand(0).getReg(); - if (VirtReg && TargetRegisterInfo::isVirtualRegister(VirtReg) && - !getVirt2PhysRegMapSlot(VirtReg)) - MI->getOperand(0).setReg(0U); - } - - // Get the used operands into registers. This has the potential to spill - // incoming values if we are out of registers. Note that we completely - // ignore physical register uses here. We assume that if an explicit - // physical register is referenced by the instruction, that it is guaranteed - // to be live-in, or the input is badly hosed. - // - SmallSet<unsigned, 4> ReloadedRegs; - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand &MO = MI->getOperand(i); - // here we are looking for only used operands (never def&use) - if (MO.isReg() && !MO.isDef() && MO.getReg() && !MO.isImplicit() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) - MI = reloadVirtReg(MBB, MI, i, ReloadedRegs, - isCopy ? DstCopyReg : 0); - } - - // If this instruction is the last user of this register, kill the - // value, freeing the register being used, so it doesn't need to be - // spilled to memory. - // - for (unsigned i = 0, e = Kills.size(); i != e; ++i) { - unsigned VirtReg = Kills[i]; - unsigned PhysReg = VirtReg; - if (TargetRegisterInfo::isVirtualRegister(VirtReg)) { - // If the virtual register was never materialized into a register, it - // might not be in the map, but it won't hurt to zero it out anyway. - unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg); - PhysReg = PhysRegSlot; - PhysRegSlot = 0; - } else if (PhysRegsUsed[PhysReg] == -2) { - // Unallocatable register dead, ignore. - continue; - } else { - assert((!PhysRegsUsed[PhysReg] || PhysRegsUsed[PhysReg] == -1) && - "Silently clearing a virtual register?"); - } - - if (!PhysReg) continue; - - DEBUG(dbgs() << " Last use of " << TRI->getName(PhysReg) - << "[%reg" << VirtReg <<"], removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - DEBUG(dbgs() << " Last use of " - << TRI->getName(*SubRegs) << "[%reg" << VirtReg - <<"], removing it from live set\n"); - removePhysReg(*SubRegs); - } - } - } - - // Loop over all of the operands of the instruction, spilling registers that - // are defined, and marking explicit destinations in the PhysRegsUsed map. 
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || MO.isImplicit() || !MO.getReg() || - MO.isEarlyClobber() || - !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } - - // Loop over the implicit defs, spilling them as well. - if (TID.ImplicitDefs) { - for (const unsigned *ImplicitDefs = TID.ImplicitDefs; - *ImplicitDefs; ++ImplicitDefs) { - unsigned Reg = *ImplicitDefs; - if (PhysRegsUsed[Reg] != -2) { - spillPhysReg(MBB, MI, Reg, true); - AddToPhysRegsUseOrder(Reg); - PhysRegsUsed[Reg] = 0; // It is free and reserved now - } - MF->getRegInfo().setPhysRegUsed(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } - } - } - - SmallVector<unsigned, 8> DeadDefs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isDead()) - DeadDefs.push_back(MO.getReg()); - } - - // Okay, we have allocated all of the source operands and spilled any values - // that would be destroyed by defs of this instruction. Loop over the - // explicit defs and assign them to a register, spilling incoming values if - // we need to scavenge a register. - // - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.getReg() || - MO.isEarlyClobber() || - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; - - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) { - // If this is a copy try to reuse the input as the output; - // that will make the copy go away. - // If this is a copy, the source reg is a phys reg, and - // that reg is available, use that phys reg for DestPhysReg. - // If this is a copy, the source reg is a virtual reg, and - // the phys reg that was assigned to that virtual reg is now - // available, use that phys reg for DestPhysReg. (If it's now - // available that means this was the last use of the source.) 
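The comment block above spells out a three-way preference for the destination of a copy. Compressed into one helper it reads as below; this is an illustrative sketch, not the pass's interface, and isAvailable, dstClassContains, and scavenge are hypothetical callbacks standing in for isPhysRegAvailable, TargetRegisterClass::contains, and getReg:

// Preference order for a copy's destination register, as described above.
unsigned choosePhysRegForCopyDest(bool IsCopy, bool SrcIsPhys,
                                  unsigned SrcCopyReg, unsigned SrcCopyPhysReg,
                                  bool (*isAvailable)(unsigned),
                                  bool (*dstClassContains)(unsigned),
                                  unsigned (*scavenge)()) {
  if (IsCopy && SrcIsPhys && isAvailable(SrcCopyReg))
    return SrcCopyReg;        // reuse the physical source; the copy goes away
  if (IsCopy && !SrcIsPhys && SrcCopyPhysReg &&
      isAvailable(SrcCopyPhysReg) && dstClassContains(SrcCopyPhysReg))
    return SrcCopyPhysReg;    // reuse the reg the virtual source just vacated
  return scavenge();          // otherwise allocate normally
}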
- if (isCopy && - TargetRegisterInfo::isPhysicalRegister(SrcCopyReg) && - isPhysRegAvailable(SrcCopyReg)) { - DestPhysReg = SrcCopyReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else if (isCopy && - TargetRegisterInfo::isVirtualRegister(SrcCopyReg) && - SrcCopyPhysReg && isPhysRegAvailable(SrcCopyPhysReg) && - MF->getRegInfo().getRegClass(DestVirtReg)-> - contains(SrcCopyPhysReg)) { - DestPhysReg = SrcCopyPhysReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else - DestPhysReg = getReg(MBB, MI, DestVirtReg); - } - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - - if (unsigned DestSubIdx = MO.getSubReg()) { - MO.setSubReg(0); - DestPhysReg = TRI->getSubReg(DestPhysReg, DestSubIdx); - } - MO.setReg(DestPhysReg); // Assign the output register - } - - // If this instruction defines any registers that are immediately dead, - // kill them now. - // - for (unsigned i = 0, e = DeadDefs.size(); i != e; ++i) { - unsigned VirtReg = DeadDefs[i]; - unsigned PhysReg = VirtReg; - if (TargetRegisterInfo::isVirtualRegister(VirtReg)) { - unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg); - PhysReg = PhysRegSlot; - assert(PhysReg != 0); - PhysRegSlot = 0; - } else if (PhysRegsUsed[PhysReg] == -2) { - // Unallocatable register dead, ignore. - continue; - } else if (!PhysReg) - continue; - - DEBUG(dbgs() << " Register " << TRI->getName(PhysReg) - << " [%reg" << VirtReg - << "] is never used, removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) { - if (PhysRegsUsed[*AliasSet] != -2) { - DEBUG(dbgs() << " Register " << TRI->getName(*AliasSet) - << " [%reg" << *AliasSet - << "] is never used, removing it from live set\n"); - removePhysReg(*AliasSet); - } - } - } - - // If this instruction is a call, make sure there are no dirty registers. The - // call might throw an exception, and the landing pad expects to find all - // registers in stack slots. - if (TID.isCall()) - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { - if (PhysRegsUsed[i] <= 0) continue; - unsigned VirtReg = PhysRegsUsed[i]; - if (!isVirtRegModified(VirtReg)) continue; - DEBUG(dbgs() << " Storing dirty %reg" << VirtReg); - storeVirtReg(MBB, MI, VirtReg, i, false); - markVirtRegModified(VirtReg, false); - DEBUG(dbgs() << " because the call might throw\n"); - } - - // Finally, if this is a noop copy instruction, zap it. (Except that if - // the copy is dead, it must be kept to avoid messing up liveness info for - // the register scavenger. See pr4100.) - if (TII->isMoveInstr(*MI, SrcCopyReg, DstCopyReg, - SrcCopySubReg, DstCopySubReg) && - SrcCopyReg == DstCopyReg && SrcCopySubReg == DstCopySubReg && - DeadDefs.empty()) { - ++NumCopies; - MBB.erase(MI); - } - } - - MachineBasicBlock::iterator MI = MBB.getFirstTerminator(); - - // Spill all physical registers holding virtual registers now. - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) - if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) { - if (unsigned VirtReg = PhysRegsUsed[i]) - spillVirtReg(MBB, MI, VirtReg, i); - else - removePhysReg(i); - } - -#if 0 - // This checking code is very expensive. 
- bool AllOk = true;
- for (unsigned i = TargetRegisterInfo::FirstVirtualRegister,
- e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i)
- if (unsigned PR = Virt2PhysRegMap[i]) {
- cerr << "Register still mapped: " << i << " -> " << PR << "\n";
- AllOk = false;
- }
- assert(AllOk && "Virtual registers still in phys regs?");
-#endif
-
- // Clear any physical register which appear live at the end of the basic
- // block, but which do not hold any virtual registers. e.g., the stack
- // pointer.
- PhysRegsUseOrder.clear();
-}
-
-/// runOnMachineFunction - Register allocate the whole function
-///
-bool RALocal::runOnMachineFunction(MachineFunction &Fn) {
- DEBUG(dbgs() << "Machine Function\n");
- MF = &Fn;
- MRI = &Fn.getRegInfo();
- TM = &Fn.getTarget();
- TRI = TM->getRegisterInfo();
- TII = TM->getInstrInfo();
-
- PhysRegsUsed.assign(TRI->getNumRegs(), -1);
-
- // At various places we want to efficiently check to see whether a register
- // is allocatable. To handle this, we mark all unallocatable registers as
- // being pinned down, permanently.
- {
- BitVector Allocable = TRI->getAllocatableSet(Fn);
- for (unsigned i = 0, e = Allocable.size(); i != e; ++i)
- if (!Allocable[i])
- PhysRegsUsed[i] = -2; // Mark the reg unallocable.
- }
-
- // initialize the virtual->physical register map to have a 'null'
- // mapping for all virtual registers
- unsigned LastVirtReg = MF->getRegInfo().getLastVirtReg();
- StackSlotForVirtReg.grow(LastVirtReg);
- Virt2PhysRegMap.grow(LastVirtReg);
- Virt2LastUseMap.grow(LastVirtReg);
- VirtRegModified.resize(LastVirtReg+1 -
- TargetRegisterInfo::FirstVirtualRegister);
- UsedInMultipleBlocks.resize(LastVirtReg+1 -
- TargetRegisterInfo::FirstVirtualRegister);
-
- // Loop over all of the basic blocks, eliminating virtual register references
- for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
- MBB != MBBe; ++MBB)
- AllocateBasicBlock(*MBB);
-
- StackSlotForVirtReg.clear();
- PhysRegsUsed.clear();
- VirtRegModified.clear();
- UsedInMultipleBlocks.clear();
- Virt2PhysRegMap.clear();
- Virt2LastUseMap.clear();
- return true;
-}
-
-FunctionPass *llvm::createLocalRegisterAllocator() {
- return new RALocal();
-}
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 4fafd28..7e61a12 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -396,28 +396,23 @@ PBQPRegAlloc::CoalesceMap PBQPRegAlloc::findCoalesces() {
 if (srcRegIsPhysical && dstRegIsPhysical)
 continue;
- // If it's a copy that includes a virtual register but the source and
- // destination classes differ then we can't coalesce, so continue with
- // the next instruction.
- const TargetRegisterClass *srcRegClass = srcRegIsPhysical ?
- tri->getPhysicalRegisterRegClass(srcReg) : mri->getRegClass(srcReg);
-
- const TargetRegisterClass *dstRegClass = dstRegIsPhysical ?
- tri->getPhysicalRegisterRegClass(dstReg) : mri->getRegClass(dstReg);
-
- if (srcRegClass != dstRegClass)
+ // If it's a copy that includes two virtual registers but the source and
+ // destination classes differ then we can't coalesce.
+ if (!srcRegIsPhysical && !dstRegIsPhysical &&
+ mri->getRegClass(srcReg) != mri->getRegClass(dstReg))
 continue;
- // We also need any physical regs to be allocable, coalescing with
- // a non-allocable register is invalid.
- if (srcRegIsPhysical) {
+ // If one is physical and one is virtual, check that the physical is
+ // allocatable in the class of the virtual.
+ if (srcRegIsPhysical && !dstRegIsPhysical) { + const TargetRegisterClass *dstRegClass = mri->getRegClass(dstReg); if (std::find(dstRegClass->allocation_order_begin(*mf), dstRegClass->allocation_order_end(*mf), srcReg) == dstRegClass->allocation_order_end(*mf)) continue; } - - if (dstRegIsPhysical) { + if (!srcRegIsPhysical && dstRegIsPhysical) { + const TargetRegisterClass *srcRegClass = mri->getRegClass(srcReg); if (std::find(srcRegClass->allocation_order_begin(*mf), srcRegClass->allocation_order_end(*mf), dstReg) == srcRegClass->allocation_order_end(*mf)) diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 1131e3d..ab0bc2d 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -16,6 +16,8 @@ #include "llvm/CodeGen/RegisterCoalescer.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Pass.h" @@ -33,6 +35,160 @@ char RegisterCoalescer::ID = 0; // RegisterCoalescer::~RegisterCoalescer() {} +unsigned CoalescerPair::compose(unsigned a, unsigned b) const { + if (!a) return b; + if (!b) return a; + return tri_.composeSubRegIndices(a, b); +} + +bool CoalescerPair::isMoveInstr(const MachineInstr *MI, + unsigned &Src, unsigned &Dst, + unsigned &SrcSub, unsigned &DstSub) const { + if (MI->isCopy()) { + Dst = MI->getOperand(0).getReg(); + DstSub = MI->getOperand(0).getSubReg(); + Src = MI->getOperand(1).getReg(); + SrcSub = MI->getOperand(1).getSubReg(); + } else if (MI->isSubregToReg()) { + Dst = MI->getOperand(0).getReg(); + DstSub = compose(MI->getOperand(0).getSubReg(), MI->getOperand(3).getImm()); + Src = MI->getOperand(2).getReg(); + SrcSub = MI->getOperand(2).getSubReg(); + } else if (!tii_.isMoveInstr(*MI, Src, Dst, SrcSub, DstSub)) { + return false; + } + return true; +} + +bool CoalescerPair::setRegisters(const MachineInstr *MI) { + srcReg_ = dstReg_ = subIdx_ = 0; + newRC_ = 0; + flipped_ = crossClass_ = false; + + unsigned Src, Dst, SrcSub, DstSub; + if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub)) + return false; + partial_ = SrcSub || DstSub; + + // If one register is a physreg, it must be Dst. + if (TargetRegisterInfo::isPhysicalRegister(Src)) { + if (TargetRegisterInfo::isPhysicalRegister(Dst)) + return false; + std::swap(Src, Dst); + std::swap(SrcSub, DstSub); + flipped_ = true; + } + + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + if (TargetRegisterInfo::isPhysicalRegister(Dst)) { + // Eliminate DstSub on a physreg. + if (DstSub) { + Dst = tri_.getSubReg(Dst, DstSub); + if (!Dst) return false; + DstSub = 0; + } + + // Eliminate SrcSub by picking a corresponding Dst superregister. + if (SrcSub) { + Dst = tri_.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src)); + if (!Dst) return false; + SrcSub = 0; + } else if (!MRI.getRegClass(Src)->contains(Dst)) { + return false; + } + } else { + // Both registers are virtual. + + // Both registers have subreg indices. + if (SrcSub && DstSub) { + // For now we only handle the case of identical indices in commensurate + // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg + // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg. 
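The physical-vs-virtual guard above is a membership test: the fixed register must appear in the allocation order of the virtual register's class. The same test in isolation, with a toy RegClass standing in for TargetRegisterClass and its allocation_order_begin/end pair:

#include <algorithm>
#include <vector>

struct RegClass { std::vector<unsigned> AllocationOrder; };

// A physical register can only be coalesced into a virtual register's class
// if the allocator could have chosen it for that class in the first place.
bool isAllocatableIn(unsigned PhysReg, const RegClass &RC) {
  return std::find(RC.AllocationOrder.begin(), RC.AllocationOrder.end(),
                   PhysReg) != RC.AllocationOrder.end();
}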
+ if (SrcSub != DstSub)
+ return false;
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+ if (!getCommonSubClass(DstRC, SrcRC))
+ return false;
+ SrcSub = DstSub = 0;
+ }
+
+ // There can be no SrcSub.
+ if (SrcSub) {
+ std::swap(Src, Dst);
+ DstSub = SrcSub;
+ SrcSub = 0;
+ assert(!flipped_ && "Unexpected flip");
+ flipped_ = true;
+ }
+
+ // Find the new register class.
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+ if (DstSub)
+ newRC_ = tri_.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ else
+ newRC_ = getCommonSubClass(DstRC, SrcRC);
+ if (!newRC_)
+ return false;
+ crossClass_ = newRC_ != DstRC || newRC_ != SrcRC;
+ }
+ // Check our invariants
+ assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
+ assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
+ "Cannot have a physical SubIdx");
+ srcReg_ = Src;
+ dstReg_ = Dst;
+ subIdx_ = DstSub;
+ return true;
+}
+
+bool CoalescerPair::flip() {
+ if (subIdx_ || TargetRegisterInfo::isPhysicalRegister(dstReg_))
+ return false;
+ std::swap(srcReg_, dstReg_);
+ flipped_ = !flipped_;
+ return true;
+}
+
+bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+ unsigned Src, Dst, SrcSub, DstSub;
+ if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub))
+ return false;
+
+ // Find the virtual register that is srcReg_.
+ if (Dst == srcReg_) {
+ std::swap(Src, Dst);
+ std::swap(SrcSub, DstSub);
+ } else if (Src != srcReg_) {
+ return false;
+ }
+
+ // Now check that Dst matches dstReg_.
+ if (TargetRegisterInfo::isPhysicalRegister(dstReg_)) {
+ if (!TargetRegisterInfo::isPhysicalRegister(Dst))
+ return false;
+ assert(!subIdx_ && "Inconsistent CoalescerPair state.");
+ // DstSub could be set for a physreg from INSERT_SUBREG.
+ if (DstSub)
+ Dst = tri_.getSubReg(Dst, DstSub);
+ // Full copy of Src.
+ if (!SrcSub)
+ return dstReg_ == Dst;
+ // This is a partial register copy. Check that the parts match.
+ return tri_.getSubReg(dstReg_, SrcSub) == Dst;
+ } else {
+ // dstReg_ is virtual.
+ if (dstReg_ != Dst)
+ return false;
+ // Registers match; do the subregisters line up?
+ return compose(subIdx_, SrcSub) == DstSub;
+ }
+}
+
 // Because of the way .a files work, we must force the SimpleRC
 // implementation to be pulled in if the RegisterCoalescer classes are
 // pulled in. Otherwise we run the risk of RegisterCoalescer being
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 690e59f..43b3fb6 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -141,6 +141,10 @@ void RegScavenger::forward() {
 // Find out which registers are early clobbered, killed, defined, and marked
 // def-dead in this instruction.
+ // FIXME: The scavenger is not predication aware. If the instruction is
+ // predicated, conservatively assume "kill" markers do not actually kill the
+ // register. Similarly, ignore "dead" markers.
+ bool isPred = TII->isPredicated(MI);
 BitVector EarlyClobberRegs(NumPhysRegs);
 BitVector KillRegs(NumPhysRegs);
 BitVector DefRegs(NumPhysRegs);
@@ -155,11 +159,11 @@ void RegScavenger::forward() {
 if (MO.isUse()) {
 // Two-address operands implicitly kill.
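CoalescerPair::compose above treats index 0 as the identity on both sides, so only genuinely nested sub-register indices ever reach the target hook. The same shape in isolation, with composeIndices as a stand-in for TargetRegisterInfo::composeSubRegIndices:

// Compose two sub-register indices, where 0 means "the whole register".
unsigned composeSubIdx(unsigned A, unsigned B,
                       unsigned (*composeIndices)(unsigned, unsigned)) {
  if (!A) return B;              // no outer index: the result is just B
  if (!B) return A;              // no inner index: the result is just A
  return composeIndices(A, B);   // both set: ask the target to fold them
}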
- if (MO.isKill() || MI->isRegTiedToDefOperand(i))
+ if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i)))
 addRegWithSubRegs(KillRegs, Reg);
 } else {
 assert(MO.isDef());
- if (MO.isDead())
+ if (!isPred && MO.isDead())
 addRegWithSubRegs(DeadRegs, Reg);
 else
 addRegWithSubRegs(DefRegs, Reg);
@@ -238,8 +242,18 @@ unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
 return 0;
 }
+/// getRegsAvailable - Return, in Mask, all available registers in the
+/// register class.
+void RegScavenger::getRegsAvailable(const TargetRegisterClass *RC,
+ BitVector &Mask) {
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+ I != E; ++I)
+ if (!isAliasUsed(*I))
+ Mask.set(*I);
+}
+
 /// findSurvivorReg - Return the candidate register that is unused for the
-/// longest after MBBI. UseMI is set to the instruction where the search
+/// longest after StartMI. UseMI is set to the instruction where the search
 /// stopped.
 ///
 /// No more than InstrLimit instructions are inspected.
@@ -258,6 +272,10 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
 bool inVirtLiveRange = false;
 for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) {
+ if (MI->isDebugValue()) {
+ ++InstrLimit; // Don't count debug instructions
+ continue;
+ }
 bool isVirtKillInsn = false;
 bool isVirtDefInsn = false;
 // Remove any candidates touched by instruction.
@@ -321,13 +339,16 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
 Candidates.reset(MO.getReg());
 }
+ // Try to find a register that's unused if there is one, as then we won't
+ // have to spill.
+ if ((Candidates & RegsAvailable).any())
+ Candidates &= RegsAvailable;
+
 // Find the register whose use is furthest away.
 MachineBasicBlock::iterator UseMI;
 unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
- // If we found an unused register there is no reason to spill it. We have
- // probably found a callee-saved register that has been saved in the
- // prologue, but happens to be unused at this point.
+ // If we found an unused register there is no reason to spill it.
 if (!isAliasUsed(SReg))
 return SReg;
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index da20c12..7d39dc4 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -380,26 +380,26 @@ void ScheduleDAG::VerifySchedule(bool isBottomUp) {
 }
 #endif
-/// InitDAGTopologicalSorting - create the initial topological
+/// InitDAGTopologicalSorting - create the initial topological
 /// ordering from the DAG to be scheduled.
 ///
-/// The idea of the algorithm is taken from
+/// The idea of the algorithm is taken from
 /// "Online algorithms for managing the topological order of
 /// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
-/// This is the MNR algorithm, which was first introduced by
-/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+/// This is the MNR algorithm, which was first introduced by
+/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
 /// "Maintaining a topological order under edge insertions".
 ///
-/// Short description of the algorithm:
+/// Short description of the algorithm:
 ///
 /// Topological ordering, ord, of a DAG maps each node to a topological
 /// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
 ///
-/// This means that if there is a path from the node X to the node Z,
+/// This means that if there is a path from the node X to the node Z,
 /// then ord(X) < ord(Z).
/// /// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would +/// if Z is reachable from X, then an insertion of the edge Z->X would /// create a cycle. /// /// The algorithm first computes a topological ordering for the DAG by @@ -431,7 +431,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { // Collect leaf nodes. WorkList.push_back(SU); } - } + } int Id = DAGSize; while (!WorkList.empty()) { @@ -456,7 +456,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { SUnit *SU = &SUnits[i]; for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && + assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && "Wrong topological sorting"); } } @@ -494,7 +494,7 @@ void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool& HasLoop) { std::vector<const SUnit*> WorkList; - WorkList.reserve(SUnits.size()); + WorkList.reserve(SUnits.size()); WorkList.push_back(SU); do { @@ -504,20 +504,20 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, for (int I = SU->Succs.size()-1; I >= 0; --I) { int s = SU->Succs[I].getSUnit()->NodeNum; if (Node2Index[s] == UpperBound) { - HasLoop = true; + HasLoop = true; return; } // Visit successors if not already and in affected region. if (!Visited.test(s) && Node2Index[s] < UpperBound) { WorkList.push_back(SU->Succs[I].getSUnit()); - } - } + } + } } while (!WorkList.empty()); } -/// Shift - Renumber the nodes so that the topological ordering is +/// Shift - Renumber the nodes so that the topological ordering is /// preserved. -void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, +void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, int UpperBound) { std::vector<int> L; int shift = 0; @@ -568,7 +568,7 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, // Is Ord(TargetSU) < Ord(SU) ? if (LowerBound < UpperBound) { Visited.reset(); - // There may be a path from TargetSU to SU. Check for it. + // There may be a path from TargetSU to SU. Check for it. DFS(TargetSU, UpperBound, HasLoop); } return HasLoop; @@ -580,8 +580,7 @@ void ScheduleDAGTopologicalSort::Allocate(int n, int index) { Index2Node[index] = n; } -ScheduleDAGTopologicalSort::ScheduleDAGTopologicalSort( - std::vector<SUnit> &sunits) - : SUnits(sunits) {} +ScheduleDAGTopologicalSort:: +ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits) : SUnits(sunits) {} ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} diff --git a/lib/CodeGen/ScheduleDAGEmit.cpp b/lib/CodeGen/ScheduleDAGEmit.cpp index ee08e1d..0a2fb37 100644 --- a/lib/CodeGen/ScheduleDAGEmit.cpp +++ b/lib/CodeGen/ScheduleDAGEmit.cpp @@ -50,11 +50,8 @@ void ScheduleDAG::EmitPhysRegCopy(SUnit *SU, break; } } - bool Success = TII->copyRegToReg(*BB, InsertPos, Reg, VRI->second, - SU->CopyDstRC, SU->CopySrcRC, - DebugLoc()); - (void)Success; - assert(Success && "copyRegToReg failed!"); + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(VRI->second); } else { // Copy from physical register. assert(I->getReg() && "Unknown physical register!"); @@ -62,11 +59,8 @@ void ScheduleDAG::EmitPhysRegCopy(SUnit *SU, bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second; isNew = isNew; // Silence compiler warning. 
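The ScheduleDAGTopologicalSort comments above rest on one invariant: ord(X) < ord(Y) for every edge X->Y, so Z can only be reachable from X when ord(X) < ord(Z), and a search may prune every node whose index exceeds ord(Z). A toy reachability check built on that bound, using plain adjacency lists and an ord array instead of the SUnit machinery:

#include <vector>

bool reachable(unsigned X, unsigned Z,
               const std::vector<std::vector<unsigned>> &Succs,
               const std::vector<int> &Ord) {
  if (Ord[X] >= Ord[Z])
    return false;                          // the invariant rules it out
  std::vector<unsigned> Work{X};
  std::vector<bool> Seen(Succs.size(), false);
  while (!Work.empty()) {
    unsigned N = Work.back();
    Work.pop_back();
    if (N == Z)
      return true;
    for (unsigned S : Succs[N])
      if (!Seen[S] && Ord[S] <= Ord[Z]) {  // prune by the topological bound
        Seen[S] = true;
        Work.push_back(S);
      }
  }
  return false;
}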
assert(isNew && "Node emitted out of order - early"); - bool Success = TII->copyRegToReg(*BB, InsertPos, VRBase, I->getReg(), - SU->CopyDstRC, SU->CopySrcRC, - DebugLoc()); - (void)Success; - assert(Success && "copyRegToReg failed!"); + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase) + .addReg(I->getReg()); } break; } diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h index ad82db2..d90659b 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.h +++ b/lib/CodeGen/ScheduleDAGInstrs.h @@ -69,8 +69,10 @@ namespace llvm { const SmallSet<unsigned, 8> &LoopLiveIns) { unsigned Count = 0; for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I, ++Count) { + I != E; ++I) { const MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isUse()) @@ -79,6 +81,7 @@ namespace llvm { if (LoopLiveIns.count(MOReg)) Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count))); } + ++Count; // Not every iteration due to dbg_value above. } const std::vector<MachineDomTreeNode*> &Children = Node->getChildren(); diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index 0cfd5e1..799988a 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMSelectionDAG - CallingConvLower.cpp DAGCombiner.cpp FastISel.cpp FunctionLoweringInfo.cpp diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6bddd78..e671752 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -211,6 +211,7 @@ namespace { SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); + SDValue visitMEMBARRIER(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS); @@ -668,7 +669,7 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; - return DAG.getExtLoad(ExtType, dl, PVT, + return DAG.getExtLoad(ExtType, PVT, dl, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), LD->getSrcValueOffset(), MemVT, LD->isVolatile(), @@ -890,7 +891,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? 
ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); - SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT, + SDValue NewLD = DAG.getExtLoad(ExtType, PVT, dl, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), LD->getSrcValueOffset(), MemVT, LD->isVolatile(), @@ -1079,6 +1080,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); + case ISD::MEMBARRIER: return visitMEMBARRIER(N); } return SDValue(); } @@ -1313,7 +1315,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C && GA->getOpcode() == ISD::GlobalAddress) - return DAG.getGlobalAddress(GA->getGlobal(), VT, + return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT, GA->getOffset() + (uint64_t)N1C->getSExtValue()); // fold ((c1-A)+c2) -> (c1+c2)-A @@ -1550,7 +1552,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { // fold (sub Sym, c) -> Sym-c if (N1C && GA->getOpcode() == ISD::GlobalAddress) - return DAG.getGlobalAddress(GA->getGlobal(), VT, + return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT, GA->getOffset() - (uint64_t)N1C->getSExtValue()); // fold (sub Sym+c1, Sym+c2) -> c1-c2 @@ -2028,7 +2030,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) - // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) + // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine // into a vsetcc. @@ -2038,7 +2040,10 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // Avoid infinite looping with PromoteIntBinOp. (N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || - (N0.getOpcode() == ISD::TRUNCATE && TLI.isTypeLegal(Op0VT))) && + (N0.getOpcode() == ISD::TRUNCATE && + (!TLI.isZExtFree(VT, Op0VT) || + !TLI.isTruncateFree(Op0VT, VT)) && + TLI.isTypeLegal(Op0VT))) && !VT.isVector() && Op0VT == N1.getOperand(0).getValueType() && (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { @@ -2193,7 +2198,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -2216,7 +2221,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -2250,7 +2255,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT LoadResultTy = HasAnyExt ? 
LN0->getValueType(0) : VT; SDValue NewLoad = - DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy, + DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), @@ -2286,7 +2291,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; SDValue Load = - DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy, + DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), @@ -2317,7 +2322,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } // fold (or x, undef) -> -1 - if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) { + if (!LegalOperations && + (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) { EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT; return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT); } @@ -2425,6 +2431,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc())) return SDValue(Rot, 0); + // Simplify the operands using demanded-bits information. + if (!VT.isVector() && + SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -3158,6 +3169,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return NewSRL; } + // Attempt to convert a srl of a load into a narrower zero-extending load. + SDValue NarrowLoad = ReduceLoadWidth(N); + if (NarrowLoad.getNode()) + return NarrowLoad; + // Here is a common situation. We want to optimize: // // %a = ... @@ -3487,7 +3503,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3531,7 +3547,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -3557,24 +3573,24 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. 
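The SimplifyDemandedBits call added to visitOR above exploits a bitwise fact: a bit of one OR operand is irrelevant wherever the other operand is known to be 1. A one-line model of that demanded-bits step; KnownOneB would come from value tracking, here it is simply a parameter:

#include <cstdint>

// Which bits of A still matter in (A | B), given the bits demanded of the
// result and the bits proven set in B.
uint64_t demandedBitsOfA(uint64_t Demanded, uint64_t KnownOneB) {
  return Demanded & ~KnownOneB;   // B's known ones mask A's bits out
}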
if (VT.getSizeInBits() == N0VT.getSizeInBits()) - return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), + N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend else { - EVT MatchingElementType = - EVT::getIntegerVT(*DAG.getContext(), - N0VT.getScalarType().getSizeInBits()); - EVT MatchingVectorType = - EVT::getVectorVT(*DAG.getContext(), MatchingElementType, - N0VT.getVectorNumElements()); - SDValue VsetCC = - DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); + EVT MatchingElementType = + EVT::getIntegerVT(*DAG.getContext(), + N0VT.getScalarType().getSizeInBits()); + EVT MatchingVectorType = + EVT::getVectorVT(*DAG.getContext(), MatchingElementType, + N0VT.getVectorNumElements()); + SDValue VsetCC = + DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), + N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); } } @@ -3635,10 +3651,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE && - (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) && - (!TLI.isTruncateFree(N0.getOperand(0).getValueType(), - N0.getValueType()) || - !TLI.isZExtFree(N0.getValueType(), VT))) { + (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) { SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op); @@ -3679,7 +3692,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3723,7 +3736,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -3881,7 +3894,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3925,8 +3938,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.hasOneUse()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT MemVT = LN0->getMemoryVT(); - SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), N->getDebugLoc(), - VT, LN0->getChain(), LN0->getBasePtr(), + SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), VT, + N->getDebugLoc(), + 
LN0->getChain(), LN0->getBasePtr(),
 LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT,
 LN0->isVolatile(), LN0->isNonTemporal(),
@@ -3950,24 +3964,24 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
 // we know that the element size of the sext'd result matches the
 // element size of the compare operands.
 if (VT.getSizeInBits() == N0VT.getSizeInBits())
- return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
 // If the desired elements are smaller or larger than the source
 // elements we can use a matching integer vector type and then
 // truncate/sign extend
 else {
- EVT MatchingElementType =
- EVT::getIntegerVT(*DAG.getContext(),
- N0VT.getScalarType().getSizeInBits());
- EVT MatchingVectorType =
- EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
- N0VT.getVectorNumElements());
- SDValue VsetCC =
- DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+ EVT MatchingElementType =
+ EVT::getIntegerVT(*DAG.getContext(),
+ N0VT.getScalarType().getSizeInBits());
+ EVT MatchingVectorType =
+ EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+ N0VT.getVectorNumElements());
+ SDValue VsetCC =
+ DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
 }
 }
@@ -4024,6 +4038,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
 /// extended, also fold the extension to form an extending load.
 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
 unsigned Opc = N->getOpcode();
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
 SDValue N0 = N->getOperand(0);
 EVT VT = N->getValueType(0);
@@ -4040,6 +4055,15 @@
 ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
 if (LegalOperations && !TLI.isLoadExtLegal(ISD::SEXTLOAD, ExtVT))
 return SDValue();
+ } else if (Opc == ISD::SRL) {
+ // Another special case: SRL is basically zero-extending a narrower
+ // value.
+ ExtType = ISD::ZEXTLOAD;
+ N0 = SDValue(N, 0);
+ ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!N01) return SDValue();
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits() - N01->getZExtValue());
 }
 unsigned EVTBits = ExtVT.getSizeInBits();
@@ -4085,7 +4109,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
 ?
DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff, LN0->isVolatile(), LN0->isNonTemporal(), NewAlign) - : DAG.getExtLoad(ExtType, N0.getDebugLoc(), VT, LN0->getChain(), NewPtr, + : DAG.getExtLoad(ExtType, VT, N0.getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff, ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), NewAlign); @@ -4172,7 +4196,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), EVT, @@ -4189,7 +4213,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), EVT, @@ -4243,8 +4267,17 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) - if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) - return ReduceLoadWidth(N); + if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { + SDValue Reduced = ReduceLoadWidth(N); + if (Reduced.getNode()) + return Reduced; + } + + // Simplify the operands using demanded-bits information. + if (!VT.isVector() && + SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -4943,7 +4976,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -5527,8 +5560,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getAlignment()) - return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), - LD->getValueType(0), + return DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0), + N->getDebugLoc(), Chain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), Align); @@ -5551,8 +5584,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); } else { - ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), - LD->getValueType(0), + ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0), + LD->getDebugLoc(), BetterChain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), @@ -6077,7 +6110,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // Check if the result type doesn't match the inserted element type. 
A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. - EVT EltVT = InVec.getValueType().getVectorElementType(); SDValue InOp = InVec.getOperand(0); EVT NVT = N->getValueType(0); if (InOp.getValueType() != NVT) { @@ -6277,8 +6309,6 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { - return SDValue(); - EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -6334,6 +6364,59 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitMEMBARRIER(SDNode* N) { + if (!TLI.getShouldFoldAtomicFences()) + return SDValue(); + + SDValue atomic = N->getOperand(0); + switch (atomic.getOpcode()) { + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + break; + default: + return SDValue(); + } + + SDValue fence = atomic.getOperand(0); + if (fence.getOpcode() != ISD::MEMBARRIER) + return SDValue(); + + switch (atomic.getOpcode()) { + case ISD::ATOMIC_CMP_SWAP: + return SDValue(DAG.UpdateNodeOperands(atomic.getNode(), + fence.getOperand(0), + atomic.getOperand(1), atomic.getOperand(2), + atomic.getOperand(3)), atomic.getResNo()); + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + return SDValue(DAG.UpdateNodeOperands(atomic.getNode(), + fence.getOperand(0), + atomic.getOperand(1), atomic.getOperand(2)), + atomic.getResNo()); + default: + return SDValue(); + } +} + /// XformToShuffleWithZero - Returns a vector_shuffle if it able to transform /// an AND to a vector_shuffle with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> @@ -6565,8 +6648,8 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, LLD->getAlignment()); } else { Load = DAG.getExtLoad(LLD->getExtensionType(), - TheSelect->getDebugLoc(), TheSelect->getValueType(0), + TheSelect->getDebugLoc(), LLD->getChain(), Addr, 0, 0, LLD->getMemoryVT(), LLD->isVolatile(), @@ -6807,38 +6890,34 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, } } - // Check to see if this is an integer abs. select_cc setl[te] X, 0, -X, X -> + // Check to see if this is an integer abs. 
+ // select_cc setg[te] X, 0, X, -X -> + // select_cc setgt X, -1, X, -X -> + // select_cc setl[te] X, 0, -X, X -> + // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) - if (N1C && N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE) && - N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1) && - N2.getOperand(0) == N1 && N0.getValueType().isInteger()) { + if (N1C) { + ConstantSDNode *SubC = NULL; + if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || + (N1C->isAllOnesValue() && CC == ISD::SETGT)) && + N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) + SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0)); + else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || + (N1C->isOne() && CC == ISD::SETLT)) && + N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) + SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0)); + EVT XType = N0.getValueType(); - SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, N0, - DAG.getConstant(XType.getSizeInBits()-1, - getShiftAmountTy())); - SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), XType, - N0, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); - } - // Check to see if this is an integer abs. select_cc setgt X, -1, X, -X -> - // Y = sra (X, size(X)-1); xor (add (X, Y), Y) - if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT && - N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) { - if (ConstantSDNode *SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0))) { - EVT XType = N0.getValueType(); - if (SubC->isNullValue() && XType.isInteger()) { - SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, - N0, - DAG.getConstant(XType.getSizeInBits()-1, - getShiftAmountTy())); - SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), - XType, N0, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); - } + if (SubC && SubC->isNullValue() && XType.isInteger()) { + SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, + N0, + DAG.getConstant(XType.getSizeInBits()-1, + getShiftAmountTy())); + SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), + XType, N0, Shift); + AddToWorkList(Shift.getNode()); + AddToWorkList(Add.getNode()); + return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 95f4d07..3f7e4a5 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -44,18 +44,38 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Analysis/DebugInfo.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/ErrorHandling.h" -#include "FunctionLoweringInfo.h" using namespace llvm; +/// startNewBlock - Set the current block to which generated machine +/// instructions will be appended, and clear the local CSE map. 
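The select_cc patterns above all denote |X|, and the replacement sequence Y = sra(X, size(X)-1); xor(add(X, Y), Y) is the classic branchless absolute value. A worked version in plain C++, for illustration only; the arithmetic right shift on a negative int is implementation-defined before C++20, though universally arithmetic in practice, and like any two's-complement abs it maps INT32_MIN to itself:

#include <cstdint>

int32_t absBranchless(int32_t X) {
  int32_t Y = X >> 31;     // 0 when X >= 0, all ones (-1) when X < 0
  return (X + Y) ^ Y;      // X when Y == 0; ~(X - 1) == -X when Y == -1
}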
+///
+void FastISel::startNewBlock() {
+ LocalValueMap.clear();
+
+ // Start out as null, meaning no local-value instructions have
+ // been emitted.
+ LastLocalValue = 0;
+
+ // Advance the last local value past any EH_LABEL instructions.
+ MachineBasicBlock::iterator
+ I = FuncInfo.MBB->begin(), E = FuncInfo.MBB->end();
+ while (I != E && I->getOpcode() == TargetOpcode::EH_LABEL) {
+ LastLocalValue = I;
+ ++I;
+ }
+}
+
 bool FastISel::hasTrivialKill(const Value *V) const {
 // Don't consider constants or arguments to have trivial kills.
 const Instruction *I = dyn_cast<Instruction>(V);
@@ -99,25 +119,31 @@ unsigned FastISel::getRegForValue(const Value *V) {
 // cache values defined by Instructions across blocks, and other values
 // only locally. This is because Instructions already have the SSA
 // def-dominates-use requirement enforced.
- DenseMap<const Value *, unsigned>::iterator I = ValueMap.find(V);
- if (I != ValueMap.end())
- return I->second;
+ DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V);
+ if (I != FuncInfo.ValueMap.end()) {
+ unsigned Reg = I->second;
+ return Reg;
+ }
 unsigned Reg = LocalValueMap[V];
 if (Reg != 0)
 return Reg;
 // In bottom-up mode, just create the virtual register which will be used
 // to hold the value. It will be materialized later.
- if (IsBottomUp) {
- Reg = createResultReg(TLI.getRegClassFor(VT));
- if (isa<Instruction>(V))
- ValueMap[V] = Reg;
- else
- LocalValueMap[V] = Reg;
- return Reg;
- }
+ if (isa<Instruction>(V) &&
+ (!isa<AllocaInst>(V) ||
+ !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(V))))
+ return FuncInfo.InitializeRegForValue(V);
+
+ MachineBasicBlock::iterator SaveInsertPt = enterLocalValueArea();
+
+ // Materialize the value in a register. Emit any instructions in the
+ // local value area.
+ Reg = materializeRegForValue(V, VT);
- return materializeRegForValue(V, VT);
+ leaveLocalValueArea(SaveInsertPt);
+
+ return Reg;
 }
 /// materializeRegForValue - Helper for getRegForValue. This function is
@@ -161,11 +187,15 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
 }
 }
 } else if (const Operator *Op = dyn_cast<Operator>(V)) {
- if (!SelectOperator(Op, Op->getOpcode())) return 0;
- Reg = LocalValueMap[Op];
+ if (!SelectOperator(Op, Op->getOpcode()))
+ if (!isa<Instruction>(Op) ||
+ !TargetSelectInstruction(cast<Instruction>(Op)))
+ return 0;
+ Reg = lookUpRegForValue(Op);
 } else if (isa<UndefValue>(V)) {
 Reg = createResultReg(TLI.getRegClassFor(VT));
- BuildMI(MBB, DL, TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
 }
 // If target-independent code couldn't handle the value, give target-specific
@@ -175,8 +205,10 @@
 // Don't cache constant materializations in the general ValueMap.
 // To do so would require tracking what uses they dominate.
- if (Reg != 0)
+ if (Reg != 0) {
 LocalValueMap[V] = Reg;
+ LastLocalValue = MRI.getVRegDef(Reg);
+ }
 return Reg;
 }
@@ -185,8 +217,9 @@ unsigned FastISel::lookUpRegForValue(const Value *V) {
 // cache values defined by Instructions across blocks, and other values
 // only locally. This is because Instructions already have the SSA
 // def-dominates-use requirement enforced.
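getRegForValue above consults two maps: the function-wide FuncInfo.ValueMap for values whose SSA def dominates all uses, and the block-local LocalValueMap that startNewBlock clears. A toy two-level lookup capturing that split, with opaque const void* keys standing in for Value*:

#include <unordered_map>

struct ValueRegMaps {
  std::unordered_map<const void *, unsigned> FuncMap;   // survives blocks
  std::unordered_map<const void *, unsigned> LocalMap;  // cleared per block

  unsigned lookup(const void *V) const {
    auto I = FuncMap.find(V);
    if (I != FuncMap.end())
      return I->second;
    auto J = LocalMap.find(V);
    return J != LocalMap.end() ? J->second : 0;         // 0 means unassigned
  }
  void startNewBlock() { LocalMap.clear(); }
};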
- if (ValueMap.count(V)) - return ValueMap[V]; + DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V); + if (I != FuncInfo.ValueMap.end()) + return I->second; return LocalValueMap[V]; } @@ -202,14 +235,17 @@ unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) { return Reg; } - unsigned &AssignedReg = ValueMap[I]; + unsigned &AssignedReg = FuncInfo.ValueMap[I]; if (AssignedReg == 0) + // Use the new register. AssignedReg = Reg; else if (Reg != AssignedReg) { - const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); - TII.copyRegToReg(*MBB, MBB->end(), AssignedReg, - Reg, RegClass, RegClass, DL); + // Arrange for uses of AssignedReg to be replaced by uses of Reg. + FuncInfo.RegFixups[AssignedReg] = Reg; + + AssignedReg = Reg; } + return AssignedReg; } @@ -237,6 +273,33 @@ std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) { return std::pair<unsigned, bool>(IdxN, IdxNIsKill); } +void FastISel::recomputeInsertPt() { + if (getLastLocalValue()) { + FuncInfo.InsertPt = getLastLocalValue(); + ++FuncInfo.InsertPt; + } else + FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI(); + + // Now skip past any EH_LABELs, which must remain at the beginning. + while (FuncInfo.InsertPt != FuncInfo.MBB->end() && + FuncInfo.InsertPt->getOpcode() == TargetOpcode::EH_LABEL) + ++FuncInfo.InsertPt; +} + +MachineBasicBlock::iterator FastISel::enterLocalValueArea() { + MachineBasicBlock::iterator OldInsertPt = FuncInfo.InsertPt; + recomputeInsertPt(); + return OldInsertPt; +} + +void FastISel::leaveLocalValueArea(MachineBasicBlock::iterator OldInsertPt) { + if (FuncInfo.InsertPt != FuncInfo.MBB->begin()) + LastLocalValue = llvm::prior(FuncInfo.InsertPt); + + // Restore the previous insert position. + FuncInfo.InsertPt = OldInsertPt; +} + /// SelectBinaryOp - Select and emit code for a binary operator instruction, /// which has an opcode which directly corresponds to the given ISD opcode. /// @@ -345,7 +408,7 @@ bool FastISel::SelectGetElementPtr(const User *I) { // If this is a constant subscript, handle it quickly. if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { - if (CI->getZExtValue() == 0) continue; + if (CI->isZero()) continue; uint64_t Offs = TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, Offs, VT); @@ -395,7 +458,7 @@ bool FastISel::SelectCall(const User *I) { case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(I); if (!DIVariable(DI->getVariable()).Verify() || - !MF.getMMI().hasDebugInfo()) + !FuncInfo.MF->getMMI().hasDebugInfo()) return true; const Value *Address = DI->getAddress(); @@ -409,11 +472,12 @@ bool FastISel::SelectCall(const User *I) { // those are handled in SelectionDAGBuilder. if (AI) { DenseMap<const AllocaInst*, int>::iterator SI = - StaticAllocaMap.find(AI); - if (SI == StaticAllocaMap.end()) break; // VLAs. + FuncInfo.StaticAllocaMap.find(AI); + if (SI == FuncInfo.StaticAllocaMap.end()) break; // VLAs. int FI = SI->second; if (!DI->getDebugLoc().isUnknown()) - MF.getMMI().setVariableDbgInfo(DI->getVariable(), FI, DI->getDebugLoc()); + FuncInfo.MF->getMMI().setVariableDbgInfo(DI->getVariable(), + FI, DI->getDebugLoc()); } else // Building the map above is target independent. Generating DBG_VALUE // inline is target dependent; do this now. @@ -428,23 +492,28 @@ bool FastISel::SelectCall(const User *I) { if (!V) { // Currently the optimizer can produce this; insert an undef to // help debugging. Probably the optimizer should not do this. 
- BuildMI(MBB, DL, II).addReg(0U).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(0U).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { - BuildMI(MBB, DL, II).addImm(CI->getZExtValue()).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(CI->getZExtValue()).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) { - BuildMI(MBB, DL, II).addFPImm(CF).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addFPImm(CF).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (unsigned Reg = lookUpRegForValue(V)) { - BuildMI(MBB, DL, II).addReg(Reg, RegState::Debug).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Reg, RegState::Debug).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else { // We can't yet handle anything else here because it would require // generating code, thus altering codegen because of debug info. // Insert an undef so we can see what we dropped. - BuildMI(MBB, DL, II).addReg(0U).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(0U).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } return true; } @@ -453,14 +522,13 @@ bool FastISel::SelectCall(const User *I) { switch (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)) { default: break; case TargetLowering::Expand: { - assert(MBB->isLandingPad() && "Call to eh.exception not in landing pad!"); + assert(FuncInfo.MBB->isLandingPad() && + "Call to eh.exception not in landing pad!"); unsigned Reg = TLI.getExceptionAddressRegister(); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned ResultReg = createResultReg(RC); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - Reg, RC, RC, DL); - assert(InsertedCopy && "Can't copy address registers!"); - InsertedCopy = InsertedCopy; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); UpdateValueMap(I, ResultReg); return true; } @@ -472,25 +540,23 @@ bool FastISel::SelectCall(const User *I) { switch (TLI.getOperationAction(ISD::EHSELECTION, VT)) { default: break; case TargetLowering::Expand: { - if (MBB->isLandingPad()) - AddCatchInfo(*cast<CallInst>(I), &MF.getMMI(), MBB); + if (FuncInfo.MBB->isLandingPad()) + AddCatchInfo(*cast<CallInst>(I), &FuncInfo.MF->getMMI(), FuncInfo.MBB); else { #ifndef NDEBUG - CatchInfoLost.insert(cast<CallInst>(I)); + FuncInfo.CatchInfoLost.insert(cast<CallInst>(I)); #endif // FIXME: Mark exception selector register as live in. Hack for PR1508. 
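
// Illustrative sketch (editorial aside, not part of the patch): the four
// BuildMI calls above all emit one DBG_VALUE shape -- (value, offset,
// variable) -- differing only in how the value operand is encoded. The
// dispatch schematically, with an invented Operand type standing in for
// MachineOperand:
struct Operand {
  enum Kind { UndefReg, Imm, FPImm, Reg } K;
  unsigned long long Val;
};

Operand dbgValueOperand(bool IsConstInt, unsigned long long CInt,
                        bool IsConstFP, unsigned KnownReg) {
  if (IsConstInt) return {Operand::Imm, CInt};     // ConstantInt case
  if (IsConstFP)  return {Operand::FPImm, 0};      // ConstantFP case
  if (KnownReg)   return {Operand::Reg, KnownReg}; // value already in a vreg
  return {Operand::UndefReg, 0}; // can't materialize: visible undef marker
}
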
unsigned Reg = TLI.getExceptionSelectorRegister(); - if (Reg) MBB->addLiveIn(Reg); + if (Reg) FuncInfo.MBB->addLiveIn(Reg); } unsigned Reg = TLI.getExceptionSelectorRegister(); EVT SrcVT = TLI.getPointerTy(); const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT); unsigned ResultReg = createResultReg(RC); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, Reg, - RC, RC, DL); - assert(InsertedCopy && "Can't copy address registers!"); - InsertedCopy = InsertedCopy; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); bool ResultRegIsKill = hasTrivialKill(I); @@ -605,12 +671,12 @@ bool FastISel::SelectBitCast(const User *I) { if (SrcVT.getSimpleVT() == DstVT.getSimpleVT()) { TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT); TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT); - ResultReg = createResultReg(DstClass); - - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - Op0, DstClass, SrcClass, DL); - if (!InsertedCopy) - ResultReg = 0; + // Don't attempt a cross-class copy. It will likely fail. + if (SrcClass == DstClass) { + ResultReg = createResultReg(DstClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Op0); + } } // If the reg-reg copy failed, select a BIT_CONVERT opcode. @@ -655,14 +721,15 @@ FastISel::SelectInstruction(const Instruction *I) { /// unless it is the immediate (fall-through) successor, and update /// the CFG. void -FastISel::FastEmitBranch(MachineBasicBlock *MSucc) { - if (MBB->isLayoutSuccessor(MSucc)) { +FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DL) { + if (FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // The unconditional fall-through case, which needs no instructions. } else { // The unconditional branch case. - TII.InsertBranch(*MBB, MSucc, NULL, SmallVector<MachineOperand, 0>()); + TII.InsertBranch(*FuncInfo.MBB, MSucc, NULL, + SmallVector<MachineOperand, 0>(), DL); } - MBB->addSuccessor(MSucc); + FuncInfo.MBB->addSuccessor(MSucc); } /// SelectFNeg - Emit an FNeg operation. @@ -712,8 +779,39 @@ FastISel::SelectFNeg(const User *I) { } bool +FastISel::SelectLoad(const User *I) { + LoadInst *LI = const_cast<LoadInst *>(cast<LoadInst>(I)); + + // For a load from an alloca, make a limited effort to find the value + // already available in a register, avoiding redundant loads. 
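
// Illustrative sketch (editorial aside, not part of the patch): the spirit
// of the reuse scan implemented just below. Walk backwards from the load; a
// prior access to the same slot supplies the value, and any other store
// forces a bail-out since it might alias. The real FindAvailableLoadedValue
// performs proper alias checks rather than this pointer-equality toy.
#include <vector>

struct MemOp { bool IsStore; const void *Addr; unsigned ValueReg; };

unsigned findAvailableValue(const std::vector<MemOp> &Block, size_t LoadIdx,
                            const void *Addr) {
  for (size_t i = LoadIdx; i-- != 0;) {
    if (Block[i].Addr == Addr)
      return Block[i].ValueReg; // prior store to / load from the same slot
    if (Block[i].IsStore)
      return 0;                 // unknown store: conservatively reload
  }
  return 0;
}
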
+ if (!LI->isVolatile() && isa<AllocaInst>(LI->getPointerOperand())) { + BasicBlock::iterator ScanFrom = LI; + if (const Value *V = FindAvailableLoadedValue(LI->getPointerOperand(), + LI->getParent(), ScanFrom)) { + if (!V->use_empty() && + (!isa<Instruction>(V) || + cast<Instruction>(V)->getParent() == LI->getParent() || + (isa<AllocaInst>(V) && + FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(V)))) && + (!isa<Argument>(V) || + LI->getParent() == &LI->getParent()->getParent()->getEntryBlock())) { + unsigned ResultReg = getRegForValue(V); + if (ResultReg != 0) { + UpdateValueMap(I, ResultReg); + return true; + } + } + } + } + + return false; +} + +bool FastISel::SelectOperator(const User *I, unsigned Opcode) { switch (Opcode) { + case Instruction::Load: + return SelectLoad(I); case Instruction::Add: return SelectBinaryOp(I, ISD::ADD); case Instruction::FAdd: @@ -762,8 +860,8 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { if (BI->isUnconditional()) { const BasicBlock *LLVMSucc = BI->getSuccessor(0); - MachineBasicBlock *MSucc = MBBMap[LLVMSucc]; - FastEmitBranch(MSucc); + MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc]; + FastEmitBranch(MSucc, BI->getDebugLoc()); return true; } @@ -778,7 +876,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { case Instruction::Alloca: // FunctionLowering has the static-sized case covered. - if (StaticAllocaMap.count(cast<AllocaInst>(I))) + if (FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(I))) return true; // Dynamic-sized alloca is not handled yet. @@ -824,32 +922,16 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { } } -FastISel::FastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &vm, - DenseMap<const BasicBlock *, MachineBasicBlock *> &bm, - DenseMap<const AllocaInst *, int> &am, - std::vector<std::pair<MachineInstr*, unsigned> > &pn -#ifndef NDEBUG - , SmallSet<const Instruction *, 8> &cil -#endif - ) - : MBB(0), - ValueMap(vm), - MBBMap(bm), - StaticAllocaMap(am), - PHINodesToUpdate(pn), -#ifndef NDEBUG - CatchInfoLost(cil), -#endif - MF(mf), - MRI(MF.getRegInfo()), - MFI(*MF.getFrameInfo()), - MCP(*MF.getConstantPool()), - TM(MF.getTarget()), +FastISel::FastISel(FunctionLoweringInfo &funcInfo) + : FuncInfo(funcInfo), + MRI(FuncInfo.MF->getRegInfo()), + MFI(*FuncInfo.MF->getFrameInfo()), + MCP(*FuncInfo.MF->getConstantPool()), + TM(FuncInfo.MF->getTarget()), TD(*TM.getTargetData()), TII(*TM.getInstrInfo()), TLI(*TM.getTargetLowering()), - IsBottomUp(false) { + TRI(*TM.getRegisterInfo()) { } FastISel::~FastISel() {} @@ -978,7 +1060,7 @@ unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const TargetInstrDesc &II = TII.get(MachineInstOpcode); - BuildMI(MBB, DL, II, ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg); return ResultReg; } @@ -989,13 +1071,13 @@ unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg).addReg(Op0, Op0IsKill * RegState::Kill); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill); else { - BuildMI(MBB, DL, II).addReg(Op0, Op0IsKill * RegState::Kill); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill); + BuildMI(*FuncInfo.MBB, 
FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; @@ -1009,17 +1091,15 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1032,17 +1112,15 @@ unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addImm(Imm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addImm(Imm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1055,17 +1133,15 @@ unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addFPImm(FPImm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addFPImm(FPImm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1079,19 +1155,17 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill) .addImm(Imm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill) .addImm(Imm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1103,13 +1177,11 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg).addImm(Imm); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg).addImm(Imm); else { - BuildMI(MBB, DL, II).addImm(Imm); - 
bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
-                                         II.ImplicitDefs[0], RC, RC, DL);
-    if (!InsertedCopy)
-      ResultReg = 0;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
@@ -1117,24 +1189,12 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
 unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT,
                                               unsigned Op0, bool Op0IsKill,
                                               uint32_t Idx) {
-  const TargetRegisterClass* RC = MRI.getRegClass(Op0);
-  
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
-  const TargetInstrDesc &II = TII.get(TargetOpcode::EXTRACT_SUBREG);
-  
-  if (II.getNumDefs() >= 1)
-    BuildMI(MBB, DL, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Idx);
-  else {
-    BuildMI(MBB, DL, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Idx);
-    bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
-                                         II.ImplicitDefs[0], RC, RC, DL);
-    if (!InsertedCopy)
-      ResultReg = 0;
-  }
+  assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
+         "Cannot yet extract from physregs");
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+          DL, TII.get(TargetOpcode::COPY), ResultReg)
+    .addReg(Op0, getKillRegState(Op0IsKill), Idx);
   return ResultReg;
 }
 
@@ -1154,14 +1214,14 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
   const TerminatorInst *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
-  unsigned OrigNumPHINodesToUpdate = PHINodesToUpdate.size();
+  unsigned OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
 
   // Check successor nodes' PHI nodes that expect a constant to be available
   // from this block.
   for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
     const BasicBlock *SuccBB = TI->getSuccessor(succ);
     if (!isa<PHINode>(SuccBB->begin())) continue;
-    MachineBasicBlock *SuccMBB = MBBMap[SuccBB];
+    MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
 
     // If this terminator has multiple identical successors (common for
     // switches), only handle each succ once.
@@ -1182,7 +1242,7 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // by bailing out early, we may leave behind some dead instructions,
       // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
      // own moves. Second, this check is necessary because FastISel doesn't
-      // use CreateRegForValue to create registers, so it always creates
+      // use CreateRegs to create registers, so it always creates
       // exactly one register for each non-void instruction.
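
// Illustrative sketch (editorial aside, not part of the patch): the one
// shape every FastEmitInst_* overload above now shares. With an explicit
// def, the instruction defines ResultReg directly; otherwise the result
// lands in the first implicit-def physreg and a COPY moves it over.
struct ToyEmitter {
  unsigned NumExplicitDefs = 0;
  unsigned ImplicitDef0 = 0; // physreg implicitly defined by the instruction
  void emit(unsigned /*DefReg*/ = 0) { /* stand-in for BuildMI(...) */ }
  void copy(unsigned /*Dst*/, unsigned /*Src*/) { /* TargetOpcode::COPY */ }

  unsigned emitWithResult(unsigned ResultReg) {
    if (NumExplicitDefs >= 1) {
      emit(ResultReg);               // the II.getNumDefs() >= 1 case
    } else {
      emit();                        // result lands in ImplicitDef0
      copy(ResultReg, ImplicitDef0); // one COPY; no copyRegToReg failure path
    }
    return ResultReg;
  }
};
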
EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true); if (VT == MVT::Other || !TLI.isTypeLegal(VT)) { @@ -1190,7 +1250,7 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { if (VT == MVT::i1) VT = TLI.getTypeToTransformTo(LLVMBB->getContext(), VT); else { - PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); return false; } } @@ -1205,10 +1265,10 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { unsigned Reg = getRegForValue(PHIOp); if (Reg == 0) { - PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); return false; } - PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg)); + FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg)); DL = DebugLoc(); } } diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 65c36c1..928e1ec 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "function-lowering-info" -#include "FunctionLoweringInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" @@ -30,7 +30,6 @@ #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" @@ -47,9 +46,11 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) { if (isa<PHINode>(I)) return true; const BasicBlock *BB = I->getParent(); for (Value::const_use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (cast<Instruction>(*UI)->getParent() != BB || isa<PHINode>(*UI)) + UI != E; ++UI) { + const User *U = *UI; + if (cast<Instruction>(U)->getParent() != BB || isa<PHINode>(U)) return true; + } return false; } @@ -64,9 +65,11 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool EnableFastISel) { const BasicBlock *Entry = A->getParent()->begin(); for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end(); - UI != E; ++UI) - if (cast<Instruction>(*UI)->getParent() != Entry || isa<SwitchInst>(*UI)) + UI != E; ++UI) { + const User *U = *UI; + if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U)) return false; // Use not in entry block. + } return true; } @@ -74,12 +77,18 @@ FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli) : TLI(tli) { } -void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, - bool EnableFastISel) { +void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { Fn = &fn; MF = &mf; RegInfo = &MF->getRegInfo(); + // Check whether the function can return without sret-demotion. + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(Fn->getReturnType(), + Fn->getAttributes().getRetAttributes(), Outs, TLI); + CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(), + Outs, Fn->getContext()); + // Create a vreg for each argument register that is not dead and is used // outside of the entry block for the function. 
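
// Illustrative sketch (editorial aside, not part of the patch): the shared
// pattern of the two use-scanning predicates above, reduced to a toy IR. A
// PHI use counts as "outside" the defining block because it is really a use
// on the incoming edge.
#include <vector>

struct ToyUse { const void *UserBlock; bool UserIsPHI; };

bool usedOutsideOfDefiningBlock(const std::vector<ToyUse> &Uses,
                                const void *DefBlock) {
  for (const ToyUse &U : Uses)
    if (U.UserBlock != DefBlock || U.UserIsPHI)
      return true;
  return false;
}
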
for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end(); @@ -172,31 +181,33 @@ void FunctionLoweringInfo::clear() { #endif LiveOutRegInfo.clear(); ArgDbgValues.clear(); + RegFixups.clear(); } -unsigned FunctionLoweringInfo::MakeReg(EVT VT) { +/// CreateReg - Allocate a single virtual register for the given type. +unsigned FunctionLoweringInfo::CreateReg(EVT VT) { return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT)); } -/// CreateRegForValue - Allocate the appropriate number of virtual registers of +/// CreateRegs - Allocate the appropriate number of virtual registers of /// the correctly promoted or expanded types. Assign these registers /// consecutive vreg numbers and return the first assigned number. /// /// In the case that the given value has struct or array type, this function /// will assign registers for each member or element. /// -unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) { +unsigned FunctionLoweringInfo::CreateRegs(const Type *Ty) { SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, V->getType(), ValueVTs); + ComputeValueVTs(TLI, Ty, ValueVTs); unsigned FirstReg = 0; for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { EVT ValueVT = ValueVTs[Value]; - EVT RegisterVT = TLI.getRegisterType(V->getContext(), ValueVT); + EVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT); - unsigned NumRegs = TLI.getNumRegisters(V->getContext(), ValueVT); + unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT); for (unsigned i = 0; i != NumRegs; ++i) { - unsigned R = MakeReg(RegisterVT); + unsigned R = CreateReg(RegisterVT); if (!FirstReg) FirstReg = R; } } @@ -208,7 +219,7 @@ unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) { void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, MachineBasicBlock *MBB) { // Inform the MachineModuleInfo of the personality for this landing pad. - const ConstantExpr *CE = cast<ConstantExpr>(I.getOperand(2)); + const ConstantExpr *CE = cast<ConstantExpr>(I.getArgOperand(1)); assert(CE->getOpcode() == Instruction::BitCast && isa<Function>(CE->getOperand(0)) && "Personality should be a function"); @@ -217,18 +228,18 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, // Gather all the type infos for this landing pad and pass them along to // MachineModuleInfo. std::vector<const GlobalVariable *> TyInfo; - unsigned N = I.getNumOperands(); + unsigned N = I.getNumArgOperands(); - for (unsigned i = N - 1; i > 2; --i) { - if (const ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(i))) { + for (unsigned i = N - 1; i > 1; --i) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(i))) { unsigned FilterLength = CI->getZExtValue(); unsigned FirstCatch = i + FilterLength + !FilterLength; - assert (FirstCatch <= N && "Invalid filter length"); + assert(FirstCatch <= N && "Invalid filter length"); if (FirstCatch < N) { TyInfo.reserve(N - FirstCatch); for (unsigned j = FirstCatch; j < N; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addCatchTypeInfo(MBB, TyInfo); TyInfo.clear(); } @@ -240,7 +251,7 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, // Filter. 
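
// Illustrative sketch (editorial aside, not part of the patch): the vreg
// arithmetic inside CreateRegs above, with the type already flattened to a
// per-member count of legal registers. Registers come out consecutively and
// the first number identifies the group.
#include <vector>

unsigned createRegs(const std::vector<unsigned> &NumRegsPerValue,
                    unsigned &NextVReg) {
  unsigned FirstReg = 0;
  for (unsigned NumRegs : NumRegsPerValue) // one entry per member/element
    for (unsigned i = 0; i != NumRegs; ++i) {
      unsigned R = NextVReg++;
      if (!FirstReg) FirstReg = R;
    }
  return FirstReg;
}
// e.g. a {i64, i32} struct on a 32-bit target flattens to {2, 1}: three
// consecutive vregs are allocated and the first one is returned.
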
TyInfo.reserve(FilterLength - 1); for (unsigned j = i + 1; j < FirstCatch; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addFilterTypeInfo(MBB, TyInfo); TyInfo.clear(); } @@ -249,10 +260,10 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, } } - if (N > 3) { - TyInfo.reserve(N - 3); - for (unsigned j = 3; j < N; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + if (N > 2) { + TyInfo.reserve(N - 2); + for (unsigned j = 2; j < N; ++j) + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addCatchTypeInfo(MBB, TyInfo); } } diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h deleted file mode 100644 index 4067a5b..0000000 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h +++ /dev/null @@ -1,144 +0,0 @@ -//===-- FunctionLoweringInfo.h - Lower functions from LLVM IR to CodeGen --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This implements routines for translating functions from LLVM IR into -// Machine IR. -// -//===----------------------------------------------------------------------===// - -#ifndef FUNCTIONLOWERINGINFO_H -#define FUNCTIONLOWERINGINFO_H - -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#ifndef NDEBUG -#include "llvm/ADT/SmallSet.h" -#endif -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/Support/CallSite.h" -#include <vector> - -namespace llvm { - -class AllocaInst; -class BasicBlock; -class CallInst; -class Function; -class GlobalVariable; -class Instruction; -class MachineInstr; -class MachineBasicBlock; -class MachineFunction; -class MachineModuleInfo; -class MachineRegisterInfo; -class TargetLowering; -class Value; - -//===--------------------------------------------------------------------===// -/// FunctionLoweringInfo - This contains information that is global to a -/// function that is used when lowering a region of the function. -/// -class FunctionLoweringInfo { -public: - const TargetLowering &TLI; - const Function *Fn; - MachineFunction *MF; - MachineRegisterInfo *RegInfo; - - /// CanLowerReturn - true iff the function's return value can be lowered to - /// registers. - bool CanLowerReturn; - - /// DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg - /// allocated to hold a pointer to the hidden sret parameter. - unsigned DemoteRegister; - - /// MBBMap - A mapping from LLVM basic blocks to their machine code entry. - DenseMap<const BasicBlock*, MachineBasicBlock *> MBBMap; - - /// ValueMap - Since we emit code for the function a basic block at a time, - /// we must remember which virtual registers hold the values for - /// cross-basic-block values. - DenseMap<const Value*, unsigned> ValueMap; - - /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in - /// the entry block. This allows the allocas to be efficiently referenced - /// anywhere in the function. - DenseMap<const AllocaInst*, int> StaticAllocaMap; - - /// ArgDbgValues - A list of DBG_VALUE instructions created during isel for - /// function arguments that are inserted after scheduling is completed. 
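
// Illustrative sketch (editorial aside, not part of the patch): the indexing
// shift behind the getOperand -> getArgOperand changes here and in the
// intrinsic hunks earlier. Once the callee is no longer counted among a
// call's argument operands, every argument index drops by one; a toy model:
#include <cassert>
#include <vector>

struct ToyCall {
  const void *Callee;
  std::vector<int> Args;
  // Old-style view: the callee occupied slot 0, arguments started at 1.
  int getOperand(unsigned i) const { assert(i >= 1); return Args[i - 1]; }
  // New-style view: arguments only, starting at 0.
  int getArgOperand(unsigned i) const { return Args[i]; }
};
// Hence getOperand(2) in the old AddCatchInfo becomes getArgOperand(1), and
// the loop bounds shift from "> 2" and "N > 3" to "> 1" and "N > 2".
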
- SmallVector<MachineInstr*, 8> ArgDbgValues; - -#ifndef NDEBUG - SmallSet<const Instruction *, 8> CatchInfoLost; - SmallSet<const Instruction *, 8> CatchInfoFound; -#endif - - struct LiveOutInfo { - unsigned NumSignBits; - APInt KnownOne, KnownZero; - LiveOutInfo() : NumSignBits(0), KnownOne(1, 0), KnownZero(1, 0) {} - }; - - /// LiveOutRegInfo - Information about live out vregs, indexed by their - /// register number offset by 'FirstVirtualRegister'. - std::vector<LiveOutInfo> LiveOutRegInfo; - - /// PHINodesToUpdate - A list of phi instructions whose operand list will - /// be updated after processing the current basic block. - /// TODO: This isn't per-function state, it's per-basic-block state. But - /// there's no other convenient place for it to live right now. - std::vector<std::pair<MachineInstr*, unsigned> > PHINodesToUpdate; - - explicit FunctionLoweringInfo(const TargetLowering &TLI); - - /// set - Initialize this FunctionLoweringInfo with the given Function - /// and its associated MachineFunction. - /// - void set(const Function &Fn, MachineFunction &MF, bool EnableFastISel); - - /// clear - Clear out all the function-specific state. This returns this - /// FunctionLoweringInfo to an empty state, ready to be used for a - /// different function. - void clear(); - - unsigned MakeReg(EVT VT); - - /// isExportedInst - Return true if the specified value is an instruction - /// exported from its block. - bool isExportedInst(const Value *V) { - return ValueMap.count(V); - } - - unsigned CreateRegForValue(const Value *V); - - unsigned InitializeRegForValue(const Value *V) { - unsigned &R = ValueMap[V]; - assert(R == 0 && "Already initialized this value register!"); - return R = CreateRegForValue(V); - } -}; - -/// AddCatchInfo - Extract the personality and type infos from an eh.selector -/// call, and add them to the specified machine basic block. -void AddCatchInfo(const CallInst &I, - MachineModuleInfo *MMI, MachineBasicBlock *MBB); - -/// CopyCatchInfo - Copy catch information from DestBB to SrcBB. -void CopyCatchInfo(const BasicBlock *SrcBB, const BasicBlock *DestBB, - MachineModuleInfo *MMI, FunctionLoweringInfo &FLI); - -} // end namespace llvm - -#endif diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 16eb8a7..61c2a90 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -123,7 +123,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, EVT VT = Node->getValueType(ResNo); const TargetRegisterClass *SrcRC = 0, *DstRC = 0; - SrcRC = TRI->getPhysicalRegisterRegClass(SrcReg, VT); + SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT); // Figure out the register class to create for the destreg. if (VRBase) { @@ -142,11 +142,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, } else { // Create the reg, emit the copy. 
VRBase = MRI->createVirtualRegister(DstRC); - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, VRBase, SrcReg, - DstRC, SrcRC, Node->getDebugLoc()); - - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), + VRBase).addReg(SrcReg); } SDValue Op(Node, ResNo); @@ -246,7 +243,7 @@ unsigned InstrEmitter::getVR(SDValue Op, const TargetRegisterClass *RC = TLI->getRegClassFor(Op.getValueType()); VReg = MRI->createVirtualRegister(RC); } - BuildMI(MBB, Op.getDebugLoc(), + BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), VReg); return VReg; } @@ -288,10 +285,8 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, "Don't have operand info for this instruction!"); if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) { unsigned NewVReg = MRI->createVirtualRegister(DstRC); - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, NewVReg, VReg, - DstRC, SrcRC, Op.getNode()->getDebugLoc()); - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); VReg = NewVReg; } } @@ -428,12 +423,9 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, } if (Opc == TargetOpcode::EXTRACT_SUBREG) { + // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); - // Create the extract_subreg machine instruction. - MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), - TII->get(TargetOpcode::EXTRACT_SUBREG)); - // Figure out the register class to create for the destreg. unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); const TargetRegisterClass *TRC = MRI->getRegClass(VReg); @@ -450,11 +442,16 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, VRBase = MRI->createVirtualRegister(SRC); } - // Add def, source, and subreg index - MI->addOperand(MachineOperand::CreateReg(VRBase, true)); + // Create the extract_subreg machine instruction. + MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), + TII->get(TargetOpcode::COPY), VRBase); + + // Add source, and subreg index AddOperand(MI, Node->getOperand(0), 0, 0, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); - MI->addOperand(MachineOperand::CreateImm(SubIdx)); + assert(TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()) && + "Cannot yet extract from physregs"); + MI->getOperand(1).setSubReg(SubIdx); MBB->insert(InsertPos, MI); } else if (Opc == TargetOpcode::INSERT_SUBREG || Opc == TargetOpcode::SUBREG_TO_REG) { @@ -511,18 +508,13 @@ void InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap) { unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); - const TargetRegisterClass *SrcRC = MRI->getRegClass(VReg); + // Create the new VReg in the destination class and emit a copy. unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx); - - // Create the new VReg in the destination class and emit a copy. 
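
// Illustrative sketch (editorial aside, not part of the patch): the operand
// encoding behind "EXTRACT_SUBREG is lowered as %dst = COPY %src:sub" above.
// The subregister index rides on the source operand of a generic COPY
// instead of being a separate immediate on a dedicated opcode.
struct RegOperand {
  unsigned Reg;    // virtual register number
  unsigned SubIdx; // 0 = whole register, otherwise a subregister index
};

struct CopyInstr { RegOperand Def, Src; };

CopyInstr makeExtractSubreg(unsigned Dst, unsigned Src, unsigned SubIdx) {
  // "%Dst = COPY %Src:SubIdx" -- the form the patch now emits.
  return CopyInstr{ {Dst, 0}, {Src, SubIdx} };
}
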
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
-  bool Emitted = TII->copyRegToReg(*MBB, InsertPos, NewVReg, VReg,
-                                   DstRC, SrcRC, Node->getDebugLoc());
-  assert(Emitted &&
-         "Unable to issue a copy instruction for a COPY_TO_REGCLASS node!\n");
-  (void) Emitted;
+  BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+          NewVReg).addReg(VReg);
 
   SDValue Op(Node, 0);
   bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
@@ -604,9 +596,10 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
   } else if (SD->getKind() == SDDbgValue::CONST) {
     const Value *V = SD->getConst();
     if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-      // FIXME: SDDbgValues aren't updated with legalization, so it's possible
-      // to have i128 values in them at this point. As a crude workaround, just
-      // drop the debug info if this happens.
+      // FIXME: SDDbgValue constants aren't updated with legalization, so it's
+      // possible to have i128 constants in them at this point. Dwarf writer
+      // does not handle i128 constants at the moment so, as a crude workaround,
+      // just drop the debug info if this happens.
       if (!CI->getValue().isSignedIntN(64))
         MIB.addReg(0U);
       else
@@ -676,6 +669,33 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   // Create the new machine instruction.
   MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II);
+
+  // The MachineInstr constructor adds implicit-def operands. Scan through
+  // these to determine which are dead.
+  if (MI->getNumOperands() != 0 &&
+      Node->getValueType(Node->getNumValues()-1) == MVT::Flag) {
+    // First, collect all used registers.
+    SmallVector<unsigned, 8> UsedRegs;
+    for (SDNode *F = Node->getFlaggedUser(); F; F = F->getFlaggedUser())
+      if (F->getOpcode() == ISD::CopyFromReg)
+        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+      else {
+        // Collect declared implicit uses.
+        const TargetInstrDesc &TID = TII->get(F->getMachineOpcode());
+        UsedRegs.append(TID.getImplicitUses(),
+                        TID.getImplicitUses() + TID.getNumImplicitUses());
+        // In addition to declared implicit uses, we must also check for
+        // direct RegisterSDNode operands.
+        for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+          if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+            unsigned Reg = R->getReg();
+            if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg))
+              UsedRegs.push_back(Reg);
+          }
+      }
+    // Then mark unused registers as dead.
+    MI->setPhysRegsDeadExcept(UsedRegs, *TRI);
+  }
 
   // Add result register values for things that are defined by this
   // instruction.
@@ -696,16 +716,24 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
     MI->setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
                    cast<MachineSDNode>(Node)->memoperands_end());
 
+  // Insert the instruction into position in the block. This needs to
+  // happen before any custom inserter hook is called so that the
+  // hook knows where in the block to insert the replacement code.
+  MBB->insert(InsertPos, MI);
+
   if (II.usesCustomInsertionHook()) {
     // Insert this instruction into the basic block using a target
     // specific inserter which may return a new basic block.
-    MBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
-    InsertPos = MBB->end();
+    bool AtEnd = InsertPos == MBB->end();
+    MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
+    if (NewMBB != MBB) {
+      if (AtEnd)
+        InsertPos = NewMBB->end();
+      MBB = NewMBB;
+    }
     return;
   }
 
-  MBB->insert(InsertPos, MI);
-
   // Additional results must be a physical register def.
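
// Illustrative sketch (editorial aside, not part of the patch): the dead-def
// marking added to EmitMachineNode above is set subtraction -- implicit
// physreg defs that no flagged user reads become dead. Reduced to plain
// integers (the real setPhysRegsDeadExcept also accounts for register
// aliases via TRI):
#include <algorithm>
#include <vector>

std::vector<unsigned> deadImplicitDefs(const std::vector<unsigned> &ImplicitDefs,
                                       std::vector<unsigned> UsedRegs) {
  std::sort(UsedRegs.begin(), UsedRegs.end());
  std::vector<unsigned> Dead;
  for (unsigned Reg : ImplicitDefs)
    if (!std::binary_search(UsedRegs.begin(), UsedRegs.end(), Reg))
      Dead.push_back(Reg); // no reader anywhere in the flag chain
  return Dead;
}
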
if (HasPhysRegOuts) { for (unsigned i = II.getNumDefs(); i < NumResults; ++i) { @@ -761,24 +789,9 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg(); if (SrcReg == DestReg) // Coalesced away the copy? Ignore. break; - - const TargetRegisterClass *SrcTRC = 0, *DstTRC = 0; - // Get the register classes of the src/dst. - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - SrcTRC = MRI->getRegClass(SrcReg); - else - SrcTRC = TRI->getPhysicalRegisterRegClass(SrcReg,SrcVal.getValueType()); - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - DstTRC = MRI->getRegClass(DestReg); - else - DstTRC = TRI->getPhysicalRegisterRegClass(DestReg, - Node->getOperand(1).getValueType()); - - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, DestReg, SrcReg, - DstTRC, SrcTRC, Node->getDebugLoc()); - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), + DestReg).addReg(SrcReg); break; } case ISD::CopyFromReg: { @@ -807,6 +820,12 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol(); MI->addOperand(MachineOperand::CreateES(AsmStr)); + // Add the isAlignStack bit. + int64_t isAlignStack = + cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_IsAlignStack))-> + getZExtValue(); + MI->addOperand(MachineOperand::CreateImm(isAlignStack)); + // Add all of the operand registers to the instruction. for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { unsigned Flags = @@ -821,14 +840,22 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, case InlineAsm::Kind_RegDef: for (; NumVals; --NumVals, ++i) { unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg(); - MI->addOperand(MachineOperand::CreateReg(Reg, true)); + // FIXME: Add dead flags for physical and virtual registers defined. + // For now, mark physical register defs as implicit to help fast + // regalloc. This makes inline asm look a lot like calls. + MI->addOperand(MachineOperand::CreateReg(Reg, true, + /*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg))); } break; case InlineAsm::Kind_RegDefEarlyClobber: for (; NumVals; --NumVals, ++i) { unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg(); - MI->addOperand(MachineOperand::CreateReg(Reg, true, false, false, - false, false, true)); + MI->addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/ true, + /*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg), + /*isKill=*/ false, + /*isDead=*/ false, + /*isUndef=*/false, + /*isEarlyClobber=*/ true)); } break; case InlineAsm::Kind_RegUse: // Use of register. diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 62a37a5..7a47da4 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -31,6 +31,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" @@ -133,7 +134,7 @@ private: /// whose vector element type is narrower than the original shuffle type. /// e.g. 
<v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
   SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
-                                     SDValue N1, SDValue N2, 
+                                     SDValue N1, SDValue N2,
                                      SmallVectorImpl<int> &Mask) const;
 
   bool LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
@@ -143,6 +144,8 @@ private:
                     DebugLoc dl);
 
   SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+  std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+                                                 SDNode *Node, bool isSigned);
   SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
                           RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
                           RTLIB::Libcall Call_PPCF128);
@@ -172,6 +175,8 @@ private:
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
 
+  std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
   void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 };
@@ -181,8 +186,8 @@ private:
 /// performs the same shuffle in terms of order of result bytes, but on a type
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
-SDValue 
-SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, 
+SDValue
+SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
                                                  SDValue N1, SDValue N2,
                                              SmallVectorImpl<int> &Mask) const {
   unsigned NumMaskElts = VT.getVectorNumElements();
@@ -193,12 +198,12 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
 
   if (NumEltsGrowth == 1)
     return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]);
-  
+
   SmallVector<int, 8> NewMask;
   for (unsigned i = 0; i != NumMaskElts; ++i) {
     int Idx = Mask[i];
     for (unsigned j = 0; j != NumEltsGrowth; ++j) {
-      if (Idx < 0) 
+      if (Idx < 0)
        NewMask.push_back(-1);
       else
         NewMask.push_back(Idx * NumEltsGrowth + j);
@@ -320,7 +325,8 @@ bool SelectionDAGLegalize::LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest,
   bool OperandsLeadToDest = false;
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
     OperandsLeadToDest |=     // If an operand leads to Dest, so do we.
-      LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest, NodesLeadingTo);
+      LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest,
+                                   NodesLeadingTo);
 
   if (OperandsLeadToDest) {
     NodesLeadingTo.insert(N);
@@ -357,7 +363,7 @@ static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP,
     EVT SVT = VT;
     while (SVT != MVT::f32) {
       SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
-      if (CFP->isValueValidForType(SVT, CFP->getValueAPF()) &&
+      if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
           // Only do this if the target has a native EXTLOAD instruction from
           // smaller type.
           TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
@@ -372,8 +378,8 @@ static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP,
   SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy());
   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   if (Extend)
-    return DAG.getExtLoad(ISD::EXTLOAD, dl,
-                          OrigVT, DAG.getEntryNode(),
+    return DAG.getExtLoad(ISD::EXTLOAD, OrigVT, dl,
+                          DAG.getEntryNode(),
                           CPIdx, PseudoSourceValue::getConstantPool(),
                           0, VT, false, false, Alignment);
   return DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx,
@@ -450,7 +456,7 @@ SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                  8 * (StoredBytes - Offset));
 
         // Load from the stack slot.
- SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr, + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Store, StackPtr, NULL, 0, MemVT, false, false, 0); Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, @@ -552,7 +558,7 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, // The last copy may be partial. Do an extending load. EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), 8 * (LoadedBytes - Offset)); - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr, + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + Offset, MemVT, LD->isVolatile(), LD->isNonTemporal(), @@ -568,7 +574,7 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, Stores.size()); // Finally, perform the original load only redirected to the stack slot. - Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase, + Load = DAG.getExtLoad(LD->getExtensionType(), VT, dl, TF, StackBase, NULL, 0, LoadedVT, false, false, 0); // Callers expect a MERGE_VALUES node. @@ -597,23 +603,23 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, // Load the value in two parts SDValue Lo, Hi; if (TLI.isLittleEndian()) { - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset, NewLoadedVT, LD->isVolatile(), LD->isNonTemporal(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, TLI.getPointerTy())); - Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(), + Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize)); + LD->isNonTemporal(), MinAlign(Alignment,IncrementSize)); } else { - Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(), + Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset, NewLoadedVT, LD->isVolatile(), LD->isNonTemporal(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, TLI.getPointerTy())); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize)); + LD->isNonTemporal(), MinAlign(Alignment,IncrementSize)); } // aggregate the two parts @@ -773,7 +779,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { "Unexpected illegal type!"); for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) - assert((isTypeLegal(Node->getOperand(i).getValueType()) || + assert((isTypeLegal(Node->getOperand(i).getValueType()) || Node->getOperand(i).getOpcode() == ISD::TargetConstant) && "Unexpected illegal type!"); @@ -853,6 +859,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { case ISD::MERGE_VALUES: case ISD::EH_RETURN: case ISD::FRAME_TO_ARGS_OFFSET: + case ISD::EH_SJLJ_SETJMP: + case ISD::EH_SJLJ_LONGJMP: // These operations lie about being legal: when they claim to be legal, // they should actually be expanded. 
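
// Illustrative sketch (editorial aside, not part of the patch): the two-part
// load scheme above in plain C++ terms. Both halves are loaded with natural
// alignment and reassembled, so no single misaligned access is issued.
// Little-endian variant, 32-bit value at a 2-byte-aligned address:
#include <cstdint>
#include <cstring>

uint32_t loadUnaligned32LE(const uint16_t *P) {
  uint16_t Lo, Hi;
  std::memcpy(&Lo, P, sizeof Lo);     // low half, like the ZEXTLOAD
  std::memcpy(&Hi, P + 1, sizeof Hi); // high half at Ptr + IncrementSize
  return (uint32_t)Lo | ((uint32_t)Hi << 16);
}
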
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -925,8 +933,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { break; } - Result = DAG.UpdateNodeOperands(Result.getValue(0), Ops.data(), - Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), Ops.data(), + Ops.size()), 0); switch (Action) { case TargetLowering::Legal: for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) @@ -1000,11 +1008,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { NodesLeadingTo); } - // Now that we legalized all of the inputs (which may have inserted - // libcalls) create the new CALLSEQ_START node. + // Now that we have legalized all of the inputs (which may have inserted + // libcalls), create the new CALLSEQ_START node. Tmp1 = LegalizeOp(Node->getOperand(0)); // Legalize the chain. - // Merge in the last call, to ensure that this call start after the last + // Merge in the last call to ensure that this call starts after the last // call ended. if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) { Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, @@ -1016,7 +1024,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (Tmp1 != Node->getOperand(0)) { SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), &Ops[0], + Ops.size()), Result.getResNo()); } // Remember that the CALLSEQ_START is legalized. @@ -1058,7 +1067,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (Tmp1 != Node->getOperand(0)) { SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + &Ops[0], Ops.size()), + Result.getResNo()); } } else { Tmp2 = LegalizeOp(Node->getOperand(Node->getNumOperands()-1)); @@ -1067,7 +1078,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { SmallVector<SDValue, 8> Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; Ops.back() = Tmp2; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + &Ops[0], Ops.size()), + Result.getResNo()); } } assert(IsLegalizingCall && "Call sequence imbalance between start/end?"); @@ -1087,7 +1100,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) { EVT VT = Node->getValueType(0); - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp2, LD->getOffset()), + Result.getResNo()); Tmp3 = Result.getValue(0); Tmp4 = Result.getValue(1); @@ -1100,7 +1115,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); if (LD->getAlignment() < ABIAlignment){ - Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), + Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), DAG, TLI); Tmp3 = Result.getOperand(0); Tmp4 = Result.getOperand(1); @@ -1166,7 +1181,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { ISD::LoadExtType NewExtType = ExtType == ISD::ZEXTLOAD ? 
ISD::ZEXTLOAD : ISD::EXTLOAD; - Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), + Result = DAG.getExtLoad(NewExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset, NVT, isVolatile, isNonTemporal, Alignment); @@ -1202,8 +1217,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (TLI.isLittleEndian()) { // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) // Load the bottom RoundWidth bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, - Node->getValueType(0), Tmp1, Tmp2, + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, Node->getValueType(0), dl, + Tmp1, Tmp2, LD->getSrcValue(), SVOffset, RoundVT, isVolatile, isNonTemporal, Alignment); @@ -1211,13 +1226,13 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { IncrementSize = RoundWidth / 8; Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, + Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset + IncrementSize, ExtraVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); - // Build a factor node to remember that this load is independent of the - // other one. + // Build a factor node to remember that this load is independent of + // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); @@ -1231,7 +1246,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // Big endian - avoid unaligned loads. // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 // Load the top RoundWidth bits. - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, + Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset, RoundVT, isVolatile, isNonTemporal, Alignment); @@ -1239,14 +1254,14 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { IncrementSize = RoundWidth / 8; Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, - Node->getValueType(0), Tmp1, Tmp2, + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, + Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset + IncrementSize, ExtraVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); - // Build a factor node to remember that this load is independent of the - // other one. + // Build a factor node to remember that this load is independent of + // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); @@ -1267,7 +1282,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { isCustom = true; // FALLTHROUGH case TargetLowering::Legal: - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp2, LD->getOffset()), + Result.getResNo()); Tmp1 = Result.getValue(0); Tmp2 = Result.getValue(1); @@ -1281,10 +1298,12 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // If this is an unaligned load and the target doesn't support it, // expand it. 
if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + const Type *Ty = + LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); if (LD->getAlignment() < ABIAlignment){ - Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), + Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()), DAG, TLI); Tmp1 = Result.getOperand(0); Tmp2 = Result.getOperand(1); @@ -1310,10 +1329,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { Tmp2 = LegalizeOp(Load.getValue(1)); break; } - assert(ExtType != ISD::EXTLOAD &&"EXTLOAD should always be supported!"); + assert(ExtType != ISD::EXTLOAD && + "EXTLOAD should always be supported!"); // Turn the unsupported load into an EXTLOAD followed by an explicit // zero/sign extend inreg. - Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), + Result = DAG.getExtLoad(ISD::EXTLOAD, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), LD->getSrcValueOffset(), SrcVT, LD->isVolatile(), LD->isNonTemporal(), @@ -1355,8 +1375,10 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { { Tmp3 = LegalizeOp(ST->getValue()); - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2, - ST->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp3, Tmp2, + ST->getOffset()), + Result.getResNo()); EVT VT = Tmp3.getValueType(); switch (TLI.getOperationAction(ISD::STORE, VT)) { @@ -1366,7 +1388,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // expand it. if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); if (ST->getAlignment() < ABIAlignment) Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()), DAG, TLI); @@ -1459,8 +1481,10 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { } else { if (Tmp1 != ST->getChain() || Tmp3 != ST->getValue() || Tmp2 != ST->getBasePtr()) - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2, - ST->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp3, Tmp2, + ST->getOffset()), + Result.getResNo()); switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { default: assert(0 && "This action is not supported yet!"); @@ -1469,7 +1493,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // expand it. 
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); if (ST->getAlignment() < ABIAlignment) Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()), DAG, TLI); @@ -1531,7 +1555,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, NULL, 0, false, false, 0); else - return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, + return DAG.getExtLoad(ISD::EXTLOAD, Op.getValueType(), dl, Ch, StackPtr, NULL, 0, Vec.getValueType().getVectorElementType(), false, false, 0); } @@ -1568,7 +1592,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { Node->getOperand(i), Idx, SV, Offset, EltVT, false, false, 0)); } else - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, + Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, SV, Offset, false, false, 0)); } @@ -1763,7 +1787,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, DestAlign); assert(SlotSize < DestSize && "Unknown extension!"); - return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, SV, 0, SlotVT, + return DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl, Store, FIPtr, SV, 0, SlotVT, false, false, DestAlign); } @@ -1926,6 +1950,44 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, return CallInfo.first; } +// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to +// ExpandLibCall except that the first operand is the in-chain. +std::pair<SDValue, SDValue> +SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, + SDNode *Node, + bool isSigned) { + assert(!IsLegalizingCall && "Cannot overlap legalization of calls!"); + SDValue InChain = Node->getOperand(0); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) { + EVT ArgVT = Node->getOperand(i).getValueType(); + const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Node->getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + // Splice the libcall in wherever FindInputOutputChains tells us to. + const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); + std::pair<SDValue, SDValue> CallInfo = + TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + 0, TLI.getLibcallCallingConv(LC), false, + /*isReturnValueUsed=*/true, + Callee, Args, DAG, Node->getDebugLoc()); + + // Legalize the call sequence, starting with the chain. This will advance + // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that + // was added by LowerCallTo (guaranteeing proper serialization of calls). 
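
// Illustrative sketch (editorial aside, not part of the patch; toy stand-ins
// throughout): the argument marshalling inside ExpandChainLibCall above.
// Operand 0 is peeled off as the in-chain; every remaining operand becomes
// one libcall argument whose extension flags derive from isSigned.
#include <vector>

struct ToyOperand { int Id; };
struct ToyArg { ToyOperand Node; bool SExt, ZExt; };

std::vector<ToyArg> buildChainCallArgs(const std::vector<ToyOperand> &Ops,
                                       bool IsSigned) {
  std::vector<ToyArg> Args;
  for (size_t i = 1; i < Ops.size(); ++i) // Ops[0] is the in-chain
    Args.push_back(ToyArg{Ops[i], IsSigned, !IsSigned});
  return Args;
}
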
+ LegalizeOp(CallInfo.second); + return CallInfo; +} + SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, @@ -2048,7 +2110,8 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84); SDValue LoFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, LoOr); SDValue HiFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, TwoP84PlusTwoP52); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, + TwoP84PlusTwoP52); return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub); } @@ -2058,11 +2121,11 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) { EVT SHVT = TLI.getShiftAmountTy(); - SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, + SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0xfffffffffffff800), MVT::i64)); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, DAG.getConstant(UINT64_C(0x800), MVT::i64)); - SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, + SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0x7ff), MVT::i64)); SDValue Ne = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64), And2, DAG.getConstant(UINT64_C(0), MVT::i64), ISD::SETNE); @@ -2122,7 +2185,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, false, false, Alignment); else { FudgeInReg = - LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, + LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, MVT::f32, false, false, Alignment)); @@ -2332,6 +2395,92 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } } +std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) { + unsigned Opc = Node->getOpcode(); + MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT(); + RTLIB::Libcall LC; + + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic intrinsic Expand!"); + break; + case ISD::ATOMIC_SWAP: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break; + case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break; + case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break; + case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break; + } + break; + case ISD::ATOMIC_CMP_SWAP: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break; + case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break; + case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break; + case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break; + } + break; + case ISD::ATOMIC_LOAD_ADD: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_ADD_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break; + } + break; + case ISD::ATOMIC_LOAD_SUB: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_SUB_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break; + case MVT::i32: LC = 
RTLIB::SYNC_FETCH_AND_SUB_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break; + } + break; + case ISD::ATOMIC_LOAD_AND: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_AND_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break; + } + break; + case ISD::ATOMIC_LOAD_OR: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_OR_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break; + } + break; + case ISD::ATOMIC_LOAD_XOR: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_XOR_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break; + } + break; + case ISD::ATOMIC_LOAD_NAND: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_NAND_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break; + } + break; + } + + return ExpandChainLibCall(LC, Node, false); +} + void SelectionDAGLegalize::ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results) { DebugLoc dl = Node->getDebugLoc(); @@ -2357,10 +2506,48 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, case ISD::EH_RETURN: case ISD::EH_LABEL: case ISD::PREFETCH: - case ISD::MEMBARRIER: case ISD::VAEND: + case ISD::EH_SJLJ_LONGJMP: + Results.push_back(Node->getOperand(0)); + break; + case ISD::EH_SJLJ_SETJMP: + Results.push_back(DAG.getConstant(0, MVT::i32)); Results.push_back(Node->getOperand(0)); break; + case ISD::MEMBARRIER: { + // If the target didn't lower this, lower it to '__sync_synchronize()' call + TargetLowering::ArgListTy Args; + std::pair<SDValue, SDValue> CallResult = + TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, CallingConv::C, false, + /*isReturnValueUsed=*/true, + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy()), + Args, DAG, dl); + Results.push_back(CallResult.second); + break; + } + // By default, atomic intrinsics are marked Legal and lowered. Targets + // which don't support them directly, however, may want libcalls, in which + // case they mark them Expand, and we get here. + // FIXME: Unimplemented for now. Add libcalls. 
+ case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_CMP_SWAP: { + std::pair<SDValue, SDValue> Tmp = ExpandAtomic(Node); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + break; + } case ISD::DYNAMIC_STACKALLOC: ExpandDYNAMIC_STACKALLOC(Node, Results); break; @@ -2465,15 +2652,31 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, EVT VT = Node->getValueType(0); Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); - SDValue VAList = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0, - false, false, 0); + unsigned Align = Node->getConstantOperandVal(3); + + SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0, + false, false, 0); + SDValue VAList = VAListLoad; + + if (Align > TLI.getMinStackArgumentAlignment()) { + assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + + VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList, + DAG.getConstant(Align - 1, + TLI.getPointerTy())); + + VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList, + DAG.getConstant(-Align, + TLI.getPointerTy())); + } + // Increment the pointer, VAList, to the next vaarg Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList, DAG.getConstant(TLI.getTargetData()-> getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())), TLI.getPointerTy())); // Store the incremented VAList to the legalized pointer - Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Tmp2, V, 0, + Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2, V, 0, false, false, 0); // Load the actual argument out of the pointer VAList Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0, @@ -2496,7 +2699,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, case ISD::EXTRACT_VECTOR_ELT: if (Node->getOperand(0).getValueType().getVectorNumElements() == 1) // This must be an access of the only element. Return it. 
- Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0), + Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0), Node->getOperand(0)); else Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0)); @@ -2948,13 +3151,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, const TargetData &TD = *TLI.getTargetData(); unsigned EntrySize = DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD); - + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(EntrySize, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr, + SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, PTy, dl, Chain, Addr, PseudoSourceValue::getJumpTable(), 0, MemVT, false, false, 0); Addr = LD; diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index e3eb949..650ee5a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -453,8 +453,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { SDValue NewL; if (L->getExtensionType() == ISD::NON_EXTLOAD) { - NewL = DAG.getLoad(L->getAddressingMode(), dl, L->getExtensionType(), - NVT, L->getChain(), L->getBasePtr(), L->getOffset(), + NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), + NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getSrcValue(), L->getSrcValueOffset(), NVT, L->isVolatile(), L->isNonTemporal(), L->getAlignment()); // Legalized the chain result - switch anything that used the old chain to @@ -464,8 +464,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { } // Do a non-extending load followed by FP_EXTEND. - NewL = DAG.getLoad(L->getAddressingMode(), dl, ISD::NON_EXTLOAD, - L->getMemoryVT(), L->getChain(), + NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, + L->getMemoryVT(), dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getSrcValue(), L->getSrcValueOffset(), L->getMemoryVT(), L->isVolatile(), @@ -504,7 +504,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { DebugLoc dl = N->getDebugLoc(); SDValue NewVAARG; - NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2)); + NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), + N->getConstantOperandVal(3)); // Legalized the chain result - switch anything that used the old chain to // use the new one. @@ -698,9 +699,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) { @@ -739,9 +741,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { @@ -757,8 +760,9 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) { @@ -1106,7 +1110,7 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo, assert(NVT.isByteSized() && "Expanded type not byte sized!"); assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?"); - Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr, + Hi = DAG.getExtLoad(LD->getExtensionType(), NVT, dl, Chain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); @@ -1294,9 +1298,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) { @@ -1375,9 +1379,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { @@ -1393,8 +1397,8 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8b382bc..b94ea9a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -369,7 +369,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType(); DebugLoc dl = N->getDebugLoc(); - SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), + SDValue Res = DAG.getExtLoad(ExtType, NVT, dl, N->getChain(), N->getBasePtr(), N->getSrcValue(), N->getSrcValueOffset(), N->getMemoryVT(), N->isVolatile(), N->isNonTemporal(), N->getAlignment()); @@ -572,7 +572,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { SmallVector<SDValue, 8> Parts(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { - Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2)); + Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2), + N->getConstantOperandVal(3)); Chain = Parts[i].getValue(1); } @@ -725,8 +726,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) { // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always // legal types. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - N->getOperand(1), LHS, RHS, N->getOperand(4)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + N->getOperand(1), LHS, RHS, N->getOperand(4)), + 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { @@ -737,8 +739,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT); // The chain (Op#0) and basic block destination (Op#2) are always legal types. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Cond, - N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond, + N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) { @@ -773,7 +775,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); - return DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0], NumElts); + return SDValue(DAG.UpdateNodeOperands(N, &NewOps[0], NumElts), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { @@ -798,17 +800,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, assert(N->getOperand(1).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), GetPromotedInteger(N->getOperand(1)), - N->getOperand(2)); + N->getOperand(2)), + 0); } assert(OpNo == 2 && "Different operand and result vector types?"); // Promote the index. SDValue Idx = ZExtPromotedInteger(N->getOperand(2)); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - N->getOperand(1), Idx); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + N->getOperand(1), Idx), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) { @@ -819,15 +822,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) { SDValue Flag = GetPromotedInteger(N->getOperand(i)); NewOps[i] = DAG.getZeroExtendInReg(Flag, dl, MVT::i1); } - return DAG.UpdateNodeOperands(SDValue (N, 0), NewOps, - array_lengthof(NewOps)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps, array_lengthof(NewOps)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote // the operand in place. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), - GetPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + GetPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { @@ -837,8 +839,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { EVT SVT = TLI.getSetCCResultType(N->getOperand(1).getValueType()); SDValue Cond = PromoteTargetBoolean(N->getOperand(0), SVT); - return DAG.UpdateNodeOperands(SDValue(N, 0), Cond, - N->getOperand(1), N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, Cond, + N->getOperand(1), N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { @@ -849,8 +851,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get()); // The CC (#4) and the possible return values (#2 and #3) have legal types. - return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2), - N->getOperand(3), N->getOperand(4)); + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2), + N->getOperand(3), N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { @@ -861,12 +863,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get()); // The CC (#2) is always legal. - return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - ZExtPromotedInteger(N->getOperand(1))); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + ZExtPromotedInteger(N->getOperand(1))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { @@ -878,8 +880,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), - SExtPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + SExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ @@ -905,8 +907,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), - ZExtPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + ZExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { @@ -990,6 +992,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; + + case ISD::SADDO: + case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break; + case ISD::UADDO: + case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. 
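[Editorial aside] The SADDO/SSUBO and UADDO/USUBO cases added to ExpandIntegerResult above dispatch to the new ExpandIntRes_SADDSUBO and ExpandIntRes_UADDSUBO helpers, whose bodies appear in the hunks below. The signed variant derives the overflow bit purely from operand and result signs. Here is a minimal standalone sketch of that sign rule; it is not part of the patch, and saddOverflows is a hypothetical helper rather than an LLVM API:

#include <cassert>
#include <cstdint>

// Mirrors the SETCC/AND predicate emitted by ExpandIntRes_SADDSUBO below:
// for addition, overflow occurs iff the operands have the same sign and the
// (wrapped) sum has a different sign.
static bool saddOverflows(int32_t A, int32_t B) {
  // Perform the add with wraparound semantics, as the expanded node does.
  // (Converting 0x80000000u back to int32_t assumes two's complement.)
  int32_t Sum = (int32_t)((uint32_t)A + (uint32_t)B);
  bool ASign = A >= 0, BSign = B >= 0, SumSign = Sum >= 0;
  return ASign == BSign && ASign != SumSign;
}

int main() {
  assert(saddOverflows(INT32_MAX, 1));  // 0x7fffffff + 1 wraps to negative
  assert(!saddOverflows(-1, 1));        // differing signs can never overflow
  return 0;
}

The expanded DAG encodes exactly this predicate over the split value, using getSetCC for the three sign tests and an AND node to combine them.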
@@ -1526,7 +1533,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, if (N->getMemoryVT().bitsLE(NVT)) { EVT MemVT = N->getMemoryVT(); - Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset, + Lo = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset, MemVT, isVolatile, isNonTemporal, Alignment); // Remember the chain. @@ -1559,7 +1566,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), + Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset+IncrementSize, NEVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); @@ -1577,7 +1584,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, unsigned ExcessBits = (EBytes - IncrementSize)*8; // Load both the high bits and maybe some of the low bits. - Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset, + Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset, EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), isVolatile, isNonTemporal, Alignment); @@ -1586,7 +1593,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); // Load the rest of the low bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset+IncrementSize, EVT::getIntegerVT(*DAG.getContext(), ExcessBits), isVolatile, isNonTemporal, @@ -1716,6 +1723,48 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SplitInteger(MakeLibCall(LC, VT, Ops, 2, true/*irrelevant*/, dl), Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + DebugLoc dl = Node->getDebugLoc(); + + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? + ISD::ADD : ISD::SUB, dl, LHS.getValueType(), + LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Compute the overflow. + // + // LHSSign -> LHS >= 0 + // RHSSign -> RHS >= 0 + // SumSign -> Sum >= 0 + // + // Add: + // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) + // Sub: + // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) + // + EVT OType = Node->getValueType(1); + SDValue Zero = DAG.getConstant(0, LHS.getValueType()); + + SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); + SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); + SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, + Node->getOpcode() == ISD::SADDO ? + ISD::SETEQ : ISD::SETNE); + + SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); + SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); + + SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); + + // Use the calculated overflow everywhere. 
+ ReplaceValueWith(SDValue(Node, 1), Cmp); +} + void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -1912,6 +1961,29 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } +void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + DebugLoc dl = N->getDebugLoc(); + + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? + ISD::ADD : ISD::SUB, dl, LHS.getValueType(), + LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Calculate the overflow: addition overflows iff a + b < a, and subtraction + // overflows iff a - b > a. + SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, + N->getOpcode () == ISD::UADDO ? + ISD::SETULT : ISD::SETUGT); + + // Use the calculated overflow everywhere. + ReplaceValueWith(SDValue(N, 1), Ofl); +} + void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -2154,9 +2226,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { @@ -2172,9 +2244,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { @@ -2190,8 +2262,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { @@ -2200,7 +2272,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // upper half of the shift amount is zero. Just use the lower half. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(1), Lo, Hi); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Lo); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { @@ -2209,7 +2281,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { // constant to valid type. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(0), Lo, Hi); - return DAG.UpdateNodeOperands(SDValue(N, 0), Lo); + return SDValue(DAG.UpdateNodeOperands(N, Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { @@ -2384,7 +2456,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? 
- SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, DstVT, dl, DAG.getEntryNode(), FudgePtr, NULL, 0, MVT::f32, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 17f131b..6e56c98 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -485,15 +485,14 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) { NewOps.push_back(Op); } else if (Op != OrigOp) { // This is the first operand to change - add all operands so far. - NewOps.insert(NewOps.end(), N->op_begin(), N->op_begin() + i); + NewOps.append(N->op_begin(), N->op_begin() + i); NewOps.push_back(Op); } } // Some operands changed - update the node. if (!NewOps.empty()) { - SDNode *M = DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0], - NewOps.size()).getNode(); + SDNode *M = DAG.UpdateNodeOperands(N, &NewOps[0], NewOps.size()); if (M != N) { // The node morphed into a different node. Normally for this to happen // the original node would have to be marked NewNode. However this can @@ -684,40 +683,45 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { // can potentially cause recursive merging. SmallSetVector<SDNode*, 16> NodesToAnalyze; NodeUpdateListener NUL(*this, NodesToAnalyze); - DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); - - // The old node may still be present in a map like ExpandedIntegers or - // PromotedIntegers. Inform maps about the replacement. - ReplacedValues[From] = To; - - // Process the list of nodes that need to be reanalyzed. - while (!NodesToAnalyze.empty()) { - SDNode *N = NodesToAnalyze.back(); - NodesToAnalyze.pop_back(); - if (N->getNodeId() != DAGTypeLegalizer::NewNode) - // The node was analyzed while reanalyzing an earlier node - it is safe to - // skip. Note that this is not a morphing node - otherwise it would still - // be marked NewNode. - continue; + do { + DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); + + // The old node may still be present in a map like ExpandedIntegers or + // PromotedIntegers. Inform maps about the replacement. + ReplacedValues[From] = To; + + // Process the list of nodes that need to be reanalyzed. + while (!NodesToAnalyze.empty()) { + SDNode *N = NodesToAnalyze.back(); + NodesToAnalyze.pop_back(); + if (N->getNodeId() != DAGTypeLegalizer::NewNode) + // The node was analyzed while reanalyzing an earlier node - it is safe + // to skip. Note that this is not a morphing node - otherwise it would + // still be marked NewNode. + continue; - // Analyze the node's operands and recalculate the node ID. - SDNode *M = AnalyzeNewNode(N); - if (M != N) { - // The node morphed into a different node. Make everyone use the new node - // instead. - assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!"); - assert(N->getNumValues() == M->getNumValues() && - "Node morphing changed the number of results!"); - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { - SDValue OldVal(N, i); - SDValue NewVal(M, i); - if (M->getNodeId() == Processed) - RemapValue(NewVal); - DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); + // Analyze the node's operands and recalculate the node ID. + SDNode *M = AnalyzeNewNode(N); + if (M != N) { + // The node morphed into a different node. Make everyone use the new + // node instead. 
+        assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!"); +        assert(N->getNumValues() == M->getNumValues() && +               "Node morphing changed the number of results!"); +        for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { +          SDValue OldVal(N, i); +          SDValue NewVal(M, i); +          if (M->getNodeId() == Processed) +            RemapValue(NewVal); +          DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); +        } +        // The original node continues to exist in the DAG, marked NewNode. } - } + // When recursively updating nodes with new nodes, it is possible to have + // new uses of From due to CSE. If this happens, replace the new uses of + // From with To. + } while (!From.use_empty()); } void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index c665963..bd86694 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -345,6 +345,9 @@ private: void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandShiftByConstant(SDNode *N, unsigned Amt, SDValue &Lo, SDValue &Hi); bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -620,6 +623,7 @@ private: SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_Convert(SDNode *N); + SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 88e1e62..9c2b1d9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -238,13 +238,15 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, } void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Chain = N->getOperand(0); SDValue Ptr = N->getOperand(1); DebugLoc dl = N->getDebugLoc(); + const unsigned Align = N->getConstantOperandVal(3); - Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2)); - Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2)); + Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align); + Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0); // Handle endianness of the load. 
if (TLI.isBigEndian()) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 0e2bd02..621c087 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -116,7 +116,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Ops.push_back(LegalizeOp(Node->getOperand(i))); SDValue Result = - DAG.UpdateNodeOperands(Op.getValue(0), Ops.data(), Ops.size()); + SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops.data(), Ops.size()), 0); bool HasVectorValue = false; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7efeea1..93aeff5 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -165,9 +165,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); - SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getDebugLoc(), + SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getExtensionType(), N->getValueType(0).getVectorElementType(), + N->getDebugLoc(), N->getChain(), N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()), N->getSrcValue(), N->getSrcValueOffset(), @@ -448,6 +449,11 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -755,14 +761,14 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, EVT LoMemVT, HiMemVT; GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT); - Lo = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, LoVT, Ch, Ptr, Offset, + Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, SV, SVOffset, LoMemVT, isVolatile, isNonTemporal, Alignment); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); SVOffset += IncrementSize; - Hi = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, HiVT, Ch, Ptr, Offset, + Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, SV, SVOffset, HiMemVT, isVolatile, isNonTemporal, Alignment); // Build a factor node to remember that this load is independent of the @@ -1082,10 +1088,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { uint64_t LoElts = Lo.getValueType().getVectorNumElements(); if (IdxVal < LoElts) - return DAG.UpdateNodeOperands(SDValue(N, 0), Lo, Idx); - return DAG.UpdateNodeOperands(SDValue(N, 0), Hi, + return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0); + return SDValue(DAG.UpdateNodeOperands(N, Hi, DAG.getConstant(IdxVal - LoElts, - Idx.getValueType())); + Idx.getValueType())), + 0); } // Store the vector to the stack. @@ -1099,7 +1106,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { // Load back the required element. 
StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); - return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, + return DAG.getExtLoad(ISD::EXTLOAD, N->getValueType(0), dl, Store, StackPtr, SV, 0, EltVT, false, false, 0); } @@ -1199,7 +1206,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FDIV: case ISD::FMUL: case ISD::FPOW: - case ISD::FPOWI: case ISD::FREM: case ISD::FSUB: case ISD::MUL: @@ -1215,6 +1221,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Binary(N); break; + case ISD::FPOWI: + Res = WidenVecRes_POWI(N); + break; + case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -1241,6 +1251,11 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FNEG: case ISD::FSIN: case ISD::FSQRT: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: Res = WidenVecRes_Unary(N); break; } @@ -1258,7 +1273,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); - while (!TLI.isTypeLegal(VT) && NumElts != 1) { + while (!TLI.isTypeSynthesizable(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } @@ -1273,13 +1288,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); } else { // Since the operation can trap, apply operation on the original vector. + EVT MaxVT = VT; SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); SmallVector<SDValue, 16> ConcatOps(CurNumElts); unsigned ConcatEnd = 0; // Current ConcatOps index. - unsigned Idx = 0; // Current Idx into input vectors. + int Idx = 0; // Current Idx into input vectors. + + // NumElts := greatest synthesizable vector size (at most WidenVT) + // while (orig. vector has unhandled elements) { + // take munches of size NumElts from the beginning and add to ConcatOps + // NumElts := next smaller supported vector size or 1 + // } while (CurNumElts != 0) { while (CurNumElts >= NumElts) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, @@ -1290,26 +1312,21 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { Idx += NumElts; CurNumElts -= NumElts; } - EVT PrevVecVT = VT; do { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); - } while (!TLI.isTypeLegal(VT) && NumElts != 1); + } while (!TLI.isTypeSynthesizable(VT) && NumElts != 1); if (NumElts == 1) { - // Since we are using concat vector, build a vector from the scalar ops. 
-        SDValue VecOp = DAG.getUNDEF(PrevVecVT); for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1, DAG.getIntPtrConstant(Idx)); SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getIntPtrConstant(Idx)); - VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, PrevVecVT, VecOp, - DAG.getNode(Opcode, dl, WidenEltVT, EOp1, EOp2), - DAG.getIntPtrConstant(i)); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, + EOp1, EOp2); } CurNumElts = 0; - ConcatOps[ConcatEnd++] = VecOp; } } @@ -1320,23 +1337,65 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return ConcatOps[0]; } - // Rebuild vector to one with the widen type - Idx = ConcatEnd - 1; - while (Idx != 0) { + // while (Some element of ConcatOps is not of type MaxVT) { + // From the end of ConcatOps, collect elements of the same type and put + // them into an op of the next larger supported type + // } + while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { + Idx = ConcatEnd - 1; VT = ConcatOps[Idx--].getValueType(); - while (Idx != 0 && ConcatOps[Idx].getValueType() == VT) - --Idx; - if (Idx != 0) { - VT = ConcatOps[Idx].getValueType(); - ConcatOps[Idx+1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - &ConcatOps[Idx+1], ConcatEnd - Idx - 1); + while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) + Idx--; + + int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1; + EVT NextVT; + do { + NextSize *= 2; + NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); + } while (!TLI.isTypeSynthesizable(NextVT)); + + if (!VT.isVector()) { + // Scalar type, create an INSERT_VECTOR_ELT of type NextVT + SDValue VecOp = DAG.getUNDEF(NextVT); + unsigned NumToInsert = ConcatEnd - Idx - 1; + for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { + VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, + ConcatOps[OpIdx], DAG.getIntPtrConstant(i)); + } + ConcatOps[Idx+1] = VecOp; ConcatEnd = Idx + 2; + } + else { + // Vector type, create a CONCAT_VECTORS of type NextVT + SDValue undefVec = DAG.getUNDEF(VT); + unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); + SmallVector<SDValue, 16> SubConcatOps(OpsToConcat); + unsigned RealVals = ConcatEnd - Idx - 1; + unsigned SubConcatEnd = 0; + unsigned SubConcatIdx = Idx + 1; + while (SubConcatEnd < RealVals) + SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; + while (SubConcatEnd < OpsToConcat) + SubConcatOps[SubConcatEnd++] = undefVec; + ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, + NextVT, &SubConcatOps[0], + OpsToConcat); + ConcatEnd = SubConcatIdx + 1; } } + + // Check to see if we have a single operation with the widen type. 
+ if (ConcatEnd == 1) { + VT = ConcatOps[0].getValueType(); + if (VT == WidenVT) + return ConcatOps[0]; + } - unsigned NumOps = WidenVT.getVectorNumElements()/VT.getVectorNumElements(); + // add undefs of size MaxVT until ConcatOps grows to length of WidenVT + unsigned NumOps = + WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); if (NumOps != ConcatEnd ) { - SDValue UndefVal = DAG.getUNDEF(VT); + SDValue UndefVal = DAG.getUNDEF(MaxVT); for (unsigned j = ConcatEnd; j < NumOps; ++j) ConcatOps[j] = UndefVal; } @@ -1366,7 +1425,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(Opcode, dl, WidenVT, InOp); } - if (TLI.isTypeLegal(InWidenVT)) { + if (TLI.isTypeSynthesizable(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1410,6 +1469,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts); } +SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp = GetWidenedVector(N->getOperand(0)); + SDValue ShOp = N->getOperand(1); + return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp); +} + SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); @@ -1501,7 +1567,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BIT_CONVERT(SDNode *N) { NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts); } - if (TLI.isTypeLegal(NewInVT)) { + if (TLI.isTypeSynthesizable(NewInVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1642,7 +1708,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SatOp, CvtCode); } - if (TLI.isTypeLegal(InWidenVT)) { + if (TLI.isTypeSynthesizable(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1968,7 +2034,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_BIT_CONVERT(SDNode *N) { if (InWidenSize % Size == 0 && !VT.isVector()) { unsigned NewNumElts = InWidenSize / Size; EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts); - if (TLI.isTypeLegal(NewVT)) { + if (TLI.isTypeSynthesizable(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, InOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp, DAG.getIntPtrConstant(0)); @@ -2066,7 +2132,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, unsigned MemVTWidth = MemVT.getSizeInBits(); if (MemVT.getSizeInBits() <= WidenEltWidth) break; - if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 && + if (TLI.isTypeSynthesizable(MemVT) && (WidenWidth % MemVTWidth) == 0 && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { RetVT = MemVT; @@ -2080,7 +2146,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned 
MemVTWidth = MemVT.getSizeInBits(); - if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && + if (TLI.isTypeSynthesizable(MemVT) && WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { @@ -2286,14 +2352,14 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain, unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector<SDValue, 16> Ops(WidenNumElts); unsigned Increment = LdEltVT.getSizeInBits() / 8; - Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset, + Ops[0] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, BasePtr, SV, SVOffset, LdEltVT, isVolatile, isNonTemporal, Align); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(Offset)); - Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV, + Ops[i] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, NewBasePtr, SV, SVOffset + Offset, LdEltVT, isVolatile, isNonTemporal, Align); LdChain.push_back(Ops[i].getValue(1)); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index ad8630a..3b86c32 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -535,7 +535,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { SUnit *LRDef = LiveRegDefs[Reg]; EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, VT); + TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is null, then it must be possible copy diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 820ba66..3ef521c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -320,7 +320,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { CapturePred(&*I); - if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]) { + if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]){ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); assert(LiveRegDefs[I->getReg()] == I->getSUnit() && "Physical register dependency violated?"); @@ -795,7 +795,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { SUnit *LRDef = LiveRegDefs[Reg]; EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, VT); + TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is null, then it must be possible copy @@ -1116,7 +1116,7 @@ namespace { SUnit *pop() { if (empty()) return NULL; std::vector<SUnit *>::iterator Best = Queue.begin(); - for (std::vector<SUnit *>::iterator I = next(Queue.begin()), + for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; @@ -1275,6 +1275,17 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{ return left->getHeight() > right->getHeight(); } else if 
(RStall) return false; + + // If either node is scheduling for latency, sort them by height and latency + // first. + if (left->SchedulingPref == Sched::Latency || + right->SchedulingPref == Sched::Latency) { + if (left->getHeight() != right->getHeight()) + return left->getHeight() > right->getHeight(); + if (left->Latency != right->Latency) + return left->Latency > right->Latency; + } + return BURRSort(left, right, SPQ); } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3185c88..06cf053 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -59,7 +59,11 @@ SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) { SUnits.back().OrigNode = &SUnits.back(); SUnit *SU = &SUnits.back(); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); - SU->SchedulingPref = TLI.getSchedulingPreference(N); + if (N->isMachineOpcode() && + N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) + SU->SchedulingPref = Sched::None; + else + SU->SchedulingPref = TLI.getSchedulingPreference(N); return SU; } @@ -97,7 +101,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) { PhysReg = Reg; const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, Def->getValueType(ResNo)); + TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo)); Cost = RC->getCopyCost(); } } @@ -106,17 +110,42 @@ static void AddFlags(SDNode *N, SDValue Flag, bool AddFlag, SelectionDAG *DAG) { SmallVector<EVT, 4> VTs; - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) - VTs.push_back(N->getValueType(i)); + SDNode *FlagDestNode = Flag.getNode(); + + // Don't add a flag from a node to itself. + if (FlagDestNode == N) return; + + // Don't add a flag to something which already has a flag. + if (N->getValueType(N->getNumValues() - 1) == MVT::Flag) return; + + for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + VTs.push_back(N->getValueType(I)); + + if (AddFlag) VTs.push_back(MVT::Flag); + SmallVector<SDValue, 4> Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - Ops.push_back(N->getOperand(i)); - if (Flag.getNode()) + for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) + Ops.push_back(N->getOperand(I)); + + if (FlagDestNode) Ops.push_back(Flag); + SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size()); + MachineSDNode::mmo_iterator Begin = 0, End = 0; + MachineSDNode *MN = dyn_cast<MachineSDNode>(N); + + // Store memory references. + if (MN) { + Begin = MN->memoperands_begin(); + End = MN->memoperands_end(); + } + DAG->MorphNodeTo(N, N->getOpcode(), VTList, &Ops[0], Ops.size()); + + // Reset the memory references. + if (MN) + MN->setMemRefs(Begin, End); } /// ClusterNeighboringLoads - Force nearby loads together by "flagging" them. @@ -124,98 +153,98 @@ static void AddFlags(SDNode *N, SDValue Flag, bool AddFlag, /// offsets are not far apart (target specific), it adds MVT::Flag inputs and /// outputs to ensure they are scheduled together and in order. This /// optimization may benefit some targets by improving cache locality. 
-void ScheduleDAGSDNodes::ClusterNeighboringLoads() { +void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { + SDNode *Chain = 0; + unsigned NumOps = Node->getNumOperands(); + if (Node->getOperand(NumOps-1).getValueType() == MVT::Other) + Chain = Node->getOperand(NumOps-1).getNode(); + if (!Chain) + return; + + // Look for other loads of the same chain. Find loads that are loading from + // the same base pointer and different offsets. SmallPtrSet<SDNode*, 16> Visited; SmallVector<int64_t, 4> Offsets; DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode. - for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(), - E = DAG->allnodes_end(); NI != E; ++NI) { - SDNode *Node = &*NI; - if (!Node || !Node->isMachineOpcode()) + bool Cluster = false; + SDNode *Base = Node; + for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); + I != E; ++I) { + SDNode *User = *I; + if (User == Node || !Visited.insert(User)) continue; - - unsigned Opc = Node->getMachineOpcode(); - const TargetInstrDesc &TID = TII->get(Opc); - if (!TID.mayLoad()) + int64_t Offset1, Offset2; + if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) || + Offset1 == Offset2) + // FIXME: Should be ok if the addresses are identical. But earlier + // optimizations really should have eliminated one of the loads. continue; + if (O2SMap.insert(std::make_pair(Offset1, Base)).second) + Offsets.push_back(Offset1); + O2SMap.insert(std::make_pair(Offset2, User)); + Offsets.push_back(Offset2); + if (Offset2 < Offset1) + Base = User; + Cluster = true; + } - SDNode *Chain = 0; - unsigned NumOps = Node->getNumOperands(); - if (Node->getOperand(NumOps-1).getValueType() == MVT::Other) - Chain = Node->getOperand(NumOps-1).getNode(); - if (!Chain) - continue; + if (!Cluster) + return; - // Look for other loads of the same chain. Find loads that are loading from - // the same base pointer and different offsets. - Visited.clear(); - Offsets.clear(); - O2SMap.clear(); - bool Cluster = false; - SDNode *Base = Node; - int64_t BaseOffset; - for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); - I != E; ++I) { - SDNode *User = *I; - if (User == Node || !Visited.insert(User)) - continue; - int64_t Offset1, Offset2; - if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) || - Offset1 == Offset2) - // FIXME: Should be ok if the addresses are identical. But earlier - // optimizations really should have eliminated one of the loads. - continue; - if (O2SMap.insert(std::make_pair(Offset1, Base)).second) - Offsets.push_back(Offset1); - O2SMap.insert(std::make_pair(Offset2, User)); - Offsets.push_back(Offset2); - if (Offset2 < Offset1) { - Base = User; - BaseOffset = Offset2; - } else { - BaseOffset = Offset1; - } - Cluster = true; - } + // Sort them in increasing order. + std::sort(Offsets.begin(), Offsets.end()); + + // Check if the loads are close enough. + SmallVector<SDNode*, 4> Loads; + unsigned NumLoads = 0; + int64_t BaseOff = Offsets[0]; + SDNode *BaseLoad = O2SMap[BaseOff]; + Loads.push_back(BaseLoad); + for (unsigned i = 1, e = Offsets.size(); i != e; ++i) { + int64_t Offset = Offsets[i]; + SDNode *Load = O2SMap[Offset]; + if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads)) + break; // Stop right here. Ignore loads that are further away. + Loads.push_back(Load); + ++NumLoads; + } - if (!Cluster) - continue; + if (NumLoads == 0) + return; - // Sort them in increasing order. 
-    std::sort(Offsets.begin(), Offsets.end()); - - // Check if the loads are close enough. - SmallVector<SDNode*, 4> Loads; - unsigned NumLoads = 0; - int64_t BaseOff = Offsets[0]; - SDNode *BaseLoad = O2SMap[BaseOff]; - Loads.push_back(BaseLoad); - for (unsigned i = 1, e = Offsets.size(); i != e; ++i) { - int64_t Offset = Offsets[i]; - SDNode *Load = O2SMap[Offset]; - if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset, - NumLoads)) - break; // Stop right here. Ignore loads that are further away. - Loads.push_back(Load); - ++NumLoads; - } + // Cluster loads by adding MVT::Flag outputs and inputs. This also + // ensures they are scheduled in order of increasing addresses. + SDNode *Lead = Loads[0]; + AddFlags(Lead, SDValue(0, 0), true, DAG); + + SDValue InFlag = SDValue(Lead, Lead->getNumValues() - 1); + for (unsigned I = 1, E = Loads.size(); I != E; ++I) { + bool OutFlag = I < E - 1; + SDNode *Load = Loads[I]; + + AddFlags(Load, InFlag, OutFlag, DAG); + + if (OutFlag) + InFlag = SDValue(Load, Load->getNumValues() - 1); + + ++LoadsClustered; + } +} - if (NumLoads == 0) +/// ClusterNodes - Cluster certain nodes which should be scheduled together. +/// +void ScheduleDAGSDNodes::ClusterNodes() { + for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(), + E = DAG->allnodes_end(); NI != E; ++NI) { + SDNode *Node = &*NI; + if (!Node || !Node->isMachineOpcode()) continue; - // Cluster loads by adding MVT::Flag outputs and inputs. This also - // ensures they are scheduled in order of increasing addresses. - SDNode *Lead = Loads[0]; - AddFlags(Lead, SDValue(0,0), true, DAG); - SDValue InFlag = SDValue(Lead, Lead->getNumValues()-1); - for (unsigned i = 1, e = Loads.size(); i != e; ++i) { - bool OutFlag = i < e-1; - SDNode *Load = Loads[i]; - AddFlags(Load, InFlag, OutFlag, DAG); - if (OutFlag) - InFlag = SDValue(Load, Load->getNumValues()-1); - ++LoadsClustered; - } + unsigned Opc = Node->getMachineOpcode(); + const TargetInstrDesc &TID = TII->get(Opc); + if (TID.mayLoad()) + // Cluster loads from "near" addresses into combined SUnits. + ClusterNeighboringLoads(Node); } } @@ -364,8 +393,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() { if (Cost >= 0) PhysReg = 0; - const SDep& dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, - OpSU->Latency, PhysReg); + // If this is a ctrl dep, latency is 1. + unsigned OpLatency = isChain ? 1 : OpSU->Latency; + const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, + OpLatency, PhysReg); if (!isChain && !UnitLatencies) { ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep)); ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep)); @@ -382,8 +413,8 @@ void ScheduleDAGSDNodes::AddSchedEdges() { /// excludes nodes that aren't interesting to scheduling, and represents /// flagged together nodes with a single SUnit. void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) { - // Cluster loads from "near" addresses into combined SUnits. - ClusterNeighboringLoads(); + // Cluster certain nodes which should be scheduled together. + ClusterNodes(); // Populate the SUnits array. BuildSchedUnits(); // Compute all the scheduling dependencies between nodes. 
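[Editorial aside] Stripped of SDNode bookkeeping, the rewritten ClusterNeighboringLoads above reduces to: gather the offsets of loads hanging off one chain, sort them, then grow a cluster upward from the smallest offset while the target says the next load is still near. A runnable sketch of that shape, with loads reduced to plain offsets and an assumed fixed Cutoff standing in for the shouldScheduleLoadsNear target hook:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Offsets of loads found on one chain, in discovery order.
  std::vector<int64_t> Offsets = {24, 0, 8, 4096};

  // Sort them in increasing order, as the pass does.
  std::sort(Offsets.begin(), Offsets.end());

  // Grow the cluster from the lowest offset while the next load is "near".
  const int64_t Cutoff = 64; // hypothetical stand-in for the target query
  std::vector<int64_t> Cluster{Offsets[0]};
  for (size_t i = 1; i < Offsets.size(); ++i) {
    if (Offsets[i] - Offsets[0] > Cutoff)
      break; // stop right here; ignore loads that are further away
    Cluster.push_back(Offsets[i]);
  }

  for (int64_t O : Cluster)
    std::printf("clustered load at base+%lld\n", (long long)O);
  return 0;
}

In the real pass the surviving loads are then chained with MVT::Flag values via AddFlags, which is what forces the scheduler to emit them adjacently and in address order.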
@@ -427,15 +458,18 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, return; unsigned DefIdx = Use->getOperand(OpIdx).getResNo(); - if (Def->isMachineOpcode() && Use->isMachineOpcode()) { + if (Def->isMachineOpcode()) { const TargetInstrDesc &II = TII->get(Def->getMachineOpcode()); if (DefIdx >= II.getNumDefs()) return; int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx); if (DefCycle < 0) return; - const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); - int UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx); + int UseCycle = 1; + if (Use->isMachineOpcode()) { + const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); + UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx); + } if (UseCycle >= 0) { int Latency = DefCycle - UseCycle + 1; if (Latency >= 0) @@ -473,7 +507,7 @@ namespace { } // ProcessSourceNode - Process nodes with source order numbers. These are added -// to a vector which EmitSchedule use to determine how to insert dbg_value +// to a vector which EmitSchedule uses to determine how to insert dbg_value // instructions in the right order. static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, @@ -485,13 +519,13 @@ static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, return; MachineBasicBlock *BB = Emitter.getBlock(); - if (BB->empty() || BB->back().isPHI()) { + if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) { // Did not insert any instruction. Orders.push_back(std::make_pair(Order, (MachineInstr*)0)); return; } - Orders.push_back(std::make_pair(Order, &BB->back())); + Orders.push_back(std::make_pair(Order, prior(Emitter.getInsertPos()))); if (!N->getHasDebugValue()) return; // Opportunistically insert immediate dbg_value uses, i.e. those with source @@ -530,7 +564,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { for (; PDI != PDE; ++PDI) { MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap); if (DbgMI) - BB->insert(BB->end(), DbgMI); + BB->insert(InsertPos, DbgMI); } } @@ -574,9 +608,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { // Insert all the dbg_values which have not already been inserted in source // order sequence. if (HasDbg) { - MachineBasicBlock::iterator BBBegin = BB->empty() ? BB->end() : BB->begin(); - while (BBBegin != BB->end() && BBBegin->isPHI()) - ++BBBegin; + MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI(); // Sort the source order instructions and use the order to insert debug // values. @@ -586,14 +618,12 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { SDDbgInfo::DbgIterator DE = DAG->DbgEnd(); // Now emit the rest according to source order. unsigned LastOrder = 0; - MachineInstr *LastMI = 0; for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) { unsigned Order = Orders[i].first; MachineInstr *MI = Orders[i].second; // Insert all SDDbgValue's whose order(s) are before "Order". if (!MI) continue; - MachineBasicBlock *MIBB = MI->getParent(); #ifndef NDEBUG unsigned LastDIOrder = 0; #endif @@ -612,13 +642,14 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { // Insert to start of the BB (after PHIs). BB->insert(BBBegin, DbgMI); else { + // Insert at the instruction, which may be in a different + // block, if the block was split by a custom inserter. 
MachineBasicBlock::iterator Pos = MI; - MIBB->insert(llvm::next(Pos), DbgMI); + MI->getParent()->insert(llvm::next(Pos), DbgMI); } } } LastOrder = Order; - LastMI = MI; } // Add trailing DbgValue's before the terminator. FIXME: May want to add // some of them before one or more conditional branches? diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index e8714ba..842fc8c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -108,7 +108,10 @@ namespace llvm { private: /// ClusterNeighboringLoads - Cluster loads from "near" addresses into /// combined SUnits. - void ClusterNeighboringLoads(); + void ClusterNeighboringLoads(SDNode *Node); + /// ClusterNodes - Cluster certain nodes which should be scheduled together. + /// + void ClusterNodes(); /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph. void BuildSchedUnits(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 38bf68b..e83a034 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -790,9 +790,8 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { } // EntryNode could meaningfully have debug info if we can find it... -SelectionDAG::SelectionDAG(const TargetMachine &tm, FunctionLoweringInfo &fli) +SelectionDAG::SelectionDAG(const TargetMachine &tm) : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()), - FLI(fli), EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), Ordering(0) { AllNodes.push_back(&EntryNode); @@ -808,7 +807,6 @@ void SelectionDAG::init(MachineFunction &mf) { SelectionDAG::~SelectionDAG() { allnodes_clear(); delete Ordering; - DbgInfo->clear(); delete DbgInfo; } @@ -835,11 +833,8 @@ void SelectionDAG::clear() { EntryNode.UseList = 0; AllNodes.push_back(&EntryNode); Root = getEntryNode(); - delete Ordering; - Ordering = new SDNodeOrdering(); + Ordering->clear(); DbgInfo->clear(); - delete DbgInfo; - DbgInfo = new SDDbgInfo(); } SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) { @@ -980,7 +975,7 @@ SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) { } } -SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, +SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL, EVT VT, int64_t Offset, bool isTargetGA, unsigned char TargetFlags) { @@ -1015,7 +1010,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, GV, VT, + SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL, GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); AllNodes.push_back(N); @@ -2291,7 +2286,6 @@ bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const { SDValue SelectionDAG::getShuffleScalarElt(const ShuffleVectorSDNode *N, unsigned i) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); if (N->getMaskElt(i) < 0) return getUNDEF(VT.getVectorElementType()); unsigned Index = N->getMaskElt(i); @@ -2475,9 +2469,18 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); - if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND) + + if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || + 
          OpOpcode == ISD::ANY_EXTEND)
       // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
       return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+
+    // (ext (trunc x)) -> x
+    if (OpOpcode == ISD::TRUNCATE) {
+      SDValue OpOp = Operand.getNode()->getOperand(0);
+      if (OpOp.getValueType() == VT)
+        return OpOp;
+    }
     break;
   case ISD::TRUNCATE:
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
@@ -2622,7 +2625,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     if (N1.getOpcode() == ISD::BUILD_VECTOR &&
         N2.getOpcode() == ISD::BUILD_VECTOR) {
       SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end());
-      Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
       return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
     }
     break;
@@ -3011,7 +3014,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
                               SDValue N1, SDValue N2, SDValue N3) {
   // Perform various simplifications.
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
-  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
   switch (Opcode) {
   case ISD::CONCAT_VECTORS:
     // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
@@ -3020,8 +3022,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
         N2.getOpcode() == ISD::BUILD_VECTOR &&
         N3.getOpcode() == ISD::BUILD_VECTOR) {
       SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end());
-      Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end());
-      Elts.insert(Elts.end(), N3.getNode()->op_begin(), N3.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
+      Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end());
       return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
     }
     break;
@@ -3041,14 +3043,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     if (N2 == N3) return N2;   // select C, X, X -> X
     break;
-  case ISD::BRCOND:
-    if (N2C) {
-      if (N2C->getZExtValue()) // Unconditional branch
-        return getNode(ISD::BR, DL, MVT::Other, N1, N3);
-      else
-        return N1;         // Never-taken branch
-    }
-    break;
   case ISD::VECTOR_SHUFFLE:
     llvm_unreachable("should use getVectorShuffle constructor!");
     break;
@@ -3267,6 +3261,15 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
       if (VT.bitsGT(LVT))
         VT = LVT;
     }
+
+  // If we're optimizing for size, and there is a limit, bump the maximum number
+  // of operations inserted down to 4.  This is a wild guess that approximates
+  // the size of a call to memcpy or memset (3 arguments + call).
+  if (Limit != ~0U) {
+    const Function *F = DAG.getMachineFunction().getFunction();
+    if (F->hasFnAttr(Attribute::OptimizeForSize))
+      Limit = 4;
+  }
   unsigned NumMemOps = 0;
   while (Size != 0) {
@@ -3321,9 +3324,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
   std::string Str;
   bool CopyFromStr = isMemSrcFromString(Src, Str);
   bool isZeroStr = CopyFromStr && Str.empty();
-  uint64_t Limit = -1ULL;
-  if (!AlwaysInline)
-    Limit = TLI.getMaxStoresPerMemcpy();
+  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy();
+
   if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
                                 (DstAlignCanChange ? 0 : Align),
                                 (isZeroStr ? 0 : SrcAlign),
@@ -3368,7 +3370,7 @@
   // FIXME does the case above also need this?
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); assert(NVT.bitsGE(VT)); - Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, + Value = DAG.getExtLoad(ISD::EXTLOAD, NVT, dl, Chain, getMemBasePlusOffset(Src, SrcOff, DAG), SrcSV, SrcSVOff + SrcOff, VT, isVol, false, MinAlign(SrcAlign, SrcOff)); @@ -3401,9 +3403,6 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector<EVT> MemOps; - uint64_t Limit = -1ULL; - if (!AlwaysInline) - Limit = TLI.getMaxStoresPerMemmove(); bool DstAlignCanChange = false; MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); @@ -3412,6 +3411,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; + unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), @@ -3895,8 +3895,8 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList, } SDValue -SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, - ISD::LoadExtType ExtType, EVT VT, SDValue Chain, +SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, const Value *SV, int SVOffset, EVT MemVT, bool isVolatile, bool isNonTemporal, @@ -3919,12 +3919,12 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, MachineMemOperand *MMO = MF.getMachineMemOperand(SV, Flags, SVOffset, MemVT.getStoreSize(), Alignment); - return getLoad(AM, dl, ExtType, VT, Chain, Ptr, Offset, MemVT, MMO); + return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } SDValue -SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, - ISD::LoadExtType ExtType, EVT VT, SDValue Chain, +SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, EVT MemVT, MachineMemOperand *MMO) { if (VT == MemVT) { @@ -3974,18 +3974,18 @@ SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl, bool isVolatile, bool isNonTemporal, unsigned Alignment) { SDValue Undef = getUNDEF(Ptr.getValueType()); - return getLoad(ISD::UNINDEXED, dl, ISD::NON_EXTLOAD, VT, Chain, Ptr, Undef, + return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, SV, SVOffset, VT, isVolatile, isNonTemporal, Alignment); } -SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT, +SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, const Value *SV, int SVOffset, EVT MemVT, bool isVolatile, bool isNonTemporal, unsigned Alignment) { SDValue Undef = getUNDEF(Ptr.getValueType()); - return getLoad(ISD::UNINDEXED, dl, ExtType, VT, Chain, Ptr, Undef, + return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, SV, SVOffset, MemVT, isVolatile, isNonTemporal, Alignment); } @@ -3995,7 +3995,7 @@ SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base, LoadSDNode *LD = cast<LoadSDNode>(OrigLoad); assert(LD->getOffset().getOpcode() == ISD::UNDEF && "Load is already a indexed load!"); - return getLoad(AM, dl, LD->getExtensionType(), OrigLoad.getValueType(), + return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl, LD->getChain(), Base, Offset, LD->getSrcValue(), 
LD->getSrcValueOffset(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); @@ -4141,9 +4141,10 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base, SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, - SDValue SV) { - SDValue Ops[] = { Chain, Ptr, SV }; - return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 3); + SDValue SV, + unsigned Align) { + SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) }; + return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4); } SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, @@ -4425,17 +4426,16 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. -SDValue SelectionDAG::UpdateNodeOperands(SDValue InN, SDValue Op) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { assert(N->getNumOperands() == 1 && "Update with wrong number of operands"); // Check to see if there is no change. - if (Op == N->getOperand(0)) return InN; + if (Op == N->getOperand(0)) return N; // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4447,22 +4447,20 @@ SDValue SelectionDAG::UpdateNodeOperands(SDValue InN, SDValue Op) { // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue InN, SDValue Op1, SDValue Op2) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { assert(N->getNumOperands() == 2 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1)) - return InN; // No operands changed, just return the input node. + return N; // No operands changed, just return the input node. // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4477,32 +4475,31 @@ UpdateNodeOperands(SDValue InN, SDValue Op1, SDValue Op2) { // If this gets put into a CSE map, add it. 
if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, SDValue Op3) { +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; return UpdateNodeOperands(N, Ops, 3); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; return UpdateNodeOperands(N, Ops, 4); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; return UpdateNodeOperands(N, Ops, 5); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) { assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); @@ -4516,12 +4513,12 @@ UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { } // No operands changed, just return the input node. - if (!AnyChange) return InN; + if (!AnyChange) return N; // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4535,7 +4532,7 @@ UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { // If this gets put into a CSE map, add it. 
if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } /// DropOperands - Release the operands and set this node to have @@ -5378,9 +5375,10 @@ HandleSDNode::~HandleSDNode() { DropOperands(); } -GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, const GlobalValue *GA, +GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, DebugLoc DL, + const GlobalValue *GA, EVT VT, int64_t o, unsigned char TF) - : SDNode(Opc, DebugLoc(), getSDVTList(VT)), Offset(o), TargetFlags(TF) { + : SDNode(Opc, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } @@ -5669,13 +5667,16 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FSQRT: return "fsqrt"; case ISD::FSIN: return "fsin"; case ISD::FCOS: return "fcos"; - case ISD::FPOWI: return "fpowi"; - case ISD::FPOW: return "fpow"; case ISD::FTRUNC: return "ftrunc"; case ISD::FFLOOR: return "ffloor"; case ISD::FCEIL: return "fceil"; case ISD::FRINT: return "frint"; case ISD::FNEARBYINT: return "fnearbyint"; + case ISD::FEXP: return "fexp"; + case ISD::FEXP2: return "fexp2"; + case ISD::FLOG: return "flog"; + case ISD::FLOG2: return "flog2"; + case ISD::FLOG10: return "flog10"; // Binary operators case ISD::ADD: return "add"; @@ -5706,7 +5707,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FREM: return "frem"; case ISD::FCOPYSIGN: return "fcopysign"; case ISD::FGETSIGN: return "fgetsign"; + case ISD::FPOW: return "fpow"; + case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; case ISD::VSETCC: return "vsetcc"; case ISD::SELECT: return "select"; @@ -6260,23 +6263,6 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo(); unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), FrameOffset); - if (MFI.isFixedObjectIndex(FrameIdx)) { - int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset; - - // The alignment of the frame index can be determined from its offset from - // the incoming frame position. If the frame object is at offset 32 and - // the stack is guaranteed to be 16-byte aligned, then we know that the - // object is 16-byte aligned. - unsigned StackAlign = getTarget().getFrameInfo()->getStackAlignment(); - unsigned Align = MinAlign(ObjectOffset, StackAlign); - - // Finally, the frame object itself may have a known alignment. Factor - // the alignment + offset into a new alignment. For example, if we know - // the FI is 8 byte aligned, but the pointer is 4 off, we really have a - // 4-byte alignment of the resultant pointer. Likewise align 4 + 4-byte - // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc. 
- return std::max(Align, FIInfoAlign); - } return FIInfoAlign; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index fbe601f..d323c16 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -14,7 +14,6 @@ #define DEBUG_TYPE "isel" #include "SDNodeDbgValue.h" #include "SelectionDAGBuilder.h" -#include "FunctionLoweringInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -32,6 +31,7 @@ #include "llvm/Module.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/MachineFunction.h" @@ -70,113 +70,6 @@ LimitFPPrecision("limit-float-precision", cl::location(LimitFloatPrecision), cl::init(0)); -namespace { - /// RegsForValue - This struct represents the registers (physical or virtual) - /// that a particular set of values is assigned, and the type information - /// about the value. The most common situation is to represent one value at a - /// time, but struct or array values are handled element-wise as multiple - /// values. The splitting of aggregates is performed recursively, so that we - /// never have aggregate-typed registers. The values at this point do not - /// necessarily have legal types, so each value may require one or more - /// registers of some legal type. - /// - struct RegsForValue { - /// TLI - The TargetLowering object. - /// - const TargetLowering *TLI; - - /// ValueVTs - The value types of the values, which may not be legal, and - /// may need be promoted or synthesized from one or more registers. - /// - SmallVector<EVT, 4> ValueVTs; - - /// RegVTs - The value types of the registers. This is the same size as - /// ValueVTs and it records, for each value, what the type of the assigned - /// register or registers are. (Individual values are never synthesized - /// from more than one type of register.) - /// - /// With virtual registers, the contents of RegVTs is redundant with TLI's - /// getRegisterType member function, however when with physical registers - /// it is necessary to have a separate record of the types. - /// - SmallVector<EVT, 4> RegVTs; - - /// Regs - This list holds the registers assigned to the values. - /// Each legal or promoted value requires one register, and each - /// expanded value requires multiple registers. 
-    ///
-    SmallVector<unsigned, 4> Regs;
-
-    RegsForValue() : TLI(0) {}
-
-    RegsForValue(const TargetLowering &tli,
-                 const SmallVector<unsigned, 4> &regs,
-                 EVT regvt, EVT valuevt)
-      : TLI(&tli), ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
-    RegsForValue(const TargetLowering &tli,
-                 const SmallVector<unsigned, 4> &regs,
-                 const SmallVector<EVT, 4> &regvts,
-                 const SmallVector<EVT, 4> &valuevts)
-      : TLI(&tli), ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {}
-    RegsForValue(LLVMContext &Context, const TargetLowering &tli,
-                 unsigned Reg, const Type *Ty) : TLI(&tli) {
-      ComputeValueVTs(tli, Ty, ValueVTs);
-
-      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
-        EVT ValueVT = ValueVTs[Value];
-        unsigned NumRegs = TLI->getNumRegisters(Context, ValueVT);
-        EVT RegisterVT = TLI->getRegisterType(Context, ValueVT);
-        for (unsigned i = 0; i != NumRegs; ++i)
-          Regs.push_back(Reg + i);
-        RegVTs.push_back(RegisterVT);
-        Reg += NumRegs;
-      }
-    }
-
-    /// areValueTypesLegal - Return true if types of all the values are legal.
-    bool areValueTypesLegal() {
-      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
-        EVT RegisterVT = RegVTs[Value];
-        if (!TLI->isTypeLegal(RegisterVT))
-          return false;
-      }
-      return true;
-    }
-
-
-    /// append - Add the specified values to this one.
-    void append(const RegsForValue &RHS) {
-      TLI = RHS.TLI;
-      ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
-      RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
-      Regs.append(RHS.Regs.begin(), RHS.Regs.end());
-    }
-
-
-    /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
-    /// this value and returns the result as a ValueVTs value. This uses
-    /// Chain/Flag as the input and updates them for the output Chain/Flag.
-    /// If the Flag pointer is NULL, no flag is used.
-    SDValue getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl,
-                            SDValue &Chain, SDValue *Flag) const;
-
-    /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
-    /// specified value into the registers specified by this object. This uses
-    /// Chain/Flag as the input and updates them for the output Chain/Flag.
-    /// If the Flag pointer is NULL, no flag is used.
-    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
-                       SDValue &Chain, SDValue *Flag) const;
-
-    /// AddInlineAsmOperands - Add this value to the specified inlineasm node
-    /// operand list. This adds the code marker, matching input operand index
-    /// (if applicable), and includes the number of values added into it.
-    void AddInlineAsmOperands(unsigned Kind,
-                              bool HasMatching, unsigned MatchingIdx,
-                              SelectionDAG &DAG,
-                              std::vector<SDValue> &Ops) const;
-  };
-}
-
 /// getCopyFromParts - Create a value that contains the specified legal parts
 /// combined into the value they represent.  If the parts combine to a type
 /// larger then ValueVT then AssertOp can be used to specify whether the extra
@@ -528,6 +421,268 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl,
   }
 }
+namespace {
+  /// RegsForValue - This struct represents the registers (physical or virtual)
+  /// that a particular set of values is assigned, and the type information
+  /// about the value. The most common situation is to represent one value at a
+  /// time, but struct or array values are handled element-wise as multiple
+  /// values. The splitting of aggregates is performed recursively, so that we
+  /// never have aggregate-typed registers. The values at this point do not
+  /// necessarily have legal types, so each value may require one or more
+  /// registers of some legal type.
+  ///
+  struct RegsForValue {
+    /// ValueVTs - The value types of the values, which may not be legal, and
+    /// may need to be promoted or synthesized from one or more registers.
+    ///
+    SmallVector<EVT, 4> ValueVTs;
+
+    /// RegVTs - The value types of the registers. This is the same size as
+    /// ValueVTs and it records, for each value, what the type of the assigned
+    /// register or registers are. (Individual values are never synthesized
+    /// from more than one type of register.)
+    ///
+    /// With virtual registers, the contents of RegVTs are redundant with TLI's
+    /// getRegisterType member function, however with physical registers
+    /// it is necessary to have a separate record of the types.
+    ///
+    SmallVector<EVT, 4> RegVTs;
+
+    /// Regs - This list holds the registers assigned to the values.
+    /// Each legal or promoted value requires one register, and each
+    /// expanded value requires multiple registers.
+    ///
+    SmallVector<unsigned, 4> Regs;
+
+    RegsForValue() {}
+
+    RegsForValue(const SmallVector<unsigned, 4> &regs,
+                 EVT regvt, EVT valuevt)
+      : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+
+    RegsForValue(const SmallVector<unsigned, 4> &regs,
+                 const SmallVector<EVT, 4> &regvts,
+                 const SmallVector<EVT, 4> &valuevts)
+      : ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {}
+
+    RegsForValue(LLVMContext &Context, const TargetLowering &tli,
+                 unsigned Reg, const Type *Ty) {
+      ComputeValueVTs(tli, Ty, ValueVTs);
+
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        EVT ValueVT = ValueVTs[Value];
+        unsigned NumRegs = tli.getNumRegisters(Context, ValueVT);
+        EVT RegisterVT = tli.getRegisterType(Context, ValueVT);
+        for (unsigned i = 0; i != NumRegs; ++i)
+          Regs.push_back(Reg + i);
+        RegVTs.push_back(RegisterVT);
+        Reg += NumRegs;
+      }
+    }
+
+    /// areValueTypesLegal - Return true if types of all the values are legal.
+    bool areValueTypesLegal(const TargetLowering &TLI) {
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        EVT RegisterVT = RegVTs[Value];
+        if (!TLI.isTypeLegal(RegisterVT))
+          return false;
+      }
+      return true;
+    }
+
+    /// append - Add the specified values to this one.
+    void append(const RegsForValue &RHS) {
+      ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+      RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+      Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+    }
+
+    /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+    /// this value and returns the result as a ValueVTs value. This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
+                            DebugLoc dl,
+                            SDValue &Chain, SDValue *Flag) const;
+
+    /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+    /// specified value into the registers specified by this object. This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                       SDValue &Chain, SDValue *Flag) const;
+
+    /// AddInlineAsmOperands - Add this value to the specified inlineasm node
+    /// operand list. This adds the code marker, matching input operand index
+    /// (if applicable), and includes the number of values added into it.
+ void AddInlineAsmOperands(unsigned Kind, + bool HasMatching, unsigned MatchingIdx, + SelectionDAG &DAG, + std::vector<SDValue> &Ops) const; + }; +} + +/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from +/// this value and returns the result as a ValueVT value. This uses +/// Chain/Flag as the input and updates them for the output Chain/Flag. +/// If the Flag pointer is NULL, no flag is used. +SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, + FunctionLoweringInfo &FuncInfo, + DebugLoc dl, + SDValue &Chain, SDValue *Flag) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Assemble the legal parts into the final values. + SmallVector<SDValue, 4> Values(ValueVTs.size()); + SmallVector<SDValue, 8> Parts; + for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { + // Copy the legal parts from the registers. + EVT ValueVT = ValueVTs[Value]; + unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT); + EVT RegisterVT = RegVTs[Value]; + + Parts.resize(NumRegs); + for (unsigned i = 0; i != NumRegs; ++i) { + SDValue P; + if (Flag == 0) { + P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); + } else { + P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); + *Flag = P.getValue(2); + } + + Chain = P.getValue(1); + + // If the source register was virtual and if we know something about it, + // add an assert node. + if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) && + RegisterVT.isInteger() && !RegisterVT.isVector()) { + unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister; + if (FuncInfo.LiveOutRegInfo.size() > SlotNo) { + const FunctionLoweringInfo::LiveOutInfo &LOI = + FuncInfo.LiveOutRegInfo[SlotNo]; + + unsigned RegSize = RegisterVT.getSizeInBits(); + unsigned NumSignBits = LOI.NumSignBits; + unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes(); + + // FIXME: We capture more information than the dag can represent. For + // now, just use the tightest assertzext/assertsext possible. + bool isSExt = true; + EVT FromVT(MVT::Other); + if (NumSignBits == RegSize) + isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1 + else if (NumZeroBits >= RegSize-1) + isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1 + else if (NumSignBits > RegSize-8) + isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8 + else if (NumZeroBits >= RegSize-8) + isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8 + else if (NumSignBits > RegSize-16) + isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16 + else if (NumZeroBits >= RegSize-16) + isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16 + else if (NumSignBits > RegSize-32) + isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32 + else if (NumZeroBits >= RegSize-32) + isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32 + + if (FromVT != MVT::Other) + P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, + RegisterVT, P, DAG.getValueType(FromVT)); + } + } + + Parts[i] = P; + } + + Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), + NumRegs, RegisterVT, ValueVT); + Part += NumRegs; + Parts.clear(); + } + + return DAG.getNode(ISD::MERGE_VALUES, dl, + DAG.getVTList(&ValueVTs[0], ValueVTs.size()), + &Values[0], ValueVTs.size()); +} + +/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the +/// specified value into the registers specified by this object. This uses +/// Chain/Flag as the input and updates them for the output Chain/Flag. +/// If the Flag pointer is NULL, no flag is used. 
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                                 SDValue &Chain, SDValue *Flag) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Get the list of the values' legal parts.
+  unsigned NumRegs = Regs.size();
+  SmallVector<SDValue, 8> Parts(NumRegs);
+  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    EVT ValueVT = ValueVTs[Value];
+    unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+    EVT RegisterVT = RegVTs[Value];
+
+    getCopyToParts(DAG, dl,
+                   Val.getValue(Val.getResNo() + Value),
+                   &Parts[Part], NumParts, RegisterVT);
+    Part += NumParts;
+  }
+
+  // Copy the parts into the registers.
+  SmallVector<SDValue, 8> Chains(NumRegs);
+  for (unsigned i = 0; i != NumRegs; ++i) {
+    SDValue Part;
+    if (Flag == 0) {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+    } else {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
+      *Flag = Part.getValue(1);
+    }
+
+    Chains[i] = Part.getValue(0);
+  }
+
+  if (NumRegs == 1 || Flag)
+    // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
+    // flagged to it. That is, the CopyToReg nodes and the user are considered
+    // a single scheduling unit. If we create a TokenFactor and return it as
+    // chain, then the TokenFactor is both a predecessor (operand) of the
+    // user as well as a successor (the TF operands are flagged to the user).
+    // c1, f1 = CopyToReg
+    // c2, f2 = CopyToReg
+    // c3     = TokenFactor c1, c2
+    // ...
+    //        = op c3, ..., f2
+    Chain = Chains[NumRegs-1];
+  else
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs);
+}
+
+/// AddInlineAsmOperands - Add this value to the specified inlineasm node
+/// operand list. This adds the code marker and includes the number of
+/// values added into it.
+void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
+                                        unsigned MatchingIdx,
+                                        SelectionDAG &DAG,
+                                        std::vector<SDValue> &Ops) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
+  if (HasMatching)
+    Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+  SDValue Res = DAG.getTargetConstant(Flag, MVT::i32);
+  Ops.push_back(Res);
+
+  for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
+    EVT RegisterVT = RegVTs[Value];
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      assert(Reg < Regs.size() && "Mismatch in # registers expected");
+      Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT));
+    }
+  }
+}
 void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa) {
   AA = &aa;
@@ -543,6 +698,7 @@
 /// consumed.
 void SelectionDAGBuilder::clear() {
   NodeMap.clear();
+  UnusedArgNodeMap.clear();
   PendingLoads.clear();
   PendingExports.clear();
   CurDebugLoc = DebugLoc();
@@ -649,27 +805,63 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
   }
 }
+// getValue - Return an SDValue for the given Value.
 SDValue SelectionDAGBuilder::getValue(const Value *V) {
+  // If we already have an SDValue for this value, use it. It's important
+  // to do this first, so that we don't create a CopyFromReg if we already
+  // have a regular SDValue.
+  SDValue &N = NodeMap[V];
+  if (N.getNode()) return N;
+
+  // If there's a virtual register allocated and initialized for this
+  // value, use it.
+  DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
+  if (It != FuncInfo.ValueMap.end()) {
+    unsigned InReg = It->second;
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
+    SDValue Chain = DAG.getEntryNode();
+    return N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL);
+  }
+
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+  return Val;
+}
+
+/// getNonRegisterValue - Return an SDValue for the given Value, but
+/// don't look in FuncInfo.ValueMap for a virtual register.
+SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
+  // If we already have an SDValue for this value, use it.
   SDValue &N = NodeMap[V];
   if (N.getNode()) return N;
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+  return Val;
+}
+
+/// getValueImpl - Helper function for getValue and getNonRegisterValue.
+/// Create an SDValue for the given value.
+SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
   if (const Constant *C = dyn_cast<Constant>(V)) {
     EVT VT = TLI.getValueType(V->getType(), true);
     if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
-      return N = DAG.getConstant(*CI, VT);
+      return DAG.getConstant(*CI, VT);
     if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
-      return N = DAG.getGlobalAddress(GV, VT);
+      return DAG.getGlobalAddress(GV, getCurDebugLoc(), VT);
     if (isa<ConstantPointerNull>(C))
-      return N = DAG.getConstant(0, TLI.getPointerTy());
+      return DAG.getConstant(0, TLI.getPointerTy());
     if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
-      return N = DAG.getConstantFP(*CFP, VT);
+      return DAG.getConstantFP(*CFP, VT);
     if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
-      return N = DAG.getUNDEF(VT);
+      return DAG.getUNDEF(VT);
     if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
       visit(CE->getOpcode(), *CE);
@@ -757,82 +949,25 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
     return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
   }
-  unsigned InReg = FuncInfo.ValueMap[V];
-  assert(InReg && "Value not in map!");
-
-  RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
-  SDValue Chain = DAG.getEntryNode();
-  return RFV.getCopyFromRegs(DAG, getCurDebugLoc(), Chain, NULL);
-}
-
-/// Get the EVTs and ArgFlags collections that represent the legalized return
-/// type of the given function.  This does not require a DAG or a return value,
-/// and is suitable for use before any DAGs for the function are constructed.
-static void getReturnInfo(const Type* ReturnType,
-                          Attributes attr, SmallVectorImpl<EVT> &OutVTs,
-                          SmallVectorImpl<ISD::ArgFlagsTy> &OutFlags,
-                          const TargetLowering &TLI,
-                          SmallVectorImpl<uint64_t> *Offsets = 0) {
-  SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(TLI, ReturnType, ValueVTs);
-  unsigned NumValues = ValueVTs.size();
-  if (NumValues == 0) return;
-  unsigned Offset = 0;
-
-  for (unsigned j = 0, f = NumValues; j != f; ++j) {
-    EVT VT = ValueVTs[j];
-    ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
-
-    if (attr & Attribute::SExt)
-      ExtendKind = ISD::SIGN_EXTEND;
-    else if (attr & Attribute::ZExt)
-      ExtendKind = ISD::ZERO_EXTEND;
-
-    // FIXME: C calling convention requires the return type to be promoted to
-    // at least 32-bit. But this is not necessary for non-C calling
-    // conventions. The frontend should mark functions whose return values
-    // require promoting with signext or zeroext attributes.
- if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { - EVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); - if (VT.bitsLT(MinVT)) - VT = MinVT; - } - - unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); - EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); - unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( - PartVT.getTypeForEVT(ReturnType->getContext())); - - // 'inreg' on function refers to return value - ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr & Attribute::InReg) - Flags.setInReg(); - - // Propagate extension type if any - if (attr & Attribute::SExt) - Flags.setSExt(); - else if (attr & Attribute::ZExt) - Flags.setZExt(); - - for (unsigned i = 0; i < NumParts; ++i) { - OutVTs.push_back(PartVT); - OutFlags.push_back(Flags); - if (Offsets) - { - Offsets->push_back(Offset); - Offset += PartSize; - } - } + // If this is an instruction which fast-isel has deferred, select it now. + if (const Instruction *Inst = dyn_cast<Instruction>(V)) { + unsigned InReg = FuncInfo.InitializeRegForValue(Inst); + RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType()); + SDValue Chain = DAG.getEntryNode(); + return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL); } + + llvm_unreachable("Can't get register for value!"); + return SDValue(); } void SelectionDAGBuilder::visitRet(const ReturnInst &I) { SDValue Chain = getControlRoot(); SmallVector<ISD::OutputArg, 8> Outs; - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); + SmallVector<SDValue, 8> OutVals; - if (!FLI.CanLowerReturn) { - unsigned DemoteReg = FLI.DemoteRegister; + if (!FuncInfo.CanLowerReturn) { + unsigned DemoteReg = FuncInfo.DemoteRegister; const Function *F = I.getParent()->getParent(); // Emit a store of the return value through the virtual register. @@ -908,8 +1043,11 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { else if (F->paramHasAttr(0, Attribute::ZExt)) Flags.setZExt(); - for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, Parts[i], /*isfixed=*/true)); + for (unsigned i = 0; i < NumParts; ++i) { + Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(), + /*isfixed=*/true)); + OutVals.push_back(Parts[i]); + } } } } @@ -918,7 +1056,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { CallingConv::ID CallConv = DAG.getMachineFunction().getFunction()->getCallingConv(); Chain = TLI.LowerReturn(Chain, CallConv, isVarArg, - Outs, getCurDebugLoc(), DAG); + Outs, OutVals, getCurDebugLoc(), DAG); // Verify that the target's LowerReturn behaved as expected. assert(Chain.getNode() && Chain.getValueType() == MVT::Other && @@ -1119,7 +1257,7 @@ SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases){ } void SelectionDAGBuilder::visitBr(const BranchInst &I) { - MachineBasicBlock *BrMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *BrMBB = FuncInfo.MBB; // Update machine-CFG edges. MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)]; @@ -1269,18 +1407,10 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(CB.TrueBB)); - // If the branch was constant folded, fix up the CFG. - if (BrCond.getOpcode() == ISD::BR) { - SwitchBB->removeSuccessor(CB.FalseBB); - } else { - // Otherwise, go ahead and insert the false branch. 
- if (BrCond == getControlRoot()) - SwitchBB->removeSuccessor(CB.TrueBB); - - if (CB.FalseBB != NextBlock) - BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, - DAG.getBasicBlock(CB.FalseBB)); - } + // Insert the false branch. + if (CB.FalseBB != NextBlock) + BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, + DAG.getBasicBlock(CB.FalseBB)); DAG.setRoot(BrCond); } @@ -1319,7 +1449,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, // therefore require extension or truncating. SwitchOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy()); - unsigned JumpTableReg = FuncInfo.MakeReg(TLI.getPointerTy()); + unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(), JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; @@ -1370,7 +1500,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, SDValue ShiftOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy()); - B.Reg = FuncInfo.MakeReg(TLI.getPointerTy()); + B.Reg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(), B.Reg, ShiftOp); @@ -1402,29 +1532,41 @@ void SelectionDAGBuilder::visitBitTestCase(MachineBasicBlock* NextMBB, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { - // Make desired shift SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(), Reg, TLI.getPointerTy()); - SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), - TLI.getPointerTy(), - DAG.getConstant(1, TLI.getPointerTy()), - ShiftOp); - - // Emit bit tests and jumps - SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(), - TLI.getPointerTy(), SwitchVal, - DAG.getConstant(B.Mask, TLI.getPointerTy())); - SDValue AndCmp = DAG.getSetCC(getCurDebugLoc(), - TLI.getSetCCResultType(AndOp.getValueType()), - AndOp, DAG.getConstant(0, TLI.getPointerTy()), - ISD::SETNE); + SDValue Cmp; + if (CountPopulation_64(B.Mask) == 1) { + // Testing for a single bit; just compare the shift count with what it + // would need to be to shift a 1 bit in that position. + Cmp = DAG.getSetCC(getCurDebugLoc(), + TLI.getSetCCResultType(ShiftOp.getValueType()), + ShiftOp, + DAG.getConstant(CountTrailingZeros_64(B.Mask), + TLI.getPointerTy()), + ISD::SETEQ); + } else { + // Make desired shift + SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), + TLI.getPointerTy(), + DAG.getConstant(1, TLI.getPointerTy()), + ShiftOp); + + // Emit bit tests and jumps + SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(), + TLI.getPointerTy(), SwitchVal, + DAG.getConstant(B.Mask, TLI.getPointerTy())); + Cmp = DAG.getSetCC(getCurDebugLoc(), + TLI.getSetCCResultType(AndOp.getValueType()), + AndOp, DAG.getConstant(0, TLI.getPointerTy()), + ISD::SETNE); + } SwitchBB->addSuccessor(B.TargetBB); SwitchBB->addSuccessor(NextMBB); SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(), MVT::Other, getControlRoot(), - AndCmp, DAG.getBasicBlock(B.TargetBB)); + Cmp, DAG.getBasicBlock(B.TargetBB)); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. @@ -1441,7 +1583,7 @@ void SelectionDAGBuilder::visitBitTestCase(MachineBasicBlock* NextMBB, } void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { - MachineBasicBlock *InvokeMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *InvokeMBB = FuncInfo.MBB; // Retrieve successors. 
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; @@ -1969,7 +2111,7 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, } void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { - MachineBasicBlock *SwitchMBB = FuncInfo.MBBMap[SI.getParent()]; + MachineBasicBlock *SwitchMBB = FuncInfo.MBB; // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = 0; @@ -2035,7 +2177,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { - MachineBasicBlock *IndirectBrMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. SmallVector<BasicBlock*, 32> succs; @@ -2245,7 +2387,6 @@ void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); - EVT SrcVT = N.getValueType(); EVT DestVT = TLI.getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT)); } @@ -2254,7 +2395,6 @@ void SelectionDAGBuilder::visitIntToPtr(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); - EVT SrcVT = N.getValueType(); EVT DestVT = TLI.getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT)); } @@ -2579,7 +2719,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // If this is a constant subscript, handle it quickly. if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { - if (CI->getZExtValue() == 0) continue; + if (CI->isZero()) continue; uint64_t Offs = TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); SDValue OffsVal; @@ -2643,12 +2783,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { SDValue AllocSize = getValue(I.getArraySize()); - AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), AllocSize.getValueType(), - AllocSize, - DAG.getConstant(TySize, AllocSize.getValueType())); - EVT IntPtr = TLI.getPointerTy(); - AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr); + if (AllocSize.getValueType() != IntPtr) + AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr); + + AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), IntPtr, + AllocSize, + DAG.getConstant(TySize, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to @@ -2804,8 +2945,8 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Ops.push_back(DAG.getConstant(Intrinsic, TLI.getPointerTy())); // Add all operands of the call to the operand list. 
- for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) { - SDValue Op = getValue(I.getOperand(i)); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + SDValue Op = getValue(I.getArgOperand(i)); assert(TLI.isTypeLegal(Op.getValueType()) && "Intrinsic uses a non-legal type?"); Ops.push_back(Op); @@ -2910,11 +3051,11 @@ SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I, SDValue Root = getRoot(); SDValue L = DAG.getAtomic(Op, getCurDebugLoc(), - getValue(I.getOperand(2)).getValueType().getSimpleVT(), + getValue(I.getArgOperand(1)).getValueType().getSimpleVT(), Root, - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - I.getOperand(1)); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + I.getArgOperand(0)); setValue(&I, L); DAG.setRoot(L.getValue(1)); return 0; @@ -2923,8 +3064,8 @@ SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I, // implVisitAluOverflow - Lower arithmetic overflow instrinsics. const char * SelectionDAGBuilder::implVisitAluOverflow(const CallInst &I, ISD::NodeType Op) { - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); setValue(&I, DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2)); @@ -2938,9 +3079,9 @@ SelectionDAGBuilder::visitExp(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); // Put the exponent in the right bit position for later addition to the // final result: @@ -3050,8 +3191,8 @@ SelectionDAGBuilder::visitExp(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FEXP, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3064,9 +3205,9 @@ SelectionDAGBuilder::visitLog(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Scale the exponent by log(2) [0.69314718f]. @@ -3160,8 +3301,8 @@ SelectionDAGBuilder::visitLog(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3174,9 +3315,9 @@ SelectionDAGBuilder::visitLog2(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Get the exponent. 
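The rewrites from getOperand(i) to getArgOperand(i-1) running through this file and the surrounding hunks all encode one layout fact: operand 0 of a call is the callee, so argument i sits at operand i+1. A toy model of the two accessors (a hypothetical struct for illustration, not the real CallInst):

    #include <cassert>
    #include <vector>

    struct ToyCall {
      // Operand layout assumed here: [callee, arg0, arg1, ...].
      std::vector<int> Operands;
      int getOperand(unsigned i) const { return Operands[i]; }
      int getArgOperand(unsigned i) const { return Operands[i + 1]; }
      unsigned getNumArgOperands() const { return Operands.size() - 1; }
    };

    int main() {
      ToyCall C{{/*callee*/ 99, /*arg0*/ 7, /*arg1*/ 8}};
      assert(C.getOperand(1) == C.getArgOperand(0));  // same value, new index
      assert(C.getNumArgOperands() == 2);
      return 0;
    }

Going through the accessor insulates the lowering code from the call instruction's operand layout, which is the point of the migration.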
@@ -3269,8 +3410,8 @@ SelectionDAGBuilder::visitLog2(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG2, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3283,9 +3424,9 @@ SelectionDAGBuilder::visitLog10(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Scale the exponent by log10(2) [0.30102999f]. @@ -3371,8 +3512,8 @@ SelectionDAGBuilder::visitLog10(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG10, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3385,9 +3526,9 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op); @@ -3485,8 +3626,8 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FEXP2, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3497,12 +3638,12 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { void SelectionDAGBuilder::visitPow(const CallInst &I) { SDValue result; - const Value *Val = I.getOperand(1); + const Value *Val = I.getArgOperand(0); DebugLoc dl = getCurDebugLoc(); bool IsExp10 = false; if (getValue(Val).getValueType() == MVT::f32 && - getValue(I.getOperand(2)).getValueType() == MVT::f32 && + getValue(I.getArgOperand(1)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { if (Constant *C = const_cast<Constant*>(dyn_cast<Constant>(Val))) { if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { @@ -3513,7 +3654,7 @@ SelectionDAGBuilder::visitPow(const CallInst &I) { } if (IsExp10 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(2)); + SDValue Op = getValue(I.getArgOperand(1)); // Put the exponent in the right bit position for later addition to the // final result: @@ -3618,9 +3759,9 @@ SelectionDAGBuilder::visitPow(const CallInst &I) { } else { // No special expansion. 
result = DAG.getNode(ISD::FPOW, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1))); } setValue(&I, result); @@ -3696,7 +3837,7 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const DbgValueInst &DI, if (DV.isInlinedFnArgument(MF.getFunction())) return false; - MachineBasicBlock *MBB = FuncInfo.MBBMap[DI.getParent()]; + MachineBasicBlock *MBB = FuncInfo.MBB; if (MBB != &MF.front()) return false; @@ -3750,11 +3891,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::vacopy: visitVACopy(I); return 0; case Intrinsic::returnaddress: setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::setjmp: return "_setjmp"+!TLI.usesUnderscoreSetJmp(); @@ -3763,63 +3904,63 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::memcpy: { // Assert for address < 256 since we support only user defined address // spaces. - assert(cast<PointerType>(I.getOperand(1)->getType())->getAddressSpace() + assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && - cast<PointerType>(I.getOperand(2)->getType())->getAddressSpace() + cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue(); - bool isVol = cast<ConstantInt>(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false, - I.getOperand(1), 0, I.getOperand(2), 0)); + I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } case Intrinsic::memset: { // Assert for address < 256 since we support only user defined address // spaces. - assert(cast<PointerType>(I.getOperand(1)->getType())->getAddressSpace() + assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue(); - bool isVol = cast<ConstantInt>(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - I.getOperand(1), 0)); + I.getArgOperand(0), 0)); return 0; } case Intrinsic::memmove: { // Assert for address < 256 since we support only user defined address // spaces. 
- assert(cast<PointerType>(I.getOperand(1)->getType())->getAddressSpace() + assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && - cast<PointerType>(I.getOperand(2)->getType())->getAddressSpace() + cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue(); - bool isVol = cast<ConstantInt>(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); // If the source and destination are known to not be aliases, we can // lower memmove as memcpy. uint64_t Size = -1ULL; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op3)) Size = C->getZExtValue(); - if (AA->alias(I.getOperand(1), Size, I.getOperand(2), Size) == + if (AA->alias(I.getArgOperand(0), Size, I.getArgOperand(1), Size) == AliasAnalysis::NoAlias) { DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - false, I.getOperand(1), 0, I.getOperand(2), 0)); + false, I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - I.getOperand(1), 0, I.getOperand(2), 0)); + I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } case Intrinsic::dbg_declare: { @@ -3908,7 +4049,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else { bool createUndef = false; // FIXME : Why not use getValue() directly ? - SDValue &N = NodeMap[V]; + SDValue N = NodeMap[V]; + if (!N.getNode() && isa<Argument>(V)) + // Check unused arguments map. + N = UnusedArgNodeMap[V]; if (N.getNode()) { if (!EmitFuncArgumentDbgValue(DI, V, Variable, Offset, N)) { SDV = DAG.getDbgValue(Variable, N.getNode(), @@ -3956,7 +4100,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_exception: { // Insert the EXCEPTIONADDR instruction. - assert(FuncInfo.MBBMap[I.getParent()]->isLandingPad() && + assert(FuncInfo.MBB->isLandingPad() && "Call to eh.exception not in landing pad!"); SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other); SDValue Ops[1]; @@ -3968,7 +4112,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_selector: { - MachineBasicBlock *CallMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *CallMBB = FuncInfo.MBB; MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); if (CallMBB->isLandingPad()) AddCatchInfo(I, &MMI, CallMBB); @@ -3978,13 +4122,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { #endif // FIXME: Mark exception selector register as live in. Hack for PR1508. unsigned Reg = TLI.getExceptionSelectorRegister(); - if (Reg) FuncInfo.MBBMap[I.getParent()]->addLiveIn(Reg); + if (Reg) FuncInfo.MBB->addLiveIn(Reg); } // Insert the EHSELECTION instruction. 
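The memmove lowering above queries alias analysis and, on a NoAlias answer, emits the cheaper memcpy instead. The same idea at the C++ library level, as a hedged sketch in which the NoOverlap flag stands in for the AliasAnalysis query:

#include <cstddef>
#include <cstring>

// memcpy is only legal when the two ranges are disjoint, which is exactly
// what AliasAnalysis::NoAlias establishes for the intrinsic's operands.
void copyBytes(void *Dst, const void *Src, std::size_t N, bool NoOverlap) {
  if (NoOverlap)
    std::memcpy(Dst, Src, N);   // disjoint ranges: cheaper copy
  else
    std::memmove(Dst, Src, N);  // may overlap: must be order-safe
}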
SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other); SDValue Ops[2]; - Ops[0] = getValue(I.getOperand(1)); + Ops[0] = getValue(I.getArgOperand(0)); Ops[1] = getRoot(); SDValue Op = DAG.getNode(ISD::EHSELECTION, dl, VTs, Ops, 2); DAG.setRoot(Op.getValue(1)); @@ -3994,7 +4138,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::eh_typeid_for: { // Find the type id for the given typeinfo. - GlobalVariable *GV = ExtractTypeInfo(I.getOperand(1)); + GlobalVariable *GV = ExtractTypeInfo(I.getArgOperand(0)); unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, MVT::i32); setValue(&I, Res); @@ -4007,15 +4151,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::EH_RETURN, dl, MVT::Other, getControlRoot(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2)))); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); return 0; case Intrinsic::eh_unwind_init: DAG.getMachineFunction().getMMI().setCallsUnwindInit(true); return 0; case Intrinsic::eh_dwarf_cfa: { - EVT VT = getValue(I.getOperand(1)).getValueType(); - SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), dl, + SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), dl, TLI.getPointerTy()); SDValue Offset = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), @@ -4031,7 +4174,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1)); + ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0)); assert(CI && "Non-constant call site value in eh.sjlj.callsite!"); assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); @@ -4040,13 +4183,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_sjlj_setjmp: { setValue(&I, DAG.getNode(ISD::EH_SJLJ_SETJMP, dl, MVT::i32, getRoot(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; } case Intrinsic::eh_sjlj_longjmp: { DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, dl, MVT::Other, getRoot(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; } @@ -4072,34 +4215,34 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::convertuu: Code = ISD::CVT_UU; break; } EVT DestVT = TLI.getValueType(I.getType()); - const Value *Op1 = I.getOperand(1); + const Value *Op1 = I.getArgOperand(0); Res = DAG.getConvertRndSat(DestVT, getCurDebugLoc(), getValue(Op1), DAG.getValueType(DestVT), DAG.getValueType(getValue(Op1).getValueType()), - getValue(I.getOperand(2)), - getValue(I.getOperand(3)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Code); setValue(&I, Res); return 0; } case Intrinsic::sqrt: setValue(&I, DAG.getNode(ISD::FSQRT, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::powi: - setValue(&I, ExpandPowI(dl, getValue(I.getOperand(1)), - getValue(I.getOperand(2)), DAG)); + setValue(&I, ExpandPowI(dl, getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), DAG)); return 0; case Intrinsic::sin: setValue(&I, DAG.getNode(ISD::FSIN, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + 
getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::cos: setValue(&I, DAG.getNode(ISD::FCOS, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::log: visitLog(I); @@ -4121,14 +4264,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl, - MVT::i16, getValue(I.getOperand(1)))); + MVT::i16, getValue(I.getArgOperand(0)))); return 0; case Intrinsic::convert_from_fp16: setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, dl, - MVT::f32, getValue(I.getOperand(1)))); + MVT::f32, getValue(I.getArgOperand(0)))); return 0; case Intrinsic::pcmarker: { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp)); return 0; } @@ -4143,23 +4286,23 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::cttz: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTTZ, dl, Ty, Arg)); return 0; } case Intrinsic::ctlz: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTLZ, dl, Ty, Arg)); return 0; } case Intrinsic::ctpop: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTPOP, dl, Ty, Arg)); return 0; @@ -4173,7 +4316,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } case Intrinsic::stackrestore: { - Res = getValue(I.getOperand(1)); + Res = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Res)); return 0; } @@ -4183,8 +4326,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { MachineFrameInfo *MFI = MF.getFrameInfo(); EVT PtrTy = TLI.getPointerTy(); - SDValue Src = getValue(I.getOperand(1)); // The guard's value. - AllocaInst *Slot = cast<AllocaInst>(I.getOperand(2)); + SDValue Src = getValue(I.getArgOperand(0)); // The guard's value. + AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); int FI = FuncInfo.StaticAllocaMap[Slot]; MFI->setStackProtectorIndex(FI); @@ -4201,14 +4344,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know. 
- ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(2)); + ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); assert(CI && "Non-constant type in __builtin_object_size?"); - SDValue Arg = getValue(I.getOperand(0)); + SDValue Arg = getValue(I.getCalledValue()); EVT Ty = Arg.getValueType(); - if (CI->getZExtValue() == 0) + if (CI->isZero()) Res = DAG.getConstant(-1ULL, Ty); else Res = DAG.getConstant(0, Ty); @@ -4221,14 +4364,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; case Intrinsic::init_trampoline: { - const Function *F = cast<Function>(I.getOperand(2)->stripPointerCasts()); + const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts()); SDValue Ops[6]; Ops[0] = getRoot(); - Ops[1] = getValue(I.getOperand(1)); - Ops[2] = getValue(I.getOperand(2)); - Ops[3] = getValue(I.getOperand(3)); - Ops[4] = DAG.getSrcValue(I.getOperand(1)); + Ops[1] = getValue(I.getArgOperand(0)); + Ops[2] = getValue(I.getArgOperand(1)); + Ops[3] = getValue(I.getArgOperand(2)); + Ops[4] = DAG.getSrcValue(I.getArgOperand(0)); Ops[5] = DAG.getSrcValue(F); Res = DAG.getNode(ISD::TRAMPOLINE, dl, @@ -4241,8 +4384,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::gcroot: if (GFI) { - const Value *Alloca = I.getOperand(1); - const Constant *TypeMap = cast<Constant>(I.getOperand(2)); + const Value *Alloca = I.getArgOperand(0); + const Constant *TypeMap = cast<Constant>(I.getArgOperand(1)); FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode()); GFI->addStackRoot(FI->getIndex(), TypeMap); @@ -4274,9 +4417,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::prefetch: { SDValue Ops[4]; Ops[0] = getRoot(); - Ops[1] = getValue(I.getOperand(1)); - Ops[2] = getValue(I.getOperand(2)); - Ops[3] = getValue(I.getOperand(3)); + Ops[1] = getValue(I.getArgOperand(0)); + Ops[2] = getValue(I.getArgOperand(1)); + Ops[3] = getValue(I.getArgOperand(2)); DAG.setRoot(DAG.getNode(ISD::PREFETCH, dl, MVT::Other, &Ops[0], 4)); return 0; } @@ -4285,7 +4428,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Ops[6]; Ops[0] = getRoot(); for (int x = 1; x < 6; ++x) - Ops[x] = getValue(I.getOperand(x)); + Ops[x] = getValue(I.getArgOperand(x - 1)); DAG.setRoot(DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, &Ops[0], 6)); return 0; @@ -4294,12 +4437,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Root = getRoot(); SDValue L = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, getCurDebugLoc(), - getValue(I.getOperand(2)).getValueType().getSimpleVT(), + getValue(I.getArgOperand(1)).getValueType().getSimpleVT(), Root, - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - getValue(I.getOperand(3)), - I.getOperand(1)); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), + I.getArgOperand(0)); setValue(&I, L); DAG.setRoot(L.getValue(1)); return 0; @@ -4353,14 +4496,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Args.reserve(CS.arg_size()); // Check whether the function can return without sret-demotion. 
- SmallVector<EVT, 4> OutVTs; - SmallVector<ISD::ArgFlagsTy, 4> OutsFlags; + SmallVector<ISD::OutputArg, 4> Outs; SmallVector<uint64_t, 4> Offsets; - getReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), - OutVTs, OutsFlags, TLI, &Offsets); + GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - FTy->isVarArg(), OutVTs, OutsFlags, DAG); + FTy->isVarArg(), Outs, FTy->getContext()); SDValue DemoteStackSlot; @@ -4453,7 +4595,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, ComputeValueVTs(TLI, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; - unsigned NumValues = OutVTs.size(); + unsigned NumValues = Outs.size(); SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(NumValues); @@ -4461,7 +4603,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, DemoteStackSlot, DAG.getConstant(Offsets[i], PtrVT)); - SDValue L = DAG.getLoad(OutVTs[i], getCurDebugLoc(), Result.second, + SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second, Add, NULL, Offsets[i], false, false, 1); Values[i] = L; Chains[i] = L.getValue(1); @@ -4580,16 +4722,16 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, /// lowered like a normal call. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) - if (I.getNumOperands() != 4) + if (I.getNumArgOperands() != 3) return false; - const Value *LHS = I.getOperand(1), *RHS = I.getOperand(2); + const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || - !I.getOperand(3)->getType()->isIntegerTy() || + !I.getArgOperand(2)->getType()->isIntegerTy() || !I.getType()->isIntegerTy()) return false; - const ConstantInt *Size = dyn_cast<ConstantInt>(I.getOperand(3)); + const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2)); // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 @@ -4656,11 +4798,16 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { void SelectionDAGBuilder::visitCall(const CallInst &I) { + // Handle inline assembly differently. + if (isa<InlineAsm>(I.getCalledValue())) { + visitInlineAsm(&I); + return; + } + const char *RenameFn = 0; if (Function *F = I.getCalledFunction()) { if (F->isDeclaration()) { - const TargetIntrinsicInfo *II = TM.getIntrinsicInfo(); - if (II) { + if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) { if (unsigned IID = II->getIntrinsicID(F)) { RenameFn = visitIntrinsicCall(I, IID); if (!RenameFn) @@ -4679,51 +4826,51 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (!F->hasLocalLinkage() && F->hasName()) { StringRef Name = F->getName(); if (Name == "copysign" || Name == "copysignf" || Name == "copysignl") { - if (I.getNumOperands() == 3 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && - I.getType() == I.getOperand(2)->getType()) { - SDValue LHS = getValue(I.getOperand(1)); - SDValue RHS = getValue(I.getOperand(2)); + if (I.getNumArgOperands() == 2 && // Basic sanity checks. 
+ I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && + I.getType() == I.getArgOperand(1)->getType()) { + SDValue LHS = getValue(I.getArgOperand(0)); + SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(), LHS.getValueType(), LHS, RHS)); return; } } else if (Name == "fabs" || Name == "fabsf" || Name == "fabsl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType()) { - SDValue Tmp = getValue(I.getOperand(1)); + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType()) { + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "sin" || Name == "sinf" || Name == "sinl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "cos" || Name == "cosf" || Name == "cosl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FSQRT, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; @@ -4733,14 +4880,11 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { return; } } - } else if (isa<InlineAsm>(I.getOperand(0))) { - visitInlineAsm(&I); - return; } - + SDValue Callee; if (!RenameFn) - Callee = getValue(I.getOperand(0)); + Callee = getValue(I.getCalledValue()); else Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy()); @@ -4749,210 +4893,8 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { LowerCallTo(&I, Callee, I.isTailCall()); } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. 
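Each libm special case above applies the same guard before folding the call into a single FP node: correct arity, a floating-point operand, a result type matching the operand type, and, for sin/cos/sqrt where errno matters, onlyReadsMemory(). A condensed sketch of that guard with the node choice elided (the helper name is illustrative):

#include "llvm/Instructions.h"
using namespace llvm;

// True when a declared call such as sinf(x) is safe to lower as one FP
// DAG node: unary, FP-typed, type-preserving, and free of side effects
// like setting errno.
static bool isFoldableUnaryFPCall(const CallInst &I) {
  return I.getNumArgOperands() == 1 &&
         I.getArgOperand(0)->getType()->isFloatingPointTy() &&
         I.getType() == I.getArgOperand(0)->getType() &&
         I.onlyReadsMemory();
}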
-SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const { - // Assemble the legal parts into the final values. - SmallVector<SDValue, 4> Values(ValueVTs.size()); - SmallVector<SDValue, 8> Parts; - for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { - // Copy the legal parts from the registers. - EVT ValueVT = ValueVTs[Value]; - unsigned NumRegs = TLI->getNumRegisters(*DAG.getContext(), ValueVT); - EVT RegisterVT = RegVTs[Value]; - - Parts.resize(NumRegs); - for (unsigned i = 0; i != NumRegs; ++i) { - SDValue P; - if (Flag == 0) { - P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); - } else { - P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); - *Flag = P.getValue(2); - } - - Chain = P.getValue(1); - - // If the source register was virtual and if we know something about it, - // add an assert node. - if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) && - RegisterVT.isInteger() && !RegisterVT.isVector()) { - unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister; - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); - if (FLI.LiveOutRegInfo.size() > SlotNo) { - FunctionLoweringInfo::LiveOutInfo &LOI = FLI.LiveOutRegInfo[SlotNo]; - - unsigned RegSize = RegisterVT.getSizeInBits(); - unsigned NumSignBits = LOI.NumSignBits; - unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes(); - - // FIXME: We capture more information than the dag can represent. For - // now, just use the tightest assertzext/assertsext possible. - bool isSExt = true; - EVT FromVT(MVT::Other); - if (NumSignBits == RegSize) - isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1 - else if (NumZeroBits >= RegSize-1) - isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1 - else if (NumSignBits > RegSize-8) - isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8 - else if (NumZeroBits >= RegSize-8) - isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8 - else if (NumSignBits > RegSize-16) - isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16 - else if (NumZeroBits >= RegSize-16) - isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16 - else if (NumSignBits > RegSize-32) - isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32 - else if (NumZeroBits >= RegSize-32) - isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32 - - if (FromVT != MVT::Other) - P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, - RegisterVT, P, DAG.getValueType(FromVT)); - } - } - - Parts[i] = P; - } - - Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), - NumRegs, RegisterVT, ValueVT); - Part += NumRegs; - Parts.clear(); - } - - return DAG.getNode(ISD::MERGE_VALUES, dl, - DAG.getVTList(&ValueVTs[0], ValueVTs.size()), - &Values[0], ValueVTs.size()); -} - -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. -void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const { - // Get the list of the values's legal parts. 
- unsigned NumRegs = Regs.size(); - SmallVector<SDValue, 8> Parts(NumRegs); - for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; - unsigned NumParts = TLI->getNumRegisters(*DAG.getContext(), ValueVT); - EVT RegisterVT = RegVTs[Value]; - - getCopyToParts(DAG, dl, - Val.getValue(Val.getResNo() + Value), - &Parts[Part], NumParts, RegisterVT); - Part += NumParts; - } - - // Copy the parts into the registers. - SmallVector<SDValue, 8> Chains(NumRegs); - for (unsigned i = 0; i != NumRegs; ++i) { - SDValue Part; - if (Flag == 0) { - Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); - } else { - Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); - *Flag = Part.getValue(1); - } - - Chains[i] = Part.getValue(0); - } - - if (NumRegs == 1 || Flag) - // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is - // flagged to it. That is the CopyToReg nodes and the user are considered - // a single scheduling unit. If we create a TokenFactor and return it as - // chain, then the TokenFactor is both a predecessor (operand) of the - // user as well as a successor (the TF operands are flagged to the user). - // c1, f1 = CopyToReg - // c2, f2 = CopyToReg - // c3 = TokenFactor c1, c2 - // ... - // = op c3, ..., f2 - Chain = Chains[NumRegs-1]; - else - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs); -} - -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list. This adds the code marker and includes the number of -/// values added into it. -void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, - unsigned MatchingIdx, - SelectionDAG &DAG, - std::vector<SDValue> &Ops) const { - unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); - if (HasMatching) - Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); - SDValue Res = DAG.getTargetConstant(Flag, MVT::i32); - Ops.push_back(Res); - - for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) { - unsigned NumRegs = TLI->getNumRegisters(*DAG.getContext(), ValueVTs[Value]); - EVT RegisterVT = RegVTs[Value]; - for (unsigned i = 0; i != NumRegs; ++i) { - assert(Reg < Regs.size() && "Mismatch in # registers expected"); - Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT)); - } - } -} - -/// isAllocatableRegister - If the specified register is safe to allocate, -/// i.e. it isn't a stack pointer or some other special register, return the -/// register class for the register. Otherwise, return null. -static const TargetRegisterClass * -isAllocatableRegister(unsigned Reg, MachineFunction &MF, - const TargetLowering &TLI, - const TargetRegisterInfo *TRI) { - EVT FoundVT = MVT::Other; - const TargetRegisterClass *FoundRC = 0; - for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(), - E = TRI->regclass_end(); RCI != E; ++RCI) { - EVT ThisVT = MVT::Other; - - const TargetRegisterClass *RC = *RCI; - // If none of the value types for this register class are valid, we - // can't use it. For example, 64-bit reg classes on 32-bit targets. - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { - if (TLI.isTypeLegal(*I)) { - // If we have already found this register in a different register class, - // choose the one with the largest VT specified. For example, on - // PowerPC, we favor f64 register classes over f32. 
- if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) { - ThisVT = *I; - break; - } - } - } - - if (ThisVT == MVT::Other) continue; - - // NOTE: This isn't ideal. In particular, this might allocate the - // frame pointer in functions that need it (due to them not being taken - // out of allocation, because a variable sized allocation hasn't been seen - // yet). This is a slight code pessimization, but should still work. - for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), - E = RC->allocation_order_end(MF); I != E; ++I) - if (*I == Reg) { - // We found a matching register class. Keep looking at others in case - // we find one with larger registers that this physreg is also in. - FoundRC = RC; - FoundVT = ThisVT; - break; - } - } - return FoundRC; -} - - namespace llvm { + /// AsmOperandInfo - This contains information for each constraint that we are /// lowering. class LLVM_LIBRARY_VISIBILITY SDISelAsmOperandInfo : @@ -5041,8 +4983,56 @@ private: Regs.insert(*Aliases); } }; + } // end llvm namespace. +/// isAllocatableRegister - If the specified register is safe to allocate, +/// i.e. it isn't a stack pointer or some other special register, return the +/// register class for the register. Otherwise, return null. +static const TargetRegisterClass * +isAllocatableRegister(unsigned Reg, MachineFunction &MF, + const TargetLowering &TLI, + const TargetRegisterInfo *TRI) { + EVT FoundVT = MVT::Other; + const TargetRegisterClass *FoundRC = 0; + for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(), + E = TRI->regclass_end(); RCI != E; ++RCI) { + EVT ThisVT = MVT::Other; + + const TargetRegisterClass *RC = *RCI; + // If none of the value types for this register class are valid, we + // can't use it. For example, 64-bit reg classes on 32-bit targets. + for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); + I != E; ++I) { + if (TLI.isTypeLegal(*I)) { + // If we have already found this register in a different register class, + // choose the one with the largest VT specified. For example, on + // PowerPC, we favor f64 register classes over f32. + if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) { + ThisVT = *I; + break; + } + } + } + + if (ThisVT == MVT::Other) continue; + + // NOTE: This isn't ideal. In particular, this might allocate the + // frame pointer in functions that need it (due to them not being taken + // out of allocation, because a variable sized allocation hasn't been seen + // yet). This is a slight code pessimization, but should still work. + for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), + E = RC->allocation_order_end(MF); I != E; ++I) + if (*I == Reg) { + // We found a matching register class. Keep looking at others in case + // we find one with larger registers that this physreg is also in. + FoundRC = RC; + FoundVT = ThisVT; + break; + } + } + return FoundRC; +} /// GetRegistersForValue - Assign registers (virtual or physical) for the /// specified operand. 
We prefer to assign virtual registers, to allow the @@ -5154,7 +5144,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, } } - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT); + OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI); return; @@ -5172,7 +5162,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, for (; NumRegs; --NumRegs) Regs.push_back(RegInfo.createVirtualRegister(RC)); - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT); + OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } @@ -5215,7 +5205,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, for (unsigned i = RegStart; i != RegEnd; ++i) Regs.push_back(RegClassRegs[i]); - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, *RC->vt_begin(), + OpInfo.AssignedRegs = RegsForValue(Regs, *RC->vt_begin(), OpInfo.ConstraintVT); OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI); return; @@ -5332,7 +5322,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, hasMemory, &DAG); + TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); // If this is a memory input, and if the operand is not indirect, do what we // need to to provide an address for the memory input. @@ -5406,6 +5396,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc"); AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc)); + // Remember the AlignStack bit as operand 3. + AsmNodeOperands.push_back(DAG.getTargetConstant(IA->isAlignStack() ? 1 : 0, + MVT::i1)); + // Loop over all of the inputs, copying the operand values into the // appropriate registers and processing the output regs. RegsForValue RetValRegs; @@ -5497,7 +5491,6 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } RegsForValue MatchedRegs; - MatchedRegs.TLI = &TLI; MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType()); EVT RegVT = AsmNodeOperands[CurOp+1].getValueType(); MatchedRegs.RegVTs.push_back(RegVT); @@ -5535,7 +5528,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { std::vector<SDValue> Ops; TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0], - hasMemory, Ops, DAG); + Ops, DAG); if (Ops.empty()) report_fatal_error("Invalid operand for inline asm constraint '" + Twine(OpInfo.ConstraintCode) + "'!"); @@ -5570,7 +5563,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty() || - !OpInfo.AssignedRegs.areValueTypesLegal()) + !OpInfo.AssignedRegs.areValueTypesLegal(TLI)) report_fatal_error("Couldn't allocate input reg for constraint '" + Twine(OpInfo.ConstraintCode) + "'!"); @@ -5595,7 +5588,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Finish up input operands. Set the input chain and add the flag last. - AsmNodeOperands[0] = Chain; + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; if (Flag.getNode()) AsmNodeOperands.push_back(Flag); Chain = DAG.getNode(ISD::INLINEASM, getCurDebugLoc(), @@ -5606,7 +5599,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // If this asm returns a register value, copy the result from that register // and set it as the value of the call. 
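The inline-asm changes above grow the fixed operand prefix of an INLINEASM node (chain, asm string, !srcloc metadata, and now an AlignStack flag) and replace the magic index 0 with InlineAsm::Op_InputChain. A sketch of why named slots beat bare indices; only Op_InputChain and the operand-3 AlignStack bit are confirmed by the patch text, the other values are assumed for illustration:

// Hypothetical mirror of the fixed INLINEASM operand layout.
enum InlineAsmOp {
  Op_InputChain = 0,   // incoming chain
  Op_AsmString  = 1,   // the asm text
  Op_MDNode     = 2,   // !srcloc metadata
  Op_AlignStack = 3    // new: does this asm need stack realignment?
};

// AsmNodeOperands[Op_InputChain] = Chain;  // self-documenting
// AsmNodeOperands[0] = Chain;              // what the old code relied on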
if (!RetValRegs.Regs.empty()) { - SDValue Val = RetValRegs.getCopyFromRegs(DAG, getCurDebugLoc(), + SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, &Flag); // FIXME: Why don't we do this for inline asms with MRVs? @@ -5646,7 +5639,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) { RegsForValue &OutRegs = IndirectStoresToEmit[i].first; const Value *Ptr = IndirectStoresToEmit[i].second; - SDValue OutVal = OutRegs.getCopyFromRegs(DAG, getCurDebugLoc(), + SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, &Flag); StoresToEmit.push_back(std::make_pair(OutVal, Ptr)); } @@ -5672,14 +5665,16 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { void SelectionDAGBuilder::visitVAStart(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VASTART, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { + const TargetData &TD = *TLI.getTargetData(); SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(), getRoot(), getValue(I.getOperand(0)), - DAG.getSrcValue(I.getOperand(0))); + DAG.getSrcValue(I.getOperand(0)), + TD.getABITypeAlignment(I.getType())); setValue(&I, V); DAG.setRoot(V.getValue(1)); } @@ -5687,17 +5682,17 @@ void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { void SelectionDAGBuilder::visitVAEnd(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VAEND, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVACopy(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - DAG.getSrcValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(2)))); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + DAG.getSrcValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(1)))); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -5715,6 +5710,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, DebugLoc dl) const { // Handle all of the outgoing arguments. SmallVector<ISD::OutputArg, 32> Outs; + SmallVector<SDValue, 32> OutVals; for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); @@ -5768,13 +5764,15 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 - ISD::OutputArg MyFlags(Flags, Parts[j], i < NumFixedArgs); + ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), + i < NumFixedArgs); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) MyFlags.Flags.setOrigAlign(1); Outs.push_back(MyFlags); + OutVals.push_back(Parts[j]); } } } @@ -5803,7 +5801,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, SmallVector<SDValue, 4> InVals; Chain = LowerCall(Chain, Callee, CallConv, isVarArg, isTailCall, - Outs, Ins, dl, DAG, InVals); + Outs, OutVals, Ins, dl, DAG, InVals); // Verify that the target's LowerCall behaved as expected. 
assert(Chain.getNode() && Chain.getValueType() == MVT::Other && @@ -5876,7 +5874,7 @@ SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { - SDValue Op = getValue(V); + SDValue Op = getNonRegisterValue(V); assert((Op.getOpcode() != ISD::CopyFromReg || cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) && "Copy from a reg to the same reg!"); @@ -5894,21 +5892,16 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // If this is the entry block, emit arguments. const Function &F = *LLVMBB->getParent(); SelectionDAG &DAG = SDB->DAG; - SDValue OldRoot = DAG.getRoot(); DebugLoc dl = SDB->getCurDebugLoc(); const TargetData *TD = TLI.getTargetData(); SmallVector<ISD::InputArg, 16> Ins; // Check whether the function can return without sret-demotion. - SmallVector<EVT, 4> OutVTs; - SmallVector<ISD::ArgFlagsTy, 4> OutsFlags; - getReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), - OutVTs, OutsFlags, TLI); - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); - - FLI.CanLowerReturn = TLI.CanLowerReturn(F.getCallingConv(), F.isVarArg(), - OutVTs, OutsFlags, DAG); - if (!FLI.CanLowerReturn) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + if (!FuncInfo->CanLowerReturn) { // Put in an sret pointer parameter before all the other parameters. SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); @@ -6002,7 +5995,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // Set up the argument values. unsigned i = 0; Idx = 1; - if (!FLI.CanLowerReturn) { + if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. SmallVector<EVT, 1> ValueVTs; @@ -6016,7 +6009,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); unsigned SRetReg = RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT)); - FLI.DemoteRegister = SRetReg; + FuncInfo->DemoteRegister = SRetReg; NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurDebugLoc(), SRetReg, ArgValue); DAG.setRoot(NewRoot); @@ -6032,6 +6025,12 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, I->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); + + // If this argument is unused then remember its value. It is used to generate + // debugging information. 
+ if (I->use_empty() && NumValues) + SDB->setUnusedArgValue(I, InVals[i]); + for (unsigned Value = 0; Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT); @@ -6112,17 +6111,20 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { if (const Constant *C = dyn_cast<Constant>(PHIOp)) { unsigned &RegOut = ConstantsOut[C]; if (RegOut == 0) { - RegOut = FuncInfo.CreateRegForValue(C); + RegOut = FuncInfo.CreateRegs(C->getType()); CopyValueToVirtualRegister(C, RegOut); } Reg = RegOut; } else { - Reg = FuncInfo.ValueMap[PHIOp]; - if (Reg == 0) { + DenseMap<const Value *, unsigned>::iterator I = + FuncInfo.ValueMap.find(PHIOp); + if (I != FuncInfo.ValueMap.end()) + Reg = I->second; + else { assert(isa<AllocaInst>(PHIOp) && FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) && "Didn't codegen value into a register!??"); - Reg = FuncInfo.CreateRegForValue(PHIOp); + Reg = FuncInfo.CreateRegs(PHIOp->getType()); CopyValueToVirtualRegister(PHIOp, Reg); } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 3fcd4b9..46733d6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -88,6 +88,10 @@ class SelectionDAGBuilder { DebugLoc CurDebugLoc; DenseMap<const Value*, SDValue> NodeMap; + + /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used + /// to preserve debug information for incoming arguments. + DenseMap<const Value*, SDValue> UnusedArgNodeMap; public: /// PendingLoads - Loads are not emitted to the program immediately. We bunch @@ -342,6 +346,8 @@ public: void visit(unsigned Opcode, const User &I); SDValue getValue(const Value *V); + SDValue getNonRegisterValue(const Value *V); + SDValue getValueImpl(const Value *V); void setValue(const Value *V, SDValue NewN) { SDValue &N = NodeMap[V]; @@ -349,6 +355,12 @@ public: N = NewN; } + void setUnusedArgValue(const Value *V, SDValue NewN) { + SDValue &N = UnusedArgNodeMap[V]; + assert(N.getNode() == 0 && "Already set a value for this node!"); + N = NewN; + } + void GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, std::set<unsigned> &OutputRegs, std::set<unsigned> &InputRegs); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 65b8d4f..08ba548 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -14,7 +14,7 @@ #define DEBUG_TYPE "isel" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" -#include "FunctionLoweringInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DebugInfo.h" @@ -171,7 +171,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) : MachineFunctionPass(&ID), TM(tm), TLI(*tm.getTargetLowering()), FuncInfo(new FunctionLoweringInfo(TLI)), - CurDAG(new SelectionDAG(tm, *FuncInfo)), + CurDAG(new SelectionDAG(tm)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), GFI(), OptLevel(OL), @@ -244,7 +244,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); CurDAG->init(*MF); - FuncInfo->set(Fn, *MF, EnableFastISel); + FuncInfo->set(Fn, *MF); SDB->init(GFI, *AA); SelectAllBasicBlocks(Fn); @@ -300,7 
+300,11 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { for (MachineBasicBlock::const_iterator II = MBB->begin(), IE = MBB->end(); II != IE; ++II) { const TargetInstrDesc &TID = TM.getInstrInfo()->get(II->getOpcode()); - if (II->isInlineAsm() || (TID.isCall() && !TID.isReturn())) { + + // Operand 1 of an inline asm instruction indicates whether the asm + // needs stack or not. + if ((II->isInlineAsm() && II->getOperand(1).getImm()) || + (TID.isCall() && !TID.isReturn())) { MFI->setHasCalls(true); goto done; } @@ -312,6 +316,26 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // Determine if there is a call to setjmp in the machine function. MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn)); + // Replace forward-declared registers with the registers containing + // the desired value. + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (DenseMap<unsigned, unsigned>::iterator + I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end(); + I != E; ++I) { + unsigned From = I->first; + unsigned To = I->second; + // If To is also scheduled to be replaced, find what its ultimate + // replacement is. + for (;;) { + DenseMap<unsigned, unsigned>::iterator J = + FuncInfo->RegFixups.find(To); + if (J == E) break; + To = J->second; + } + // Replace it. + MRI.replaceRegWith(From, To); + } + // Release function-specific state. SDB and CurDAG are already cleared // at this point. FuncInfo->clear(); @@ -319,10 +343,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { return true; } -MachineBasicBlock * -SelectionDAGISel::SelectBasicBlock(MachineBasicBlock *BB, - const BasicBlock *LLVMBB, - BasicBlock::const_iterator Begin, +void +SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { // Lower all of the non-terminator instructions. If a call is emitted @@ -337,7 +359,7 @@ SelectionDAGISel::SelectBasicBlock(MachineBasicBlock *BB, SDB->clear(); // Final step, emit the lowered DAG as machine code. - return CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } namespace { @@ -372,102 +394,6 @@ public: }; } -/// TrivialTruncElim - Eliminate some trivial nops that can result from -/// ShrinkDemandedOps: (trunc (ext n)) -> n. -static bool TrivialTruncElim(SDValue Op, - TargetLowering::TargetLoweringOpt &TLO) { - SDValue N0 = Op.getOperand(0); - EVT VT = Op.getValueType(); - if ((N0.getOpcode() == ISD::ZERO_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND || - N0.getOpcode() == ISD::ANY_EXTEND) && - N0.getOperand(0).getValueType() == VT) { - return TLO.CombineTo(Op, N0.getOperand(0)); - } - return false; -} - -/// ShrinkDemandedOps - A late transformation pass that shrink expressions -/// using TargetLowering::TargetLoweringOpt::ShrinkDemandedOp. It converts -/// x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. -void SelectionDAGISel::ShrinkDemandedOps() { - SmallVector<SDNode*, 128> Worklist; - SmallPtrSet<SDNode*, 128> InWorklist; - - // Add all the dag nodes to the worklist. 
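The RegFixups loop added above must handle chains: an entry's To register may itself be scheduled for replacement, so the loop walks forward until it reaches a register with no pending fixup. The same chasing logic over a plain std::map, as a self-contained sketch with illustrative names:

#include <map>

// Resolve a register through its From -> To fixups to the ultimate target
// before rewriting, so a chain like 5 -> 7, 7 -> 9 ends with 5 mapped to 9.
unsigned resolveFixup(const std::map<unsigned, unsigned> &Fixups,
                      unsigned Reg) {
  std::map<unsigned, unsigned>::const_iterator J = Fixups.find(Reg);
  while (J != Fixups.end()) {
    Reg = J->second;        // follow the forwarded replacement
    J = Fixups.find(Reg);
  }
  return Reg;
}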
- Worklist.reserve(CurDAG->allnodes_size()); - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - Worklist.push_back(I); - InWorklist.insert(I); - } - - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true, true); - while (!Worklist.empty()) { - SDNode *N = Worklist.pop_back_val(); - InWorklist.erase(N); - - if (N->use_empty() && N != CurDAG->getRoot().getNode()) { - // Deleting this node may make its operands dead, add them to the worklist - // if they aren't already there. - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - if (InWorklist.insert(N->getOperand(i).getNode())) - Worklist.push_back(N->getOperand(i).getNode()); - - CurDAG->DeleteNode(N); - continue; - } - - // Run ShrinkDemandedOp on scalar binary operations. - if (N->getNumValues() != 1 || - !N->getValueType(0).isSimple() || !N->getValueType(0).isInteger()) - continue; - - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Demanded = APInt::getAllOnesValue(BitWidth); - APInt KnownZero, KnownOne; - if (!TLI.SimplifyDemandedBits(SDValue(N, 0), Demanded, - KnownZero, KnownOne, TLO) && - (N->getOpcode() != ISD::TRUNCATE || - !TrivialTruncElim(SDValue(N, 0), TLO))) - continue; - - // Revisit the node. - assert(!InWorklist.count(N) && "Already in worklist"); - Worklist.push_back(N); - InWorklist.insert(N); - - // Replace the old value with the new one. - DEBUG(errs() << "\nShrinkDemandedOps replacing "; - TLO.Old.getNode()->dump(CurDAG); - errs() << "\nWith: "; - TLO.New.getNode()->dump(CurDAG); - errs() << '\n'); - - if (InWorklist.insert(TLO.New.getNode())) - Worklist.push_back(TLO.New.getNode()); - - SDOPsWorkListRemover DeadNodes(Worklist, InWorklist); - CurDAG->ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes); - - if (!TLO.Old.getNode()->use_empty()) continue; - - for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); - i != e; ++i) { - SDNode *OpNode = TLO.Old.getNode()->getOperand(i).getNode(); - if (OpNode->hasOneUse()) { - // Add OpNode to the end of the list to revisit. - DeadNodes.RemoveFromWorklist(OpNode); - Worklist.push_back(OpNode); - InWorklist.insert(OpNode); - } - } - - DeadNodes.RemoveFromWorklist(TLO.Old.getNode()); - CurDAG->DeleteNode(TLO.Old.getNode()); - } -} - void SelectionDAGISel::ComputeLiveOutVRegInfo() { SmallPtrSet<SDNode*, 128> VisitedNodes; SmallVector<SDNode*, 128> Worklist; @@ -522,7 +448,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { } while (!Worklist.empty()); } -MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { +void SelectionDAGISel::CodeGenAndEmitDAG() { std::string GroupName; if (TimePassesIsEnabled) GroupName = "Instruction Selection and Scheduling"; @@ -531,23 +457,19 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs || ViewSUnitDAGs) BlockName = MF->getFunction()->getNameStr() + ":" + - BB->getBasicBlock()->getNameStr(); + FuncInfo->MBB->getBasicBlock()->getNameStr(); - DEBUG(dbgs() << "Initial selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Initial selection DAG:\n"; CurDAG->dump()); if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName); // Run the DAG combiner in pre-legalize mode. 
- if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining 1", GroupName); - CurDAG->Combine(Unrestricted, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining 1", GroupName, TimePassesIsEnabled); CurDAG->Combine(Unrestricted, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized lowered selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized lowered selection DAG:\n"; CurDAG->dump()); // Second step, hack on the DAG until it only uses operations and types that // the target supports. @@ -555,44 +477,36 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { BlockName); bool Changed; - if (TimePassesIsEnabled) { - NamedRegionTimer T("Type Legalization", GroupName); - Changed = CurDAG->LegalizeTypes(); - } else { + { + NamedRegionTimer T("Type Legalization", GroupName, TimePassesIsEnabled); Changed = CurDAG->LegalizeTypes(); } - DEBUG(dbgs() << "Type-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Type-legalized selection DAG:\n"; CurDAG->dump()); if (Changed) { if (ViewDAGCombineLT) CurDAG->viewGraph("dag-combine-lt input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining after legalize types", GroupName); - CurDAG->Combine(NoIllegalTypes, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining after legalize types", GroupName, + TimePassesIsEnabled); CurDAG->Combine(NoIllegalTypes, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized type-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized type-legalized selection DAG:\n"; + CurDAG->dump()); } - if (TimePassesIsEnabled) { - NamedRegionTimer T("Vector Legalization", GroupName); - Changed = CurDAG->LegalizeVectors(); - } else { + { + NamedRegionTimer T("Vector Legalization", GroupName, TimePassesIsEnabled); Changed = CurDAG->LegalizeVectors(); } if (Changed) { - if (TimePassesIsEnabled) { - NamedRegionTimer T("Type Legalization 2", GroupName); - CurDAG->LegalizeTypes(); - } else { + { + NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } @@ -600,95 +514,79 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { CurDAG->viewGraph("dag-combine-lv input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining after legalize vectors", GroupName); - CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining after legalize vectors", GroupName, + TimePassesIsEnabled); CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized vector-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized vector-legalized selection DAG:\n"; + CurDAG->dump()); } if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName); - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Legalization", GroupName); - CurDAG->Legalize(OptLevel); - } else { + { + NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled); CurDAG->Legalize(OptLevel); } - DEBUG(dbgs() << "Legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Legalized selection DAG:\n"; CurDAG->dump()); if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName); // Run the DAG combiner in post-legalize mode. 
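The repeated rewrite above is one refactor applied many times: instead of duplicating the timed and untimed call in an if/else, the work runs once inside a scope whose NamedRegionTimer takes TimePassesIsEnabled as a constructor argument and becomes a no-op when timing is off. A minimal stand-in for that RAII shape; ScopedTimer is hypothetical, NamedRegionTimer is the real class:

#include <cstdio>
#include <ctime>

// No-op when Enabled is false, otherwise reports the scope's duration on
// destruction -- the property the patch relies on to collapse the if/else.
struct ScopedTimer {
  const char *Name;
  bool Enabled;
  std::clock_t Start;
  ScopedTimer(const char *N, bool E) : Name(N), Enabled(E), Start(0) {
    if (Enabled) Start = std::clock();
  }
  ~ScopedTimer() {
    if (Enabled)
      std::fprintf(stderr, "%s: %ld clocks\n", Name,
                   (long)(std::clock() - Start));
  }
};

void combineOnce(bool TimePassesIsEnabled) {
  ScopedTimer T("DAG Combining", TimePassesIsEnabled);
  // ... the single call to CurDAG->Combine(...) goes here ...
}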
- if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining 2", GroupName); - CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled); CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized legalized selection DAG:\n"; CurDAG->dump()); - if (OptLevel != CodeGenOpt::None) { - ShrinkDemandedOps(); + if (OptLevel != CodeGenOpt::None) ComputeLiveOutVRegInfo(); - } if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName); // Third, instruction select all of the operations to machine code, adding the // code to the MachineBasicBlock. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Selection", GroupName); - DoInstructionSelection(); - } else { + { + NamedRegionTimer T("Instruction Selection", GroupName, TimePassesIsEnabled); DoInstructionSelection(); } - DEBUG(dbgs() << "Selected selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Selected selection DAG:\n"; CurDAG->dump()); if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName); // Schedule machine code. ScheduleDAGSDNodes *Scheduler = CreateScheduler(); - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Scheduling", GroupName); - Scheduler->Run(CurDAG, BB, BB->end()); - } else { - Scheduler->Run(CurDAG, BB, BB->end()); + { + NamedRegionTimer T("Instruction Scheduling", GroupName, + TimePassesIsEnabled); + Scheduler->Run(CurDAG, FuncInfo->MBB, FuncInfo->InsertPt); } if (ViewSUnitDAGs) Scheduler->viewGraph(); // Emit machine code to BB. This can change 'BB' to the last block being // inserted into. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Creation", GroupName); - BB = Scheduler->EmitSchedule(); - } else { - BB = Scheduler->EmitSchedule(); + { + NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled); + + FuncInfo->MBB = Scheduler->EmitSchedule(); + FuncInfo->InsertPt = Scheduler->InsertPos; } // Free the scheduler state. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName); - delete Scheduler; - } else { + { + NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName, + TimePassesIsEnabled); delete Scheduler; } // Free the SelectionDAG state, now that we're finished with it. CurDAG->clear(); - - return BB; } void SelectionDAGISel::DoInstructionSelection() { @@ -750,21 +648,22 @@ void SelectionDAGISel::DoInstructionSelection() { /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. -void SelectionDAGISel::PrepareEHLandingPad(MachineBasicBlock *BB) { +void SelectionDAGISel::PrepareEHLandingPad() { // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. - MCSymbol *Label = MF->getMMI().addLandingPad(BB); + MCSymbol *Label = MF->getMMI().addLandingPad(FuncInfo->MBB); const TargetInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL); - BuildMI(BB, SDB->getCurDebugLoc(), II).addSym(Label); + BuildMI(*FuncInfo->MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) + .addSym(Label); // Mark exception register as live in. unsigned Reg = TLI.getExceptionAddressRegister(); - if (Reg) BB->addLiveIn(Reg); + if (Reg) FuncInfo->MBB->addLiveIn(Reg); // Mark exception selector register as live in. 
Reg = TLI.getExceptionSelectorRegister(); - if (Reg) BB->addLiveIn(Reg); + if (Reg) FuncInfo->MBB->addLiveIn(Reg); // FIXME: Hack around an exception handling flaw (PR1508): the personality // function and list of typeids logically belong to the invoke (or, if you @@ -777,7 +676,7 @@ void SelectionDAGISel::PrepareEHLandingPad(MachineBasicBlock *BB) { // in exceptions not being caught because no typeids are associated with // the invoke. This may not be the only way things can go wrong, but it // is the only way we try to work around for the moment. - const BasicBlock *LLVMBB = BB->getBasicBlock(); + const BasicBlock *LLVMBB = FuncInfo->MBB->getBasicBlock(); const BranchInst *Br = dyn_cast<BranchInst>(LLVMBB->getTerminator()); if (Br && Br->isUnconditional()) { // Critical edge? @@ -796,83 +695,100 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Initialize the Fast-ISel state, if needed. FastISel *FastIS = 0; if (EnableFastISel) - FastIS = TLI.createFastISel(*MF, FuncInfo->ValueMap, FuncInfo->MBBMap, - FuncInfo->StaticAllocaMap, - FuncInfo->PHINodesToUpdate -#ifndef NDEBUG - , FuncInfo->CatchInfoLost -#endif - ); + FastIS = TLI.createFastISel(*FuncInfo); // Iterate over all basic blocks in the function. for (Function::const_iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) { const BasicBlock *LLVMBB = &*I; - MachineBasicBlock *BB = FuncInfo->MBBMap[LLVMBB]; + FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; + FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI(); BasicBlock::const_iterator const End = LLVMBB->end(); - BasicBlock::const_iterator BI = Begin; + BasicBlock::const_iterator BI = End; + FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); + + // Setup an EH landing-pad block. + if (FuncInfo->MBB->isLandingPad()) + PrepareEHLandingPad(); + // Lower any arguments needed in this block if this is the entry block. if (LLVMBB == &Fn.getEntryBlock()) LowerArguments(LLVMBB); - // Setup an EH landing-pad block. - if (BB->isLandingPad()) - PrepareEHLandingPad(BB); - // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { + FastIS->startNewBlock(); + // Emit code for any incoming arguments. This must happen before // beginning FastISel on the entry block. if (LLVMBB == &Fn.getEntryBlock()) { CurDAG->setRoot(SDB->getControlRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); + + // If we inserted any instructions at the beginning, make a note of + // where they are, so we can be sure to emit subsequent instructions + // after them. + if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) + FastIS->setLastLocalValue(llvm::prior(FuncInfo->InsertPt)); + else + FastIS->setLastLocalValue(0); } - FastIS->startNewBlock(BB); + // Do FastISel on as many instructions as possible. - for (; BI != End; ++BI) { + for (; BI != Begin; --BI) { + const Instruction *Inst = llvm::prior(BI); + + // If we no longer require this instruction, skip it. + if (!Inst->mayWriteToMemory() && + !isa<TerminatorInst>(Inst) && + !isa<DbgInfoIntrinsic>(Inst) && + !FuncInfo->isExportedInst(Inst)) + continue; + + // Bottom-up: reset the insert pos at the top, after any local-value + // instructions. + FastIS->recomputeInsertPt(); + // Try to select the instruction with FastISel. - if (FastIS->SelectInstruction(BI)) + if (FastIS->SelectInstruction(Inst)) continue; // Then handle certain instructions as single-LLVM-Instruction blocks. 
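
The FastISel loop above now runs bottom-up: BI starts at End, each step inspects llvm::prior(BI), and instructions that are no longer required (no memory writes, not terminators or debug intrinsics, not exported) are skipped outright. The shape of that reverse scan over a half-open range, with a hypothetical isNeeded predicate standing in for the real checks:

#include <vector>

struct Inst { bool HasSideEffects; bool IsUsed; };

// Stand-in for the mayWriteToMemory/terminator/exported-value tests.
static bool isNeeded(const Inst &I) { return I.HasSideEffects || I.IsUsed; }

static void scanBottomUp(std::vector<Inst> &Block, void (*Select)(Inst &)) {
  for (std::vector<Inst>::iterator BI = Block.end(); BI != Block.begin();) {
    --BI;               // look at the instruction just above the cursor
    if (!isNeeded(*BI))
      continue;         // dead by the time we reach it: skip
    Select(*BI);
  }
}

One motivation for the bottom-up order is that an instruction's users are visited before the instruction itself, so results nobody demanded can be recognized and skipped, exactly as the loop above does.
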
- if (isa<CallInst>(BI)) { + if (isa<CallInst>(Inst)) { ++NumFastIselFailures; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel missed call: "; - BI->dump(); + Inst->dump(); } - if (!BI->getType()->isVoidTy() && !BI->use_empty()) { - unsigned &R = FuncInfo->ValueMap[BI]; + if (!Inst->getType()->isVoidTy() && !Inst->use_empty()) { + unsigned &R = FuncInfo->ValueMap[Inst]; if (!R) - R = FuncInfo->CreateRegForValue(BI); + R = FuncInfo->CreateRegs(Inst->getType()); } bool HadTailCall = false; - BB = SelectBasicBlock(BB, LLVMBB, BI, llvm::next(BI), HadTailCall); + SelectBasicBlock(Inst, BI, HadTailCall); // If the call was emitted as a tail call, we're done with the block. if (HadTailCall) { - BI = End; + --BI; break; } - // If the instruction was codegen'd with multiple blocks, - // inform the FastISel object where to resume inserting. - FastIS->setCurrentBlock(BB); continue; } // Otherwise, give up on FastISel for the rest of the block. // For now, be a little lenient about non-branch terminators. - if (!isa<TerminatorInst>(BI) || isa<BranchInst>(BI)) { + if (!isa<TerminatorInst>(Inst) || isa<BranchInst>(Inst)) { ++NumFastIselFailures; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel miss: "; - BI->dump(); + Inst->dump(); } if (EnableFastISelAbort) // The "fast" selector couldn't handle something and bailed. @@ -881,17 +797,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } break; } + + FastIS->recomputeInsertPt(); } // Run SelectionDAG instruction selection on the remainder of the block // not handled by FastISel. If FastISel is not run, this is the entire // block. - if (BI != End) { - bool HadTailCall; - BB = SelectBasicBlock(BB, LLVMBB, BI, End, HadTailCall); - } + bool HadTailCall; + SelectBasicBlock(Begin, BI, HadTailCall); - FinishBasicBlock(BB); + FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); } @@ -899,11 +815,11 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } void -SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { +SelectionDAGISel::FinishBasicBlock() { DEBUG(dbgs() << "Total amount of phi nodes to update: " - << FuncInfo->PHINodesToUpdate.size() << "\n"); - DEBUG(for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) + << FuncInfo->PHINodesToUpdate.size() << "\n"; + for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) dbgs() << "Node " << i << " : (" << FuncInfo->PHINodesToUpdate[i].first << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n"); @@ -917,11 +833,11 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first; assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); - if (!BB->isSuccessor(PHI->getParent())) + if (!FuncInfo->MBB->isSuccessor(PHI->getParent())) continue; PHI->addOperand( MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } return; } @@ -930,33 +846,35 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Lower header first, if it wasn't already lowered if (!SDB->BitTestCases[i].Emitted) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->BitTestCases[i].Parent; + FuncInfo->MBB = SDB->BitTestCases[i].Parent; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code - SDB->visitBitTestHeader(SDB->BitTestCases[i], BB); + 
SDB->visitBitTestHeader(SDB->BitTestCases[i], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->BitTestCases[i].Cases[j].ThisBB; + FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code if (j+1 != ej) SDB->visitBitTestCase(SDB->BitTestCases[i].Cases[j+1].ThisBB, SDB->BitTestCases[i].Reg, SDB->BitTestCases[i].Cases[j], - BB); + FuncInfo->MBB); else SDB->visitBitTestCase(SDB->BitTestCases[i].Default, SDB->BitTestCases[i].Reg, SDB->BitTestCases[i].Cases[j], - BB); + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } // Update PHI Nodes @@ -1001,22 +919,24 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Lower header first, if it wasn't already lowered if (!SDB->JTCases[i].first.Emitted) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->JTCases[i].first.HeaderBB; + FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first, - BB); + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->JTCases[i].second.MBB; + FuncInfo->MBB = SDB->JTCases[i].second.MBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTable(SDB->JTCases[i].second); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); // Update PHI Nodes for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); @@ -1034,11 +954,11 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { (MachineOperand::CreateMBB(SDB->JTCases[i].first.HeaderBB)); } // JT BB. Just iterate over successors here - if (BB->isSuccessor(PHIBB)) { + if (FuncInfo->MBB->isSuccessor(PHIBB)) { PHI->addOperand (MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[pi].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } } } @@ -1050,10 +970,10 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first; assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); - if (BB->isSuccessor(PHI->getParent())) { + if (FuncInfo->MBB->isSuccessor(PHI->getParent())) { PHI->addOperand( MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } } @@ -1061,7 +981,8 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // additional DAGs necessary. for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) { // Set the current basic block to the mbb we wish to insert the code into - MachineBasicBlock *ThisBB = BB = SDB->SwitchCases[i].ThisBB; + MachineBasicBlock *ThisBB = FuncInfo->MBB = SDB->SwitchCases[i].ThisBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Determine the unique successors. SmallVector<MachineBasicBlock *, 2> Succs; @@ -1071,21 +992,24 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Emit the code. 
Note that this could result in ThisBB being split, so // we need to check for updates. - SDB->visitSwitchCase(SDB->SwitchCases[i], BB); + SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - ThisBB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); + ThisBB = FuncInfo->MBB; // Handle any PHI nodes in successors of this chunk, as if we were coming // from the original BB before switch expansion. Note that PHI nodes can // occur multiple times in PHINodesToUpdate. We have to be very careful to // handle them the right number of times. for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - BB = Succs[i]; - // BB may have been removed from the CFG if a branch was constant folded. - if (ThisBB->isSuccessor(BB)) { - for (MachineBasicBlock::iterator Phi = BB->begin(); - Phi != BB->end() && Phi->isPHI(); + FuncInfo->MBB = Succs[i]; + FuncInfo->InsertPt = FuncInfo->MBB->end(); + // FuncInfo->MBB may have been removed from the CFG if a branch was + // constant folded. + if (ThisBB->isSuccessor(FuncInfo->MBB)) { + for (MachineBasicBlock::iterator Phi = FuncInfo->MBB->begin(); + Phi != FuncInfo->MBB->end() && Phi->isPHI(); ++Phi) { // This value for this PHI node is recorded in PHINodesToUpdate. for (unsigned pn = 0; ; ++pn) { @@ -1205,6 +1129,7 @@ SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops) { Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0 Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1 Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc + Ops.push_back(InOps[InlineAsm::Op_IsAlignStack]); // 3 unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size(); if (InOps[e-1].getValueType() == MVT::Flag) @@ -1701,7 +1626,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, SDValue(Res, ResNumResults-1)); if ((EmitNodeInfo & OPFL_FlagOutput) != 0) - --ResNumResults; + --ResNumResults; // Move the chain reference if needed. 
if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index 3786bd1..6cae804 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -278,7 +278,7 @@ std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
 FlaggedNodes.push_back(N);
 while (!FlaggedNodes.empty()) {
 O << DOTGraphTraits<SelectionDAG*>
- ::getSimpleNodeLabel(FlaggedNodes.back(), DAG);
+ ::getSimpleNodeLabel(FlaggedNodes.back(), DAG);
 FlaggedNodes.pop_back();
 if (!FlaggedNodes.empty())
 O << "\n ";
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 44a80d3..4f38669 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -261,6 +262,38 @@ static void InitLibcallNames(const char **Names) {
 Names[RTLIB::MEMMOVE] = "memmove";
 Names[RTLIB::MEMSET] = "memset";
 Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+ Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
+ Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
+ Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
+ Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+ Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
+ Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
+ Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
+ Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
}
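
The new RTLIB entries follow the GCC convention for __sync_* builtins: each operation gets one symbol per access width, suffixed with the size in bytes (1, 2, 4, 8). A small illustration of deriving such names instead of writing them out by hand; the op list and helper here are illustrative, not LLVM API:

#include <cstdio>
#include <string>

static const char *const SyncOps[] = {
  "val_compare_and_swap", "lock_test_and_set", "fetch_and_add",
  "fetch_and_sub", "fetch_and_and", "fetch_and_or",
  "fetch_and_xor", "fetch_and_nand",
};

// __sync_<op>_<size>, where <size> is the operand width in bytes.
static std::string syncName(const char *Op, unsigned Bytes) {
  return "__sync_" + std::string(Op) + "_" + std::to_string(Bytes);
}

int main() {
  for (unsigned i = 0; i != sizeof(SyncOps) / sizeof(SyncOps[0]); ++i)
    for (unsigned Bytes = 1; Bytes <= 8; Bytes *= 2)
      std::printf("%s\n", syncName(SyncOps[i], Bytes).c_str());
  return 0;
}

Generating the names mechanically avoids the one-character slips that hand-written 32-entry string tables invite.
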
/// InitLibcallCallingConvs - Set default libcall CallingConvs. @@ -546,9 +579,9 @@ TargetLowering::TargetLowering(const TargetMachine &tm, SchedPreferenceInfo = Sched::Latency; JumpBufSize = 0; JumpBufAlignment = 0; - IfCvtBlockSizeLimit = 2; - IfCvtDupBlockSizeLimit = 0; PrefLoopAlignment = 0; + MinStackArgumentAlignment = 1; + ShouldFoldAtomicFences = false; InitLibcallNames(LibcallRoutineNames); InitCmpLibcallCCs(CmpLibcallCCs); @@ -578,9 +611,9 @@ bool TargetLowering::canOpTrap(unsigned Op, EVT VT) const { static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, - unsigned &NumIntermediates, - EVT &RegisterVT, - TargetLowering* TLI) { + unsigned &NumIntermediates, + EVT &RegisterVT, + TargetLowering *TLI) { // Figure out the right, legal destination reg to copy into. unsigned NumElts = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType(); @@ -610,16 +643,12 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, EVT DestVT = TLI->getRegisterType(NewVT); RegisterVT = DestVT; - if (EVT(DestVT).bitsLT(NewVT)) { - // Value is expanded, e.g. i64 -> i16. + if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits()); - } else { - // Otherwise, promotion or legal types use the same number of registers as - // the vector decimated to the appropriate level. - return NumVectorRegs; - } - return 1; + // Otherwise, promotion or legal types use the same number of registers as + // the vector decimated to the appropriate level. + return NumVectorRegs; } /// computeRegisterProperties - Once all of the register classes are added, @@ -705,39 +734,39 @@ void TargetLowering::computeRegisterProperties() { for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; - if (!isTypeLegal(VT)) { - MVT IntermediateVT; - EVT RegisterVT; - unsigned NumIntermediates; - NumRegistersForVT[i] = - getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, - RegisterVT, this); - RegisterTypeForVT[i] = RegisterVT; - - // Determine if there is a legal wider type. - bool IsLegalWiderType = false; - EVT EltVT = VT.getVectorElementType(); - unsigned NElts = VT.getVectorNumElements(); - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - EVT SVT = (MVT::SimpleValueType)nVT; - if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT && - SVT.getVectorNumElements() > NElts && NElts != 1) { - TransformToType[i] = SVT; - ValueTypeActions.setTypeAction(VT, Promote); - IsLegalWiderType = true; - break; - } + if (isTypeLegal(VT)) continue; + + MVT IntermediateVT; + EVT RegisterVT; + unsigned NumIntermediates; + NumRegistersForVT[i] = + getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, + RegisterVT, this); + RegisterTypeForVT[i] = RegisterVT; + + // Determine if there is a legal wider type. + bool IsLegalWiderType = false; + EVT EltVT = VT.getVectorElementType(); + unsigned NElts = VT.getVectorNumElements(); + for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + EVT SVT = (MVT::SimpleValueType)nVT; + if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT && + SVT.getVectorNumElements() > NElts && NElts != 1) { + TransformToType[i] = SVT; + ValueTypeActions.setTypeAction(VT, Promote); + IsLegalWiderType = true; + break; } - if (!IsLegalWiderType) { - EVT NVT = VT.getPow2VectorType(); - if (NVT == VT) { - // Type is already a power of 2. 
The default action is to split. - TransformToType[i] = MVT::Other; - ValueTypeActions.setTypeAction(VT, Expand); - } else { - TransformToType[i] = NVT; - ValueTypeActions.setTypeAction(VT, Promote); - } + } + if (!IsLegalWiderType) { + EVT NVT = VT.getPow2VectorType(); + if (NVT == VT) { + // Type is already a power of 2. The default action is to split. + TransformToType[i] = MVT::Other; + ValueTypeActions.setTypeAction(VT, Expand); + } else { + TransformToType[i] = NVT; + ValueTypeActions.setTypeAction(VT, Promote); } } } @@ -811,6 +840,65 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, return 1; } +/// Get the EVTs and ArgFlags collections that represent the legalized return +/// type of the given function. This does not require a DAG or a return value, +/// and is suitable for use before any DAGs for the function are constructed. +/// TODO: Move this out of TargetLowering.cpp. +void llvm::GetReturnInfo(const Type* ReturnType, Attributes attr, + SmallVectorImpl<ISD::OutputArg> &Outs, + const TargetLowering &TLI, + SmallVectorImpl<uint64_t> *Offsets) { + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(TLI, ReturnType, ValueVTs); + unsigned NumValues = ValueVTs.size(); + if (NumValues == 0) return; + unsigned Offset = 0; + + for (unsigned j = 0, f = NumValues; j != f; ++j) { + EVT VT = ValueVTs[j]; + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + + if (attr & Attribute::SExt) + ExtendKind = ISD::SIGN_EXTEND; + else if (attr & Attribute::ZExt) + ExtendKind = ISD::ZERO_EXTEND; + + // FIXME: C calling convention requires the return type to be promoted to + // at least 32-bit. But this is not necessary for non-C calling + // conventions. The frontend should mark functions whose return values + // require promoting with signext or zeroext attributes. + if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { + EVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); + if (VT.bitsLT(MinVT)) + VT = MinVT; + } + + unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); + EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); + unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( + PartVT.getTypeForEVT(ReturnType->getContext())); + + // 'inreg' on function refers to return value + ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); + if (attr & Attribute::InReg) + Flags.setInReg(); + + // Propagate extension type if any + if (attr & Attribute::SExt) + Flags.setSExt(); + else if (attr & Attribute::ZExt) + Flags.setZExt(); + + for (unsigned i = 0; i < NumParts; ++i) { + Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true)); + if (Offsets) { + Offsets->push_back(Offset); + Offset += PartSize; + } + } + } +} + /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual /// alignment, not its logarithm. @@ -1042,7 +1130,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // Output known-1 bits are only known if set in both the LHS & RHS. @@ -1076,7 +1164,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (TLO.ShrinkDemandedConstant(Op, NewMask)) return true; // If the operation can be done in a smaller type, do so. 
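
SimplifyDemandedBits asks the same question at each bitwise operator: if the caller only demands the low bits, can the operation be evaluated in a narrower type? (TLO.ShrinkDemandedOp, consulted in the hunks just below, performs the rewrite.) The underlying transformation, sketched on plain integers:

#include <cstdint>

// If only the low 16 bits of a 32-bit AND are demanded, the same demanded
// bits can be produced by a 16-bit AND that is then widened again; the
// high half of the result was never going to be looked at.
uint32_t andDemandedLow16(uint32_t A, uint32_t B) {
  uint16_t Narrow = (uint16_t)(A & B);
  return (uint32_t)Narrow;
}
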
- if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // Output known-0 bits are only known if clear in both the LHS & RHS. @@ -1101,7 +1189,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if ((KnownZero2 & NewMask) == NewMask) return TLO.CombineTo(Op, Op.getOperand(1)); // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // If all of the unknown bits are known to be zero on one side or the other @@ -1498,13 +1586,17 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, break; } case ISD::AssertZext: { - EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - APInt InMask = APInt::getLowBitsSet(BitWidth, - VT.getSizeInBits()); - if (SimplifyDemandedBits(Op.getOperand(0), InMask & NewMask, + // Demand all the bits of the input that are demanded in the output. + // The low bits are obvious; the high bits are demanded because we're + // asserting that they're zero here. + if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero, KnownOne, TLO, Depth+1)) return true; assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + + EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + APInt InMask = APInt::getLowBitsSet(BitWidth, + VT.getSizeInBits()); KnownZero |= ~InMask & NewMask; break; } @@ -1544,7 +1636,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, KnownOne2, TLO, Depth+1)) return true; // See if the operation should be performed at a smaller bit width. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; } // FALL THROUGH @@ -2346,7 +2438,6 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ /// vector. If it is invalid, don't add anything to Ops. void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { switch (ConstraintLetter) { @@ -2384,7 +2475,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstraintLetter != 'n') { int64_t Offs = GA->getOffset(); if (C) Offs += C->getZExtValue(); - Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), + Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), + C->getDebugLoc(), Op.getValueType(), Offs)); return; } @@ -2507,18 +2599,18 @@ static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { /// 'm' over 'r', for example. /// static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, - bool hasMemory, const TargetLowering &TLI, + const TargetLowering &TLI, SDValue Op, SelectionDAG *DAG) { assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options"); unsigned BestIdx = 0; TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown; int BestGenerality = -1; - + // Loop over the options, keeping track of the most general one. for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) { TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[i]); - + // If this is an 'other' constraint, see if the operand is valid for it. // For example, on X86 we might have an 'rI' constraint. 
If the operand // is an integer in the range [0..31] we want to use I (saving a load @@ -2527,7 +2619,7 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, assert(OpInfo.Codes[i].size() == 1 && "Unhandled multi-letter 'other' constraint"); std::vector<SDValue> ResultOps; - TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], hasMemory, + TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], ResultOps, *DAG); if (!ResultOps.empty()) { BestType = CType; @@ -2536,6 +2628,11 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, } } + // Things with matching constraints can only be registers, per gcc + // documentation. This mainly affects "g" constraints. + if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput()) + continue; + // This constraint letter is more general than the previous one, use it. int Generality = getConstraintGenerality(CType); if (Generality > BestGenerality) { @@ -2554,7 +2651,6 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, /// OpInfo.ConstraintCode and OpInfo.ConstraintType. void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, - bool hasMemory, SelectionDAG *DAG) const { assert(!OpInfo.Codes.empty() && "Must have at least one constraint"); @@ -2563,7 +2659,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, OpInfo.ConstraintCode = OpInfo.Codes[0]; OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode); } else { - ChooseConstraint(OpInfo, hasMemory, *this, Op, DAG); + ChooseConstraint(OpInfo, *this, Op, DAG); } // 'X' matches anything. diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index 5240bef..6ab0cb0 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/GCStrategy.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/IRBuilder.h" using namespace llvm; @@ -158,7 +159,8 @@ namespace { // Create a new invoke instruction. 
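
The ShadowStackGC hunk below replaces raw operand indexing on a call with CallSite's argument iterators. In the IR of this period a call's operand list includes the callee as well as the arguments, so argument i is not operand i; the getArgOperand/arg_begin accessors hide that offset, as the getOperand(i+1) to getArgOperand(i) rewrites throughout this change show. A toy version of the distinction:

#include <vector>

// Toy call node: operand 0 is the callee, arguments follow it. Indexing
// raw operands off by one is exactly the bug class the accessors rule out.
struct Call {
  std::vector<int> Operands; // Operands[0] = callee, Operands[1..] = args

  int getArgOperand(unsigned i) const { return Operands[i + 1]; }
  std::vector<int>::const_iterator arg_begin() const {
    return Operands.begin() + 1;
  }
  std::vector<int>::const_iterator arg_end() const { return Operands.end(); }
};
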
Args.clear(); - Args.append(CI->op_begin() + 1, CI->op_end()); + CallSite CS(CI); + Args.append(CS.arg_begin(), CS.arg_end()); InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), NewBB, CleanupBB, @@ -194,7 +196,7 @@ Constant *ShadowStackGC::GetFrameMap(Function &F) { unsigned NumMeta = 0; SmallVector<Constant*,16> Metadata; for (unsigned I = 0; I != Roots.size(); ++I) { - Constant *C = cast<Constant>(Roots[I].first->getOperand(2)); + Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1)); if (!C->isNullValue()) NumMeta = I + 1; Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr)); @@ -322,16 +324,16 @@ void ShadowStackGC::CollectRoots(Function &F) { assert(Roots.empty() && "Not cleaned up?"); - SmallVector<std::pair<CallInst*,AllocaInst*>,16> MetaRoots; + SmallVector<std::pair<CallInst*, AllocaInst*>, 16> MetaRoots; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) if (Function *F = CI->getCalledFunction()) if (F->getIntrinsicID() == Intrinsic::gcroot) { - std::pair<CallInst*,AllocaInst*> Pair = std::make_pair( - CI, cast<AllocaInst>(CI->getOperand(1)->stripPointerCasts())); - if (IsNullValue(CI->getOperand(2))) + std::pair<CallInst*, AllocaInst*> Pair = std::make_pair( + CI, cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts())); + if (IsNullValue(CI->getArgOperand(1))) Roots.push_back(Pair); else MetaRoots.push_back(Pair); diff --git a/lib/CodeGen/SimpleHazardRecognizer.h b/lib/CodeGen/SimpleHazardRecognizer.h deleted file mode 100644 index f69feaf..0000000 --- a/lib/CodeGen/SimpleHazardRecognizer.h +++ /dev/null @@ -1,89 +0,0 @@ -//=- llvm/CodeGen/SimpleHazardRecognizer.h - Scheduling Support -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SimpleHazardRecognizer class, which -// implements hazard-avoidance heuristics for scheduling, based on the -// scheduling itineraries specified for the target. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_SIMPLEHAZARDRECOGNIZER_H -#define LLVM_CODEGEN_SIMPLEHAZARDRECOGNIZER_H - -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - /// SimpleHazardRecognizer - A *very* simple hazard recognizer. It uses - /// a coarse classification and attempts to avoid that instructions of - /// a given class aren't grouped too densely together. - class SimpleHazardRecognizer : public ScheduleHazardRecognizer { - /// Class - A simple classification for SUnits. - enum Class { - Other, Load, Store - }; - - /// Window - The Class values of the most recently issued - /// instructions. - Class Window[8]; - - /// getClass - Classify the given SUnit. - Class getClass(const SUnit *SU) { - const MachineInstr *MI = SU->getInstr(); - const TargetInstrDesc &TID = MI->getDesc(); - if (TID.mayLoad()) - return Load; - if (TID.mayStore()) - return Store; - return Other; - } - - /// Step - Rotate the existing entries in Window and insert the - /// given class value in position as the most recent. 
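
The header being deleted here implemented hazard avoidance with a fixed window of the eight most recently issued instruction classes: Step (below) rotates the window, and getHazardType scores how densely the candidate's class already occupies it. A self-contained sketch of that scheme:

#include <algorithm>
#include <cstddef>

enum Class { Other, Load, Store };

class WindowHazard {
  static const std::size_t N = 8;
  Class Window[N]; // Window[N-1] is the most recently issued class
public:
  WindowHazard() { std::fill(Window, Window + N, Other); }

  void step(Class C) { // rotate left and insert the newest at the end
    std::copy(Window + 1, Window + N, Window);
    Window[N - 1] = C;
  }

  // More recent slots weigh more; report a hazard once the candidate's
  // class is packed too densely into the window.
  bool isHazard(Class C) const {
    if (C == Other)
      return false;
    unsigned Score = 0;
    for (std::size_t i = 0; i != N; ++i)
      if (Window[i] == C)
        Score += i + 1;
    return Score > N * 2;
  }
};
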
- void Step(Class C) { - std::copy(Window+1, array_endof(Window), Window); - Window[array_lengthof(Window)-1] = C; - } - - public: - SimpleHazardRecognizer() : Window() { - Reset(); - } - - virtual HazardType getHazardType(SUnit *SU) { - Class C = getClass(SU); - if (C == Other) - return NoHazard; - unsigned Score = 0; - for (unsigned i = 0; i != array_lengthof(Window); ++i) - if (Window[i] == C) - Score += i + 1; - if (Score > array_lengthof(Window) * 2) - return Hazard; - return NoHazard; - } - - virtual void Reset() { - for (unsigned i = 0; i != array_lengthof(Window); ++i) - Window[i] = Other; - } - - virtual void EmitInstruction(SUnit *SU) { - Step(getClass(SU)); - } - - virtual void AdvanceCycle() { - Step(Other); - } - }; -} - -#endif diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp index ed3c243..e69d3e4 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.cpp +++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp @@ -99,15 +99,23 @@ void SimpleRegisterCoalescing::getAnalysisUsage(AnalysisUsage &AU) const { /// /// This returns true if an interval was modified. /// -bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, - LiveInterval &IntB, +bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI) { + // Bail if there is no dst interval - can happen when merging physical subreg + // operations. + if (!li_->hasInterval(CP.getDstReg())) + return false; + + LiveInterval &IntA = + li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getDefIndex(); // BValNo is a value number in B that is defined by a copy from A. 'B3' in // the example above. LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx); - assert(BLR != IntB.end() && "Live range not found!"); + if (BLR == IntB.end()) return false; VNInfo *BValNo = BLR->valno; // Get the location that B is defined at. Two options: either this value has @@ -119,7 +127,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // AValNo is the value number in A that defines the copy, A3 in the example. SlotIndex CopyUseIdx = CopyIdx.getUseIndex(); LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx); - assert(ALR != IntA.end() && "Live range not found!"); + // The live range might not exist after fun with physreg coalescing. + if (ALR == IntA.end()) return false; VNInfo *AValNo = ALR->valno; // If it's re-defined by an early clobber somewhere in the live range, then // it's not safe to eliminate the copy. FIXME: This is a temporary workaround. @@ -145,26 +154,21 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // If AValNo is defined as a copy from IntB, we can potentially process this. // Get the instruction that defines this value number. - unsigned SrcReg = li_->getVNInfoSourceReg(AValNo); - if (!SrcReg) return false; // Not defined by a copy. - - // If the value number is not defined by a copy instruction, ignore it. - - // If the source register comes from an interval other than IntB, we can't - // handle this. - if (SrcReg != IntB.reg) return false; + if (!CP.isCoalescable(AValNo->getCopy())) + return false; // Get the LiveRange in IntB that this value number starts with. 
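
AdjustCopiesBackFrom now takes a CoalescerPair instead of two intervals, and the CP.isFlipped() selects above recover which interval plays the "A" and which the "B" role regardless of the direction the original copy was written in. A toy analogue of that normalization (the preference rule here is an assumption for illustration, not CoalescerPair's actual policy):

// DstReg is the register intended to survive the join; Flipped records
// that the copy instruction actually wrote in the other direction.
struct RegPair {
  unsigned SrcReg, DstReg;
  bool Flipped;

  unsigned intervalA() const { return Flipped ? DstReg : SrcReg; }
  unsigned intervalB() const { return Flipped ? SrcReg : DstReg; }
};

static RegPair normalize(unsigned CopySrc, unsigned CopyDst,
                         bool SrcIsPhys, bool DstIsPhys) {
  // Illustrative rule: keep a physical register on the Dst side.
  if (SrcIsPhys && !DstIsPhys)
    return RegPair{CopyDst, CopySrc, /*Flipped=*/true};
  return RegPair{CopySrc, CopyDst, /*Flipped=*/false};
}
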
LiveInterval::iterator ValLR = IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot()); - assert(ValLR != IntB.end() && "Live range not found!"); + if (ValLR == IntB.end()) + return false; // Make sure that the end of the live range is inside the same block as // CopyMI. MachineInstr *ValLREndInst = li_->getInstructionFromIndex(ValLR->end.getPrevSlot()); - if (!ValLREndInst || - ValLREndInst->getParent() != CopyMI->getParent()) return false; + if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent()) + return false; // Okay, we now know that ValLR ends in the same block that the CopyMI // live-range starts. If there are no intervening live ranges between them in @@ -207,6 +211,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // physreg has sub-registers, update their live intervals as well. if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { + if (!li_->hasInterval(*SR)) + continue; LiveInterval &SRLI = li_->getInterval(*SR); SRLI.addRange(LiveRange(FillerStart, FillerEnd, SRLI.getNextValue(FillerStart, 0, true, @@ -216,7 +222,6 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // Okay, merge "B1" into the same value number as "B0". if (BValNo != ValLR->valno) { - IntB.addKills(ValLR->valno, BValNo->kills); IntB.MergeValueNumberInto(BValNo, ValLR->valno); } DEBUG({ @@ -230,13 +235,12 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true); if (UIdx != -1) { ValLREndInst->getOperand(UIdx).setIsKill(false); - ValLR->valno->removeKill(FillerStart); } // If the copy instruction was killing the destination register before the // merge, find the last use and trim the live range. That will also add the // isKill marker. - if (ALR->valno->isKill(CopyIdx)) + if (ALR->end == CopyIdx) TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR); ++numExtends; @@ -304,23 +308,31 @@ TransferImplicitOps(MachineInstr *MI, MachineInstr *NewMI) { /// /// This returns true if an interval was modified. /// -bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, - LiveInterval &IntB, +bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *CopyMI) { - SlotIndex CopyIdx = - li_->getInstructionIndex(CopyMI).getDefIndex(); - // FIXME: For now, only eliminate the copy by commuting its def when the // source register is a virtual register. We want to guard against cases // where the copy is a back edge copy and commuting the def lengthen the // live interval of the source register to the entire loop. - if (TargetRegisterInfo::isPhysicalRegister(IntA.reg)) + if (CP.isPhys() && CP.isFlipped()) + return false; + + // Bail if there is no dst interval. + if (!li_->hasInterval(CP.getDstReg())) return false; + SlotIndex CopyIdx = + li_->getInstructionIndex(CopyMI).getDefIndex(); + + LiveInterval &IntA = + li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + // BValNo is a value number in B that is defined by a copy from A. 'B3' in // the example above. LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx); - assert(BLR != IntB.end() && "Live range not found!"); + if (BLR == IntB.end()) return false; VNInfo *BValNo = BLR->valno; // Get the location that B is defined at. 
Two options: either this value has @@ -342,6 +354,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, AValNo->isUnused() || AValNo->hasPHIKill()) return false; MachineInstr *DefMI = li_->getInstructionFromIndex(AValNo->def); + if (!DefMI) + return false; const TargetInstrDesc &TID = DefMI->getDesc(); if (!TID.isCommutable()) return false; @@ -380,7 +394,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, // clobbers from the superreg. if (BHasSubRegs) for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) - if (HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0)) + if (li_->hasInterval(*SR) && + HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0)) return false; // If some of the uses of IntA.reg is already coalesced away, return false. @@ -413,7 +428,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, bool BHasPHIKill = BValNo->hasPHIKill(); SmallVector<VNInfo*, 4> BDeadValNos; - VNInfo::KillSet BKills; std::map<SlotIndex, SlotIndex> BExtend; // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g. @@ -424,8 +438,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, // C = A<kill> // ... // = B - // - // then do not add kills of A to the newly created B interval. bool Extended = BLR->end > ALR->end && ALR->end != ALR->start; if (Extended) BExtend[ALR->end] = BLR->end; @@ -448,34 +460,38 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx); if (ULR == IntA.end() || ULR->valno != AValNo) continue; - UseMO.setReg(NewReg); + if (TargetRegisterInfo::isPhysicalRegister(NewReg)) + UseMO.substPhysReg(NewReg, *tri_); + else + UseMO.setReg(NewReg); if (UseMI == CopyMI) continue; if (UseMO.isKill()) { if (Extended) UseMO.setIsKill(false); - else - BKills.push_back(UseIdx.getDefIndex()); } unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (!tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) + if (UseMI->isCopy()) { + if (UseMI->getOperand(0).getReg() != IntB.reg || + UseMI->getOperand(0).getSubReg()) + continue; + } else if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){ + if (DstReg != IntB.reg || DstSubIdx) + continue; + } else continue; - if (DstReg == IntB.reg && DstSubIdx == 0) { - // This copy will become a noop. If it's defining a new val#, - // remove that val# as well. However this live range is being - // extended to the end of the existing live range defined by the copy. - SlotIndex DefIdx = UseIdx.getDefIndex(); - const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx); - BHasPHIKill |= DLR->valno->hasPHIKill(); - assert(DLR->valno->def == DefIdx); - BDeadValNos.push_back(DLR->valno); - BExtend[DLR->start] = DLR->end; - JoinedCopies.insert(UseMI); - // If this is a kill but it's going to be removed, the last use - // of the same val# is the new kill. - if (UseMO.isKill()) - BKills.pop_back(); - } + // This copy will become a noop. If it's defining a new val#, + // remove that val# as well. However this live range is being + // extended to the end of the existing live range defined by the copy. 
+ SlotIndex DefIdx = UseIdx.getDefIndex(); + const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx); + if (!DLR) + continue; + BHasPHIKill |= DLR->valno->hasPHIKill(); + assert(DLR->valno->def == DefIdx); + BDeadValNos.push_back(DLR->valno); + BExtend[DLR->start] = DLR->end; + JoinedCopies.insert(UseMI); } // We need to insert a new liverange: [ALR.start, LastUse). It may be we can @@ -490,24 +506,21 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, VNInfo *DeadVNI = BDeadValNos[i]; if (BHasSubRegs) { for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { + if (!li_->hasInterval(*SR)) + continue; LiveInterval &SRLI = li_->getInterval(*SR); - const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def); - SRLI.removeValNo(SRLR->valno); + if (const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def)) + SRLI.removeValNo(SRLR->valno); } } IntB.removeValNo(BDeadValNos[i]); } // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition - // is updated. Kills are also updated. + // is updated. VNInfo *ValNo = BValNo; ValNo->def = AValNo->def; ValNo->setCopy(0); - for (unsigned j = 0, ee = ValNo->kills.size(); j != ee; ++j) { - if (ValNo->kills[j] != BLR->end) - BKills.push_back(ValNo->kills[j]); - } - ValNo->kills.clear(); for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end(); AI != AE; ++AI) { if (AI->valno != AValNo) continue; @@ -517,18 +530,7 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, if (EI != BExtend.end()) End = EI->second; IntB.addRange(LiveRange(AI->start, End, ValNo)); - - // If the IntB live range is assigned to a physical register, and if that - // physreg has sub-registers, update their live intervals as well. - if (BHasSubRegs) { - for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { - LiveInterval &SRLI = li_->getInterval(*SR); - SRLI.MergeInClobberRange(*li_, AI->start, End, - li_->getVNInfoAllocator()); - } - } } - IntB.addKills(ValNo, BKills); ValNo->setHasPHIKill(BHasPHIKill); DEBUG({ @@ -621,7 +623,11 @@ SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx, // of last use. LastUse->setIsKill(); removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_); - LR->valno->addKill(LastUseIdx.getDefIndex()); + if (LastUseMI->isCopy()) { + MachineOperand &DefMO = LastUseMI->getOperand(0); + if (DefMO.getReg() == li.reg && !DefMO.getSubReg()) + DefMO.setIsDead(); + } unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; if (tii_->isMoveInstr(*LastUseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && DstReg == li.reg && DstSubIdx == 0) { @@ -663,6 +669,7 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, ValNo->isUnused() || ValNo->hasPHIKill()) return false; MachineInstr *DefMI = li_->getInstructionFromIndex(ValNo->def); + assert(DefMI && "Defining instruction disappeared"); const TargetInstrDesc &TID = DefMI->getDesc(); if (!TID.isAsCheapAsAMove()) return false; @@ -701,33 +708,20 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, return false; } - SlotIndex DefIdx = CopyIdx.getDefIndex(); - const LiveRange *DLR= li_->getInterval(DstReg).getLiveRangeContaining(DefIdx); - DLR->valno->setCopy(0); - // Don't forget to update sub-register intervals. 
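
Several hunks in this file add the same guard before touching a physical register's sub-register or alias intervals: check li_->hasInterval(*SR) first, because after physreg coalescing not every alias still owns a live interval. The shape of that defensive walk, with toy stand-ins for the register-info queries:

#include <map>

struct LiveInterval { int dummy; };

static std::map<unsigned, LiveInterval> Intervals; // only some regs present

static const unsigned *subRegsOf(unsigned) {
  static const unsigned None[] = { 0 }; // toy: empty null-terminated list
  return None;
}

static void updateSubRegIntervals(unsigned PhysReg) {
  for (const unsigned *SR = subRegsOf(PhysReg); *SR; ++SR) {
    std::map<unsigned, LiveInterval>::iterator It = Intervals.find(*SR);
    if (It == Intervals.end())
      continue; // no interval for this sub-register: nothing to update
    // ... update It->second here ...
  }
}
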
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - for (const unsigned* SR = tri_->getSubRegisters(DstReg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) - continue; - const LiveRange *DLR = - li_->getInterval(*SR).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - } - } + RemoveCopyFlag(DstReg, CopyMI); // If copy kills the source register, find the last use and propagate // kill. bool checkForDeadDef = false; MachineBasicBlock *MBB = CopyMI->getParent(); - if (SrcLR->valno->isKill(DefIdx)) + if (SrcLR->end == CopyIdx.getDefIndex()) if (!TrimLiveIntervalToLastUse(CopyIdx, MBB, SrcInt, SrcLR)) { checkForDeadDef = true; } MachineBasicBlock::iterator MII = llvm::next(MachineBasicBlock::iterator(CopyMI)); - tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, tri_); + tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, *tri_); MachineInstr *NewMI = prior(MII); if (checkForDeadDef) { @@ -747,24 +741,8 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, MachineOperand &MO = CopyMI->getOperand(i); if (MO.isReg() && MO.isImplicit()) NewMI->addOperand(MO); - if (MO.isDef() && li_->hasInterval(MO.getReg())) { - unsigned Reg = MO.getReg(); - const LiveRange *DLR = - li_->getInterval(Reg).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - // Handle subregs as well - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - for (const unsigned* SR = tri_->getSubRegisters(Reg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) - continue; - const LiveRange *DLR = - li_->getInterval(*SR).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - } - } - } + if (MO.isDef()) + RemoveCopyFlag(MO.getReg(), CopyMI); } TransferImplicitOps(CopyMI, NewMI); @@ -783,84 +761,72 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, /// being updated is not zero, make sure to set it to the correct physical /// subregister. void -SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, - unsigned SubIdx) { - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - if (DstIsPhys && SubIdx) { - // Figure out the real physical register we are updating with. - DstReg = tri_->getSubReg(DstReg, SubIdx); - SubIdx = 0; - } - - // Copy the register use-list before traversing it. We may be adding operands - // and invalidating pointers. - SmallVector<std::pair<MachineInstr*, unsigned>, 32> reglist; - for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg), - E = mri_->reg_end(); I != E; ++I) - reglist.push_back(std::make_pair(&*I, I.getOperandNo())); - - for (unsigned N=0; N != reglist.size(); ++N) { - MachineInstr *UseMI = reglist[N].first; - MachineOperand &O = UseMI->getOperand(reglist[N].second); - unsigned OldSubIdx = O.getSubReg(); +SimpleRegisterCoalescing::UpdateRegDefsUses(const CoalescerPair &CP) { + bool DstIsPhys = CP.isPhys(); + unsigned SrcReg = CP.getSrcReg(); + unsigned DstReg = CP.getDstReg(); + unsigned SubIdx = CP.getSubIdx(); + + for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg); + MachineInstr *UseMI = I.skipInstruction();) { + // A PhysReg copy that won't be coalesced can perhaps be rematerialized + // instead. 
if (DstIsPhys) { - unsigned UseDstReg = DstReg; - if (OldSubIdx) - UseDstReg = tri_->getSubReg(DstReg, OldSubIdx); - unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx) && - CopySrcSubIdx == 0 && - CopyDstSubIdx == 0 && - CopySrcReg != CopyDstReg && - CopySrcReg == SrcReg && CopyDstReg != UseDstReg) { - // If the use is a copy and it won't be coalesced away, and its source - // is defined by a trivial computation, try to rematerialize it instead. - if (!JoinedCopies.count(UseMI) && - ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg, - CopyDstSubIdx, UseMI)) - continue; - } + CopySrcSubIdx == 0 && CopyDstSubIdx == 0 && + CopySrcReg != CopyDstReg && CopySrcReg == SrcReg && + CopyDstReg != DstReg && !JoinedCopies.count(UseMI) && + ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg, 0, + UseMI)) + continue; - O.setReg(UseDstReg); - O.setSubReg(0); - if (OldSubIdx) { - // Def and kill of subregister of a virtual register actually defs and - // kills the whole register. Add imp-defs and imp-kills as needed. - if (O.isDef()) { - if(O.isDead()) - UseMI->addRegisterDead(DstReg, tri_, true); - else - UseMI->addRegisterDefined(DstReg, tri_); - } else if (!O.isUndef() && - (O.isKill() || - UseMI->isRegTiedToDefOperand(&O-&UseMI->getOperand(0)))) - UseMI->addRegisterKilled(DstReg, tri_, true); - } + if (UseMI->isCopy() && + !UseMI->getOperand(1).getSubReg() && + !UseMI->getOperand(0).getSubReg() && + UseMI->getOperand(1).getReg() == SrcReg && + UseMI->getOperand(0).getReg() != SrcReg && + UseMI->getOperand(0).getReg() != DstReg && + !JoinedCopies.count(UseMI) && + ReMaterializeTrivialDef(li_->getInterval(SrcReg), + UseMI->getOperand(0).getReg(), 0, UseMI)) + continue; + } - DEBUG({ - dbgs() << "\t\tupdated: "; - if (!UseMI->isDebugValue()) - dbgs() << li_->getInstructionIndex(UseMI) << "\t"; - dbgs() << *UseMI; - }); - continue; + SmallVector<unsigned,8> Ops; + bool Reads, Writes; + tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops); + bool Kills = false, Deads = false; + + // Replace SrcReg with DstReg in all UseMI operands. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = UseMI->getOperand(Ops[i]); + Kills |= MO.isKill(); + Deads |= MO.isDead(); + + if (DstIsPhys) + MO.substPhysReg(DstReg, *tri_); + else + MO.substVirtReg(DstReg, SubIdx, *tri_); } - // Sub-register indexes goes from small to large. e.g. - // RAX: 1 -> AL, 2 -> AX, 3 -> EAX - // EAX: 1 -> AL, 2 -> AX - // So RAX's sub-register 2 is AX, RAX's sub-regsiter 3 is EAX, whose - // sub-register 2 is also AX. - // - // FIXME: Properly compose subreg indices for all targets. - // - if (SubIdx && OldSubIdx && SubIdx != OldSubIdx) - ; - else if (SubIdx) - O.setSubReg(SubIdx); - O.setReg(DstReg); + // This instruction is a copy that will be removed. + if (JoinedCopies.count(UseMI)) + continue; + + if (SubIdx) { + // If UseMI was a simple SrcReg def, make sure we didn't turn it into a + // read-modify-write of DstReg. + if (Deads) + UseMI->addRegisterDead(DstReg, tri_); + else if (!Reads && Writes) + UseMI->addRegisterDefined(DstReg, tri_); + + // Kill flags apply to the whole physical register. 
+ if (DstIsPhys && Kills) + UseMI->addRegisterKilled(DstReg, tri_); + } DEBUG({ dbgs() << "\t\tupdated: "; @@ -869,15 +835,15 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, dbgs() << *UseMI; }); + // After updating the operand, check if the machine instruction has // become a copy. If so, update its val# information. - if (JoinedCopies.count(UseMI)) + const TargetInstrDesc &TID = UseMI->getDesc(); + if (DstIsPhys || TID.getNumDefs() != 1 || TID.getNumOperands() <= 2) continue; - const TargetInstrDesc &TID = UseMI->getDesc(); unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; - if (TID.getNumDefs() == 1 && TID.getNumOperands() > 2 && - tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, + if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx) && CopySrcReg != CopyDstReg && (TargetRegisterInfo::isVirtualRegister(CopyDstReg) || @@ -945,6 +911,27 @@ bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li, return removeIntervalIfEmpty(li, li_, tri_); } +void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg, + const MachineInstr *CopyMI) { + SlotIndex DefIdx = li_->getInstructionIndex(CopyMI).getDefIndex(); + if (li_->hasInterval(DstReg)) { + LiveInterval &LI = li_->getInterval(DstReg); + if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx)) + if (LR->valno->getCopy() == CopyMI) + LR->valno->setCopy(0); + } + if (!TargetRegisterInfo::isPhysicalRegister(DstReg)) + return; + for (const unsigned* AS = tri_->getAliasSet(DstReg); *AS; ++AS) { + if (!li_->hasInterval(*AS)) + continue; + LiveInterval &LI = li_->getInterval(*AS); + if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx)) + if (LR->valno->getCopy() == CopyMI) + LR->valno->setCopy(0); + } +} + /// PropagateDeadness - Propagate the dead marker to the instruction which /// defines the val#. static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI, @@ -978,8 +965,8 @@ SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li, // Live-in to the function but dead. Remove it from entry live-in set. if (mf_->begin()->isLiveIn(li.reg)) mf_->begin()->removeLiveIn(li.reg); - const LiveRange *LR = li.getLiveRangeContaining(CopyIdx); - removeRange(li, LR->start, LR->end, li_, tri_); + if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx)) + removeRange(li, LR->start, LR->end, li_, tri_); return removeIntervalIfEmpty(li, li_, tri_); } @@ -1017,147 +1004,12 @@ SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li, // val#, then propagate the dead marker. PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_); ++numDeadValNo; - - if (LR->valno->isKill(RemoveEnd)) - LR->valno->removeKill(RemoveEnd); } removeRange(li, RemoveStart, RemoveEnd, li_, tri_); return removeIntervalIfEmpty(li, li_, tri_); } -/// CanCoalesceWithImpDef - Returns true if the specified copy instruction -/// from an implicit def to another register can be coalesced away. -bool SimpleRegisterCoalescing::CanCoalesceWithImpDef(MachineInstr *CopyMI, - LiveInterval &li, - LiveInterval &ImpLi) const{ - if (!CopyMI->killsRegister(ImpLi.reg)) - return false; - // Make sure this is the only use. 
- for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(ImpLi.reg), - UE = mri_->use_end(); UI != UE;) { - MachineInstr *UseMI = &*UI; - ++UI; - if (CopyMI == UseMI || JoinedCopies.count(UseMI)) - continue; - return false; - } - return true; -} - - -/// isWinToJoinVRWithSrcPhysReg - Return true if it's worth while to join a -/// a virtual destination register with physical source register. -bool -SimpleRegisterCoalescing::isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, - LiveInterval &SrcInt) { - // If the virtual register live interval is long but it has low use desity, - // do not join them, instead mark the physical register as its allocation - // preference. - const TargetRegisterClass *RC = mri_->getRegClass(DstInt.reg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(DstInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(DstInt.reg), - mri_->use_nodbg_end()) * Threshold < Length) - return false; - - // If the virtual register live interval extends into a loop, turn down - // aggressiveness. - SlotIndex CopyIdx = - li_->getInstructionIndex(CopyMI).getDefIndex(); - const MachineLoop *L = loopInfo->getLoopFor(CopyMBB); - if (!L) { - // Let's see if the virtual register live interval extends into the loop. - LiveInterval::iterator DLR = DstInt.FindLiveRangeContaining(CopyIdx); - assert(DLR != DstInt.end() && "Live range not found!"); - DLR = DstInt.FindLiveRangeContaining(DLR->end.getNextSlot()); - if (DLR != DstInt.end()) { - CopyMBB = li_->getMBBFromIndex(DLR->start); - L = loopInfo->getLoopFor(CopyMBB); - } - } - - if (!L || Length <= Threshold) - return true; - - SlotIndex UseIdx = CopyIdx.getUseIndex(); - LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx); - MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start); - if (loopInfo->getLoopFor(SMBB) != L) { - if (!loopInfo->isLoopHeader(CopyMBB)) - return false; - // If vr's live interval extends pass the loop header, do not join. - for (MachineBasicBlock::succ_iterator SI = CopyMBB->succ_begin(), - SE = CopyMBB->succ_end(); SI != SE; ++SI) { - MachineBasicBlock *SuccMBB = *SI; - if (SuccMBB == CopyMBB) - continue; - if (DstInt.overlaps(li_->getMBBStartIdx(SuccMBB), - li_->getMBBEndIdx(SuccMBB))) - return false; - } - } - return true; -} - -/// isWinToJoinVRWithDstPhysReg - Return true if it's worth while to join a -/// copy from a virtual source register to a physical destination register. -bool -SimpleRegisterCoalescing::isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, - LiveInterval &SrcInt) { - // If the virtual register live interval is long but it has low use density, - // do not join them, instead mark the physical register as its allocation - // preference. - const TargetRegisterClass *RC = mri_->getRegClass(SrcInt.reg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(SrcInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(SrcInt.reg), - mri_->use_nodbg_end()) * Threshold < Length) - return false; - - if (SrcInt.empty()) - // Must be implicit_def. - return false; - - // If the virtual register live interval is defined or cross a loop, turn - // down aggressiveness. 
- SlotIndex CopyIdx =
- li_->getInstructionIndex(CopyMI).getDefIndex();
- SlotIndex UseIdx = CopyIdx.getUseIndex();
- LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx);
- assert(SLR != SrcInt.end() && "Live range not found!");
- SLR = SrcInt.FindLiveRangeContaining(SLR->start.getPrevSlot());
- if (SLR == SrcInt.end())
- return true;
- MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start);
- const MachineLoop *L = loopInfo->getLoopFor(SMBB);
-
- if (!L || Length <= Threshold)
- return true;
-
- if (loopInfo->getLoopFor(CopyMBB) != L) {
- if (SMBB != L->getLoopLatch())
- return false;
- // If vr's live interval is extended from before the loop latch, do not
- // join.
- for (MachineBasicBlock::pred_iterator PI = SMBB->pred_begin(),
- PE = SMBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock *PredMBB = *PI;
- if (PredMBB == SMBB)
- continue;
- if (SrcInt.overlaps(li_->getMBBStartIdx(PredMBB),
- li_->getMBBEndIdx(PredMBB)))
- return false;
- }
- }
- return true;
-}

 /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
 /// two virtual registers from different register classes.
@@ -1203,157 +1055,6 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
   return true;
 }

-/// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual
-/// register with a physical register, check if any of the virtual register
-/// operands is a sub-register use or def. If so, make sure it won't result
-/// in an illegal extract_subreg or insert_subreg instruction. e.g.
-/// vr1024 = extract_subreg vr1025, 1
-/// ...
-/// vr1024 = mov8rr AH
-/// If vr1024 is coalesced with AH, the extract_subreg is now illegal since
-/// AH does not have a super-reg whose sub-register 1 is AH.
-bool
-SimpleRegisterCoalescing::HasIncompatibleSubRegDefUse(MachineInstr *CopyMI,
- unsigned VirtReg,
- unsigned PhysReg) {
- for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(VirtReg),
- E = mri_->reg_end(); I != E; ++I) {
- MachineOperand &O = I.getOperand();
- if (O.isDebug())
- continue;
- MachineInstr *MI = &*I;
- if (MI == CopyMI || JoinedCopies.count(MI))
- continue;
- unsigned SubIdx = O.getSubReg();
- if (SubIdx && !tri_->getSubReg(PhysReg, SubIdx))
- return true;
- if (MI->isExtractSubreg()) {
- SubIdx = MI->getOperand(2).getImm();
- if (O.isUse() && !tri_->getSubReg(PhysReg, SubIdx))
- return true;
- if (O.isDef()) {
- unsigned SrcReg = MI->getOperand(1).getReg();
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isPhysicalRegister(SrcReg)
- ? tri_->getPhysicalRegisterRegClass(SrcReg)
- : mri_->getRegClass(SrcReg);
- if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC))
- return true;
- }
- }
- if (MI->isInsertSubreg() || MI->isSubregToReg()) {
- SubIdx = MI->getOperand(3).getImm();
- if (VirtReg == MI->getOperand(0).getReg()) {
- if (!tri_->getSubReg(PhysReg, SubIdx))
- return true;
- } else {
- unsigned DstReg = MI->getOperand(0).getReg();
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isPhysicalRegister(DstReg)
- ? tri_->getPhysicalRegisterRegClass(DstReg)
- : mri_->getRegClass(DstReg);
- if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC))
- return true;
- }
- }
- }
- return false;
-}
-
-
-/// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce
-/// an extract_subreg where dst is a physical register, e.g. 
-/// cl = EXTRACT_SUBREG reg1024, 1
-bool
-SimpleRegisterCoalescing::CanJoinExtractSubRegToPhysReg(unsigned DstReg,
- unsigned SrcReg, unsigned SubIdx,
- unsigned &RealDstReg) {
- const TargetRegisterClass *RC = mri_->getRegClass(SrcReg);
- RealDstReg = tri_->getMatchingSuperReg(DstReg, SubIdx, RC);
- if (!RealDstReg) {
- DEBUG(dbgs() << "\tIncompatible source regclass: "
- << "none of the super-registers of " << tri_->getName(DstReg)
- << " are in " << RC->getName() << ".\n");
- return false;
- }
-
- LiveInterval &RHS = li_->getInterval(SrcReg);
- // For this type of EXTRACT_SUBREG, conservatively
- // check if the live interval of the source register interferes with the
- // actual super physical register we are trying to coalesce with.
- if (li_->hasInterval(RealDstReg) &&
- RHS.overlaps(li_->getInterval(RealDstReg))) {
- DEBUG({
- dbgs() << "\t\tInterfere with register ";
- li_->getInterval(RealDstReg).print(dbgs(), tri_);
- });
- return false; // Not coalescable
- }
- for (const unsigned* SR = tri_->getSubRegisters(RealDstReg); *SR; ++SR)
- // Do not check DstReg or its sub-register. JoinIntervals() will take care
- // of that.
- if (*SR != DstReg &&
- !tri_->isSubRegister(DstReg, *SR) &&
- li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) {
- DEBUG({
- dbgs() << "\t\tInterfere with sub-register ";
- li_->getInterval(*SR).print(dbgs(), tri_);
- });
- return false; // Not coalescable
- }
- return true;
-}
-
-/// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce
-/// an insert_subreg where src is a physical register, e.g.
-/// reg1024 = INSERT_SUBREG reg1024, c1, 0
-bool
-SimpleRegisterCoalescing::CanJoinInsertSubRegToPhysReg(unsigned DstReg,
- unsigned SrcReg, unsigned SubIdx,
- unsigned &RealSrcReg) {
- const TargetRegisterClass *RC = mri_->getRegClass(DstReg);
- RealSrcReg = tri_->getMatchingSuperReg(SrcReg, SubIdx, RC);
- if (!RealSrcReg) {
- DEBUG(dbgs() << "\tIncompatible destination regclass: "
- << "none of the super-registers of " << tri_->getName(SrcReg)
- << " are in " << RC->getName() << ".\n");
- return false;
- }
-
- LiveInterval &LHS = li_->getInterval(DstReg);
- if (li_->hasInterval(RealSrcReg) &&
- LHS.overlaps(li_->getInterval(RealSrcReg))) {
- DEBUG({
- dbgs() << "\t\tInterfere with register ";
- li_->getInterval(RealSrcReg).print(dbgs(), tri_);
- });
- return false; // Not coalescable
- }
- for (const unsigned* SR = tri_->getSubRegisters(RealSrcReg); *SR; ++SR)
- // Do not check SrcReg or its sub-register. JoinIntervals() will take care
- // of that.
- if (*SR != SrcReg &&
- !tri_->isSubRegister(SrcReg, *SR) &&
- li_->hasInterval(*SR) && LHS.overlaps(li_->getInterval(*SR))) {
- DEBUG({
- dbgs() << "\t\tInterfere with sub-register ";
- li_->getInterval(*SR).print(dbgs(), tri_);
- });
- return false; // Not coalescable
- }
- return true;
-}
-
-/// getRegAllocPreference - Return the register allocation preference register.
-///
-static unsigned getRegAllocPreference(unsigned Reg, MachineFunction &MF,
- MachineRegisterInfo *MRI,
- const TargetRegisterInfo *TRI) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
- return 0;
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
- return TRI->ResolveRegAllocHint(Hint.first, Hint.second, MF);
-}

 /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
 /// which are the src/dst of the copy instruction CopyMI. 
This returns true @@ -1369,354 +1070,97 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { DEBUG(dbgs() << li_->getInstructionIndex(CopyMI) << '\t' << *CopyMI); - unsigned SrcReg, DstReg, SrcSubIdx = 0, DstSubIdx = 0; - bool isExtSubReg = CopyMI->isExtractSubreg(); - bool isInsSubReg = CopyMI->isInsertSubreg(); - bool isSubRegToReg = CopyMI->isSubregToReg(); - unsigned SubIdx = 0; - if (isExtSubReg) { - DstReg = CopyMI->getOperand(0).getReg(); - DstSubIdx = CopyMI->getOperand(0).getSubReg(); - SrcReg = CopyMI->getOperand(1).getReg(); - SrcSubIdx = CopyMI->getOperand(2).getImm(); - } else if (isInsSubReg || isSubRegToReg) { - DstReg = CopyMI->getOperand(0).getReg(); - DstSubIdx = CopyMI->getOperand(3).getImm(); - SrcReg = CopyMI->getOperand(2).getReg(); - SrcSubIdx = CopyMI->getOperand(2).getSubReg(); - if (SrcSubIdx && SrcSubIdx != DstSubIdx) { - // r1025 = INSERT_SUBREG r1025, r1024<2>, 2 Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - DEBUG(dbgs() << "\tSource of insert_subreg or subreg_to_reg is already " - "coalesced to another register.\n"); - return false; // Not coalescable. - } - } else if (tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (SrcSubIdx && DstSubIdx && SrcSubIdx != DstSubIdx) { - // e.g. %reg16404:1<def> = MOV8rr %reg16412:2<kill> - Again = true; - return false; // Not coalescable. - } - } else { - llvm_unreachable("Unrecognized copy instruction!"); + CoalescerPair CP(*tii_, *tri_); + if (!CP.setRegisters(CopyMI)) { + DEBUG(dbgs() << "\tNot coalescable.\n"); + return false; } // If they are already joined we continue. - if (SrcReg == DstReg) { + if (CP.getSrcReg() == CP.getDstReg()) { DEBUG(dbgs() << "\tCopy already coalesced.\n"); return false; // Not coalescable. } - bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - - // If they are both physical registers, we cannot join them. - if (SrcIsPhys && DstIsPhys) { - DEBUG(dbgs() << "\tCan not coalesce physregs.\n"); - return false; // Not coalescable. - } - - // We only join virtual registers with allocatable physical registers. - if (SrcIsPhys && !allocatableRegs_[SrcReg]) { - DEBUG(dbgs() << "\tSrc reg is unallocatable physreg.\n"); - return false; // Not coalescable. - } - if (DstIsPhys && !allocatableRegs_[DstReg]) { - DEBUG(dbgs() << "\tDst reg is unallocatable physreg.\n"); - return false; // Not coalescable. - } - - // We cannot handle dual subreg indices and mismatched classes at the same - // time. - if (SrcSubIdx && DstSubIdx && differingRegisterClasses(SrcReg, DstReg)) { - DEBUG(dbgs() << "\tCannot handle subreg indices and mismatched classes.\n"); - return false; - } + DEBUG(dbgs() << "\tConsidering merging %reg" << CP.getSrcReg()); - // Check that a physical source register is compatible with dst regclass - if (SrcIsPhys) { - unsigned SrcSubReg = SrcSubIdx ? - tri_->getSubReg(SrcReg, SrcSubIdx) : SrcReg; - const TargetRegisterClass *DstRC = mri_->getRegClass(DstReg); - const TargetRegisterClass *DstSubRC = DstRC; - if (DstSubIdx) - DstSubRC = DstRC->getSubRegisterRegClass(DstSubIdx); - assert(DstSubRC && "Illegal subregister index"); - if (!DstSubRC->contains(SrcSubReg)) { - DEBUG(dbgs() << "\tIncompatible destination regclass: " - << "none of the super-registers of " - << tri_->getName(SrcSubReg) << " are in " - << DstSubRC->getName() << ".\n"); - return false; // Not coalescable. 
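// [Editorial sketch, not part of the patch] The CoalescerPair protocol that
// replaces the hand-rolled operand decoding above, gathered in one place.
// Method names are the ones this change uses; register numbers are
// hypothetical.
//
//   CoalescerPair CP(*tii_, *tri_);
//   if (!CP.setRegisters(CopyMI))  // decode src/dst/subidx; reject non-copies
//     return false;
//   if (CP.isPhys()) {
//     // A physical register, when present, appears to be canonicalized into
//     // CP.getDstReg(): %EAX = COPY %reg1024 and %reg1024 = COPY %EAX both
//     // yield Dst=%EAX, Src=%reg1024, the latter with CP.isFlipped() set.
//   } else if (CP.isCrossClass()) {
//     // CP.getNewRC() is the class the joined register must satisfy.
//   }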
- } - } - - // Check that a physical dst register is compatible with source regclass - if (DstIsPhys) { - unsigned DstSubReg = DstSubIdx ? - tri_->getSubReg(DstReg, DstSubIdx) : DstReg; - const TargetRegisterClass *SrcRC = mri_->getRegClass(SrcReg); - const TargetRegisterClass *SrcSubRC = SrcRC; - if (SrcSubIdx) - SrcSubRC = SrcRC->getSubRegisterRegClass(SrcSubIdx); - assert(SrcSubRC && "Illegal subregister index"); - if (!SrcSubRC->contains(DstSubReg)) { - DEBUG(dbgs() << "\tIncompatible source regclass: " - << "none of the super-registers of " - << tri_->getName(DstSubReg) << " are in " - << SrcSubRC->getName() << ".\n"); - (void)DstSubReg; - return false; // Not coalescable. + // Enforce policies. + if (CP.isPhys()) { + DEBUG(dbgs() <<" with physreg %" << tri_->getName(CP.getDstReg()) << "\n"); + // Only coalesce to allocatable physreg. + if (!allocatableRegs_[CP.getDstReg()]) { + DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n"); + return false; // Not coalescable. } - } - - // Should be non-null only when coalescing to a sub-register class. - bool CrossRC = false; - const TargetRegisterClass *SrcRC= SrcIsPhys ? 0 : mri_->getRegClass(SrcReg); - const TargetRegisterClass *DstRC= DstIsPhys ? 0 : mri_->getRegClass(DstReg); - const TargetRegisterClass *NewRC = NULL; - unsigned RealDstReg = 0; - unsigned RealSrcReg = 0; - if (isExtSubReg || isInsSubReg || isSubRegToReg) { - SubIdx = CopyMI->getOperand(isExtSubReg ? 2 : 3).getImm(); - if (SrcIsPhys && isExtSubReg) { - // r1024 = EXTRACT_SUBREG EAX, 0 then r1024 is really going to be - // coalesced with AX. - unsigned DstSubIdx = CopyMI->getOperand(0).getSubReg(); - if (DstSubIdx) { - // r1024<2> = EXTRACT_SUBREG EAX, 2. Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - if (DstSubIdx != SubIdx) { - DEBUG(dbgs() << "\t Sub-register indices mismatch.\n"); - return false; // Not coalescable. - } - } else - SrcReg = tri_->getSubReg(SrcReg, SubIdx); - SubIdx = 0; - } else if (DstIsPhys && (isInsSubReg || isSubRegToReg)) { - // EAX = INSERT_SUBREG EAX, r1024, 0 - unsigned SrcSubIdx = CopyMI->getOperand(2).getSubReg(); - if (SrcSubIdx) { - // EAX = INSERT_SUBREG EAX, r1024<2>, 2 Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - if (SrcSubIdx != SubIdx) { - DEBUG(dbgs() << "\t Sub-register indices mismatch.\n"); - return false; // Not coalescable. - } - } else - DstReg = tri_->getSubReg(DstReg, SubIdx); - SubIdx = 0; - } else if ((DstIsPhys && isExtSubReg) || - (SrcIsPhys && (isInsSubReg || isSubRegToReg))) { - if (!isSubRegToReg && CopyMI->getOperand(1).getSubReg()) { - DEBUG(dbgs() << "\tSrc of extract_subreg already coalesced with reg" - << " of a super-class.\n"); - return false; // Not coalescable. - } - - // FIXME: The following checks are somewhat conservative. Perhaps a better - // way to implement this is to treat this as coalescing a vr with the - // super physical register. - if (isExtSubReg) { - if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealDstReg)) - return false; // Not coalescable - } else { - if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg)) - return false; // Not coalescable - } - SubIdx = 0; - } else { - unsigned OldSubIdx = isExtSubReg ? CopyMI->getOperand(0).getSubReg() - : CopyMI->getOperand(2).getSubReg(); - if (OldSubIdx) { - if (OldSubIdx == SubIdx && !differingRegisterClasses(SrcReg, DstReg)) - // r1024<2> = EXTRACT_SUBREG r1025, 2. 
Then r1024 has already been
- // coalesced to a larger register so the subreg indices cancel out.
- // Also check if the other larger register is of the same register
- // class as the would-be resulting register.
- SubIdx = 0;
- else {
- DEBUG(dbgs() << "\t Sub-register indices mismatch.\n");
- return false; // Not coalescable.
- }
- }
- if (SubIdx) {
- if (!DstIsPhys && !SrcIsPhys) {
- if (isInsSubReg || isSubRegToReg) {
- NewRC = tri_->getMatchingSuperRegClass(DstRC, SrcRC, SubIdx);
- } else { // extract_subreg
- NewRC = tri_->getMatchingSuperRegClass(SrcRC, DstRC, SubIdx);
- }
- if (!NewRC) {
- DEBUG(dbgs() << "\t Conflicting sub-register indices.\n");
- return false; // Not coalescable
- }
+ } else {
+ DEBUG({
+ dbgs() << " with %reg" << CP.getDstReg();
+ if (CP.getSubIdx())
+ dbgs() << ":" << tri_->getSubRegIndexName(CP.getSubIdx());
+ dbgs() << " to " << CP.getNewRC()->getName() << "\n";
+ });

- if (!isWinToJoinCrossClass(SrcReg, DstReg, SrcRC, DstRC, NewRC)) {
- DEBUG(dbgs() << "\tAvoid coalescing to constrained register class: "
- << SrcRC->getName() << "/"
- << DstRC->getName() << " -> "
- << NewRC->getName() << ".\n");
- Again = true; // May be possible to coalesce later.
- return false;
- }
- }
- }
- } else if (differingRegisterClasses(SrcReg, DstReg)) {
- if (DisableCrossClassJoin)
- return false;
- CrossRC = true;
-
- // FIXME: What if the result of a EXTRACT_SUBREG is then coalesced
- // with another? If it's the resulting destination register, then
- // the subidx must be propagated to uses (but only those defined
- // by the EXTRACT_SUBREG). If it's being coalesced into another
- // register, it should be safe because the register is assumed to have
- // the register class of the super-register.

- // Process moves where one of the registers has a sub-register index.
- MachineOperand *DstMO = CopyMI->findRegisterDefOperand(DstReg);
- MachineOperand *SrcMO = CopyMI->findRegisterUseOperand(SrcReg);
- SubIdx = DstMO->getSubReg();
- if (SubIdx) {
- if (SrcMO->getSubReg())
- // FIXME: can we handle this?
+ // Avoid constraining virtual register regclass too much.
+ if (CP.isCrossClass()) {
+ if (DisableCrossClassJoin) {
+ DEBUG(dbgs() << "\tCross-class joins disabled.\n");
 return false;
- // This is not an insert_subreg but it looks like one.
- // e.g. %reg1024:4 = MOV32rr %EAX
- isInsSubReg = true;
- if (SrcIsPhys) {
- if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg))
- return false; // Not coalescable
- SubIdx = 0;
- }
- } else {
- SubIdx = SrcMO->getSubReg();
- if (SubIdx) {
- // This is not an extract_subreg but it looks like one.
- // e.g. %cl = MOV16rr %reg1024:1
- isExtSubReg = true;
- if (DstIsPhys) {
- if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealDstReg))
- return false; // Not coalescable
- SubIdx = 0;
- }
- }
- }

- // Now determine the register class of the joined register.
- if (!SrcIsPhys && !DstIsPhys) {
- if (isExtSubReg) {
- NewRC =
- SubIdx ? tri_->getMatchingSuperRegClass(SrcRC, DstRC, SubIdx) : SrcRC;
- } else if (isInsSubReg) {
- NewRC =
- SubIdx ? tri_->getMatchingSuperRegClass(DstRC, SrcRC, SubIdx) : DstRC;
- } else {
- NewRC = getCommonSubClass(SrcRC, DstRC);
- }

- if (!NewRC) {
- DEBUG(dbgs() << "\tDisjoint regclasses: "
- << SrcRC->getName() << ", "
- << DstRC->getName() << ".\n");
- return false; // Not coalescable.
- }

- // If we are joining two virtual registers and the resulting register
- // class is more restrictive (fewer registers, smaller size), check if it's
- // worth doing the merge. 
- if (!isWinToJoinCrossClass(SrcReg, DstReg, SrcRC, DstRC, NewRC)) { + if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(), + mri_->getRegClass(CP.getSrcReg()), + mri_->getRegClass(CP.getDstReg()), + CP.getNewRC())) { DEBUG(dbgs() << "\tAvoid coalescing to constrained register class: " - << SrcRC->getName() << "/" - << DstRC->getName() << " -> " - << NewRC->getName() << ".\n"); - // Allow the coalescer to try again in case either side gets coalesced to - // a physical register that's compatible with the other side. e.g. - // r1024 = MOV32to32_ r1025 - // But later r1024 is assigned EAX then r1025 may be coalesced with EAX. + << CP.getNewRC()->getName() << ".\n"); Again = true; // May be possible to coalesce later. return false; } } - } - - // Will it create illegal extract_subreg / insert_subreg? - if (SrcIsPhys && HasIncompatibleSubRegDefUse(CopyMI, DstReg, SrcReg)) - return false; - if (DstIsPhys && HasIncompatibleSubRegDefUse(CopyMI, SrcReg, DstReg)) - return false; - - LiveInterval &SrcInt = li_->getInterval(SrcReg); - LiveInterval &DstInt = li_->getInterval(DstReg); - assert(SrcInt.reg == SrcReg && DstInt.reg == DstReg && - "Register mapping is horribly broken!"); - DEBUG({ - dbgs() << "\t\tInspecting "; - if (SrcRC) dbgs() << SrcRC->getName() << ": "; - SrcInt.print(dbgs(), tri_); - dbgs() << "\n\t\t and "; - if (DstRC) dbgs() << DstRC->getName() << ": "; - DstInt.print(dbgs(), tri_); - dbgs() << "\n"; - }); + // When possible, let DstReg be the larger interval. + if (!CP.getSubIdx() && li_->getInterval(CP.getSrcReg()).ranges.size() > + li_->getInterval(CP.getDstReg()).ranges.size()) + CP.flip(); + } + + // We need to be careful about coalescing a source physical register with a + // virtual register. Once the coalescing is done, it cannot be broken and + // these are not spillable! If the destination interval uses are far away, + // think twice about coalescing them! + // FIXME: Why are we skipping this test for partial copies? + // CodeGen/X86/phys_subreg_coalesce-3.ll needs it. + if (!CP.isPartial() && CP.isPhys()) { + LiveInterval &JoinVInt = li_->getInterval(CP.getSrcReg()); + + // Don't join with physregs that have a ridiculous number of live + // ranges. The data structure performance is really bad when that + // happens. + if (li_->hasInterval(CP.getDstReg()) && + li_->getInterval(CP.getDstReg()).ranges.size() > 1000) { + mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); + ++numAborts; + DEBUG(dbgs() + << "\tPhysical register live interval too complicated, abort!\n"); + return false; + } - // Save a copy of the virtual register live interval. We'll manually - // merge this into the "real" physical register live interval this is - // coalesced with. - OwningPtr<LiveInterval> SavedLI; - if (RealDstReg) - SavedLI.reset(li_->dupInterval(&SrcInt)); - else if (RealSrcReg) - SavedLI.reset(li_->dupInterval(&DstInt)); - - if (!isExtSubReg && !isInsSubReg && !isSubRegToReg) { - // Check if it is necessary to propagate "isDead" property. - MachineOperand *mopd = CopyMI->findRegisterDefOperand(DstReg, false); - bool isDead = mopd->isDead(); - - // We need to be careful about coalescing a source physical register with a - // virtual register. Once the coalescing is done, it cannot be broken and - // these are not spillable! If the destination interval uses are far away, - // think twice about coalescing them! 
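// [Editorial note] A worked instance of the use-density test applied just
// below, with assumed numbers: for a class with 8 allocatable registers the
// threshold is 8 * 2 = 16; a virtual interval spanning roughly 200
// instructions with 10 non-debug uses fails it, since 200 > 16 and
// 10 * 16 = 160 < 200. The join is then abandoned and the physreg is merely
// recorded as an allocation hint.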
- if (!isDead && (SrcIsPhys || DstIsPhys)) { - // If the virtual register live interval is long but it has low use - // density, do not join them, instead mark the physical register as its - // allocation preference. - LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt; - LiveInterval &JoinPInt = SrcIsPhys ? SrcInt : DstInt; - unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg; - unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg; - - // Don't join with physregs that have a ridiculous number of live - // ranges. The data structure performance is really bad when that - // happens. - if (JoinPInt.ranges.size() > 1000) { - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() - << "\tPhysical register live interval too complicated, abort!\n"); - return false; - } + const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg()); + unsigned Threshold = allocatableRCRegs_[RC].count() * 2; + unsigned Length = li_->getApproximateInstructionCount(JoinVInt); + if (Length > Threshold && + std::distance(mri_->use_nodbg_begin(CP.getSrcReg()), + mri_->use_nodbg_end()) * Threshold < Length) { + // Before giving up coalescing, if definition of source is defined by + // trivial computation, try rematerializing it. + if (!CP.isFlipped() && + ReMaterializeTrivialDef(JoinVInt, CP.getDstReg(), 0, CopyMI)) + return true; - const TargetRegisterClass *RC = mri_->getRegClass(JoinVReg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(JoinVInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(JoinVReg), - mri_->use_nodbg_end()) * Threshold < Length) { - // Before giving up coalescing, if definition of source is defined by - // trivial computation, try rematerializing it. - if (ReMaterializeTrivialDef(SrcInt, DstReg, DstSubIdx, CopyMI)) - return true; - - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - Again = true; // May be possible to coalesce later. - return false; - } + mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); + ++numAborts; + DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); + Again = true; // May be possible to coalesce later. + return false; } } @@ -1724,32 +1168,24 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // Otherwise, if one of the intervals being joined is a physreg, this method // always canonicalizes DstInt to be it. The output "SrcInt" will not have // been modified, so we can use this information below to update aliases. - bool Swapped = false; - // If SrcInt is implicitly defined, it's safe to coalesce. - if (SrcInt.empty()) { - if (!CanCoalesceWithImpDef(CopyMI, DstInt, SrcInt)) { - // Only coalesce an empty interval (defined by implicit_def) with - // another interval which has a valno defined by the CopyMI and the CopyMI - // is a kill of the implicit def. - DEBUG(dbgs() << "\tNot profitable!\n"); - return false; - } - } else if (!JoinIntervals(DstInt, SrcInt, Swapped)) { + if (!JoinIntervals(CP)) { // Coalescing failed. // If definition of source is defined by trivial computation, try // rematerializing it. - if (!isExtSubReg && !isInsSubReg && !isSubRegToReg && - ReMaterializeTrivialDef(SrcInt, DstReg, DstSubIdx, CopyMI)) + if (!CP.isFlipped() && + ReMaterializeTrivialDef(li_->getInterval(CP.getSrcReg()), + CP.getDstReg(), 0, CopyMI)) return true; // If we can eliminate the copy without merging the live ranges, do so now. 
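// [Editorial illustration, hypothetical registers] The shape of the copy
// elimination attempted by RemoveCopyByCommutingDef below: the copied value
// is produced by a commutable instruction whose other operand carries the
// copy's destination value, so commuting the definition lets it define the
// destination directly and the copy degenerates into an identity copy:
//
//   %reg1025 = ADD %reg1024, %reg1026<kill>   ; commutable def
//   ...
//   %reg1024 = COPY %reg1025                  ; copy we could not join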
- if (!isExtSubReg && !isInsSubReg && !isSubRegToReg &&
- (AdjustCopiesBackFrom(SrcInt, DstInt, CopyMI) ||
- RemoveCopyByCommutingDef(SrcInt, DstInt, CopyMI))) {
- JoinedCopies.insert(CopyMI);
- DEBUG(dbgs() << "\tTrivial!\n");
- return true;
+ if (!CP.isPartial()) {
+ if (AdjustCopiesBackFrom(CP, CopyMI) ||
+ RemoveCopyByCommutingDef(CP, CopyMI)) {
+ JoinedCopies.insert(CopyMI);
+ DEBUG(dbgs() << "\tTrivial!\n");
+ return true;
+ }
 }

 // Otherwise, we are unable to join the intervals.
@@ -1758,86 +1194,32 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 return false;
 }

- LiveInterval *ResSrcInt = &SrcInt;
- LiveInterval *ResDstInt = &DstInt;
- if (Swapped) {
- std::swap(SrcReg, DstReg);
- std::swap(ResSrcInt, ResDstInt);
- }
- assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- "LiveInterval::join didn't work right!");
-
- // If we're about to merge live ranges into a physical register live interval,
- // we have to update any aliased register's live ranges to indicate that they
- // have clobbered values for this range.
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
- // If this is an extract_subreg where dst is a physical register, e.g.
- // cl = EXTRACT_SUBREG reg1024, 1
- // then create and update the actual physical register allocated to RHS.
- if (RealDstReg || RealSrcReg) {
- LiveInterval &RealInt =
- li_->getOrCreateInterval(RealDstReg ? RealDstReg : RealSrcReg);
- for (LiveInterval::const_vni_iterator I = SavedLI->vni_begin(),
- E = SavedLI->vni_end(); I != E; ++I) {
- const VNInfo *ValNo = *I;
- VNInfo *NewValNo = RealInt.getNextValue(ValNo->def, ValNo->getCopy(),
- false, // updated at *
- li_->getVNInfoAllocator());
- NewValNo->setFlags(ValNo->getFlags()); // * updated here.
- RealInt.addKills(NewValNo, ValNo->kills);
- RealInt.MergeValueInAsValue(*SavedLI, ValNo, NewValNo);
- }
- RealInt.weight += SavedLI->weight;
- DstReg = RealDstReg ? RealDstReg : RealSrcReg;
- }
-
- // Update the liveintervals of sub-registers.
- for (const unsigned *AS = tri_->getSubRegisters(DstReg); *AS; ++AS)
- li_->getOrCreateInterval(*AS).MergeInClobberRanges(*li_, *ResSrcInt,
- li_->getVNInfoAllocator());
- }
-
- // If this is an EXTRACT_SUBREG, make sure the result of coalescing is the
- // larger super-register.
- if ((isExtSubReg || isInsSubReg || isSubRegToReg) &&
- !SrcIsPhys && !DstIsPhys) {
- if ((isExtSubReg && !Swapped) ||
- ((isInsSubReg || isSubRegToReg) && Swapped)) {
- ResSrcInt->Copy(*ResDstInt, mri_, li_->getVNInfoAllocator());
- std::swap(SrcReg, DstReg);
- std::swap(ResSrcInt, ResDstInt);
- }
- }
-
 // Coalescing to a virtual register that is of a sub-register class of the
 // other. Make sure the resulting register is set to the right register class.
- if (CrossRC)
+ if (CP.isCrossClass()) {
 ++numCrossRCs;
-
- // This may happen even if it's cross-rc coalescing. e.g.
- // %reg1026<def> = SUBREG_TO_REG 0, %reg1037<kill>, 4
- // reg1026 -> GR64, reg1037 -> GR32_ABCD. The resulting register will have to
- // be allocated a register from GR64_ABCD.
- if (NewRC)
- mri_->setRegClass(DstReg, NewRC);
+ mri_->setRegClass(CP.getDstReg(), CP.getNewRC());
+ }

 // Remember to delete the copy instruction.
 JoinedCopies.insert(CopyMI);

- UpdateRegDefsUses(SrcReg, DstReg, SubIdx);
+ UpdateRegDefsUses(CP);

 // If we have extended the live range of a physical register, make sure we
 // update live-in lists as well. 
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
- const LiveInterval &VRegInterval = li_->getInterval(SrcReg);
+ if (CP.isPhys()) {
 SmallVector<MachineBasicBlock*, 16> BlockSeq;
- for (LiveInterval::const_iterator I = VRegInterval.begin(),
- E = VRegInterval.end(); I != E; ++I ) {
+ // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the
+ // ranges for this, and they are preserved.
+ LiveInterval &SrcInt = li_->getInterval(CP.getSrcReg());
+ for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end();
+ I != E; ++I ) {
 li_->findLiveInMBBs(I->start, I->end, BlockSeq);
 for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) {
 MachineBasicBlock &block = *BlockSeq[idx];
- if (!block.isLiveIn(DstReg))
- block.addLiveIn(DstReg);
+ if (!block.isLiveIn(CP.getDstReg()))
+ block.addLiveIn(CP.getDstReg());
 }
 BlockSeq.clear();
 }
@@ -1845,32 +1227,17 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {

 // SrcReg is guaranteed to be the register whose live interval is
 // being merged.
- li_->removeInterval(SrcReg);
+ li_->removeInterval(CP.getSrcReg());

 // Update regalloc hint.
- tri_->UpdateRegAllocHint(SrcReg, DstReg, *mf_);
-
- // Manually delete the live interval copy.
- if (SavedLI) {
- SavedLI->clear();
- SavedLI.reset();
- }
-
- // If resulting interval has a preference that no longer fits because of subreg
- // coalescing, just clear the preference.
- unsigned Preference = getRegAllocPreference(ResDstInt->reg, *mf_, mri_, tri_);
- if (Preference && (isExtSubReg || isInsSubReg || isSubRegToReg) &&
- TargetRegisterInfo::isVirtualRegister(ResDstInt->reg)) {
- const TargetRegisterClass *RC = mri_->getRegClass(ResDstInt->reg);
- if (!RC->contains(Preference))
- mri_->setRegAllocationHint(ResDstInt->reg, 0, 0);
- }
+ tri_->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *mf_);

 DEBUG({
- dbgs() << "\t\tJoined. Result = ";
- ResDstInt->print(dbgs(), tri_);
- dbgs() << "\n";
- });
+ LiveInterval &DstInt = li_->getInterval(CP.getDstReg());
+ dbgs() << "\tJoined. Result = ";
+ DstInt.print(dbgs(), tri_);
+ dbgs() << "\n";
+ });

 ++numJoins;
 return true;
@@ -1927,263 +1294,53 @@ static unsigned ComputeUltimateVN(VNInfo *VNI,
 return ThisValNoAssignments[VN] = UltimateVN;
 }

-static bool InVector(VNInfo *Val, const SmallVector<VNInfo*, 8> &V) {
- return std::find(V.begin(), V.end(), Val) != V.end();
-}
-
-static bool isValNoDefMove(const MachineInstr *MI, unsigned DR, unsigned SR,
- const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
- unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
- if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
- ;
- else if (MI->isExtractSubreg()) {
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(1).getReg();
- } else if (MI->isSubregToReg() ||
- MI->isInsertSubreg()) {
- DstReg = MI->getOperand(0).getReg();
- SrcReg = MI->getOperand(2).getReg();
- } else
- return false;
- return (SrcReg == SR || TRI->isSuperRegister(SR, SrcReg)) &&
- (DstReg == DR || TRI->isSuperRegister(DR, DstReg));
-}
-
-/// RangeIsDefinedByCopyFromReg - Return true if the specified live range of
-/// the specified live interval is defined by a copy from the specified
-/// register.
-bool SimpleRegisterCoalescing::RangeIsDefinedByCopyFromReg(LiveInterval &li,
- LiveRange *LR,
- unsigned Reg) {
- unsigned SrcReg = li_->getVNInfoSourceReg(LR->valno);
- if (SrcReg == Reg)
- return true;
- // FIXME: Do isPHIDef and isDefAccurate both need to be tested? 
- if ((LR->valno->isPHIDef() || !LR->valno->isDefAccurate()) &&
- TargetRegisterInfo::isPhysicalRegister(li.reg) &&
- *tri_->getSuperRegisters(li.reg)) {
- // It's a sub-register live interval, we may not have precise information.
- // Re-compute it.
- MachineInstr *DefMI = li_->getInstructionFromIndex(LR->start);
- if (DefMI && isValNoDefMove(DefMI, li.reg, Reg, tii_, tri_)) {
- // Cache computed info.
- LR->valno->def = LR->start;
- LR->valno->setCopy(DefMI);
- return true;
- }
- }
- return false;
-}
-
-
-/// ValueLiveAt - Return true if the LiveRange pointed to by the given
-/// iterator, or any subsequent range with the same value number,
-/// is live at the given point.
-bool SimpleRegisterCoalescing::ValueLiveAt(LiveInterval::iterator LRItr,
- LiveInterval::iterator LREnd,
- SlotIndex defPoint) const {
- for (const VNInfo *valno = LRItr->valno;
- (LRItr != LREnd) && (LRItr->valno == valno); ++LRItr) {
- if (LRItr->contains(defPoint))
- return true;
- }
-
- return false;
-}
-
-
-/// SimpleJoin - Attempt to join the specified interval into this one. The
-/// caller of this method must guarantee that the RHS only contains a single
-/// value number and that the RHS is not defined by a copy from this
-/// interval. This returns false if the intervals are not joinable, or it
-/// joins them and returns true.
-bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS) {
- assert(RHS.containsOneValue());
-
- // Some (potentially more than one) of the value numbers in the current
- // interval may be defined as copies from the RHS. Scan the overlapping
- // portions of the LHS and RHS, keeping track of this and looking for
- // overlapping live ranges that are NOT defined as copies. If these exist, we
- // cannot coalesce.
-
- LiveInterval::iterator LHSIt = LHS.begin(), LHSEnd = LHS.end();
- LiveInterval::iterator RHSIt = RHS.begin(), RHSEnd = RHS.end();
-
- if (LHSIt->start < RHSIt->start) {
- LHSIt = std::upper_bound(LHSIt, LHSEnd, RHSIt->start);
- if (LHSIt != LHS.begin()) --LHSIt;
- } else if (RHSIt->start < LHSIt->start) {
- RHSIt = std::upper_bound(RHSIt, RHSEnd, LHSIt->start);
- if (RHSIt != RHS.begin()) --RHSIt;
- }
-
- SmallVector<VNInfo*, 8> EliminatedLHSVals;
-
- while (1) {
- // Determine if these live intervals overlap.
- bool Overlaps = false;
- if (LHSIt->start <= RHSIt->start)
- Overlaps = LHSIt->end > RHSIt->start;
- else
- Overlaps = RHSIt->end > LHSIt->start;
-
- // If the live intervals overlap, there are two interesting cases: if the
- // LHS interval is defined by a copy from the RHS, it's ok and we record
- // that the LHS value # is the same as the RHS. If it's not, then we cannot
- // coalesce these live ranges and we bail out.
- if (Overlaps) {
- // If we haven't already recorded that this value # is safe, check it.
- if (!InVector(LHSIt->valno, EliminatedLHSVals)) {
- // If it's re-defined by an early clobber somewhere in the live range,
- // then conservatively abort coalescing.
- if (LHSIt->valno->hasRedefByEC())
- return false;
- // Copy from the RHS?
- if (!RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg))
- return false; // Nope, bail out.
-
- if (ValueLiveAt(LHSIt, LHS.end(), RHSIt->valno->def))
- // Here is an interesting situation:
- // BB1:
- // vr1025 = copy vr1024
- // ..
- // BB2:
- // vr1024 = op
- // = vr1025
- // Even though vr1025 is copied from vr1024, it's not safe to
- // coalesce them since the live range of vr1025 intersects the
- // def of vr1024. 
This happens because vr1025 is assigned the
- // value of the previous iteration of vr1024.
+/// JoinIntervals - Attempt to join these two intervals. On failure, this
+/// returns false.
+bool SimpleRegisterCoalescing::JoinIntervals(CoalescerPair &CP) {
+ LiveInterval &RHS = li_->getInterval(CP.getSrcReg());
+ DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), tri_); dbgs() << "\n"; });
+
+ // If a live interval is a physical register, check for interference with any
+ // aliases. The interference check implemented here is a bit more conservative
+ // than the full interference check below. We allow overlapping live ranges
+ // only when one is a copy of the other.
+ if (CP.isPhys()) {
+ for (const unsigned *AS = tri_->getAliasSet(CP.getDstReg()); *AS; ++AS) {
+ if (!li_->hasInterval(*AS))
+ continue;
+ const LiveInterval &LHS = li_->getInterval(*AS);
+ LiveInterval::const_iterator LI = LHS.begin();
+ for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end();
+ RI != RE; ++RI) {
+ LI = std::lower_bound(LI, LHS.end(), RI->start);
+ // Does LHS have an overlapping live range starting before RI?
+ if ((LI != LHS.begin() && LI[-1].end > RI->start) &&
+ (RI->start != RI->valno->def ||
+ !CP.isCoalescable(li_->getInstructionFromIndex(RI->start)))) {
+ DEBUG({
+ dbgs() << "\t\tInterference from alias: ";
+ LHS.print(dbgs(), tri_);
+ dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n";
+ });
 return false;
- EliminatedLHSVals.push_back(LHSIt->valno);
- }
-
- // We know this entire LHS live range is okay, so skip it now.
- if (++LHSIt == LHSEnd) break;
- continue;
- }
+ }

- if (LHSIt->end < RHSIt->end) {
- if (++LHSIt == LHSEnd) break;
- } else {
- // One interesting case to check here. It's possible that we have
- // something like "X3 = Y" which defines a new value number in the LHS,
- // and is the last use of this liverange of the RHS. In this case, we
- // want to notice this copy (so that it gets coalesced away) even though
- // the live ranges don't actually overlap.
- if (LHSIt->start == RHSIt->end) {
- if (InVector(LHSIt->valno, EliminatedLHSVals)) {
- // We already know that this value number is going to be merged in
- // if coalescing succeeds. Just skip the liverange.
- if (++LHSIt == LHSEnd) break;
- } else {
- // If it's re-defined by an early clobber somewhere in the live range,
- // then conservatively abort coalescing.
- if (LHSIt->valno->hasRedefByEC())
+ // Check that LHS ranges beginning in this range are copies.
+ for (; LI != LHS.end() && LI->start < RI->end; ++LI) {
+ if (LI->start != LI->valno->def ||
+ !CP.isCoalescable(li_->getInstructionFromIndex(LI->start))) {
+ DEBUG({
+ dbgs() << "\t\tInterference from alias: ";
+ LHS.print(dbgs(), tri_);
+ dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n";
+ });
 return false;
- // Otherwise, if this is a copy from the RHS, mark it as being merged
- // in.
- if (RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) {
- if (ValueLiveAt(LHSIt, LHS.end(), RHSIt->valno->def))
- // Here is an interesting situation:
- // BB1:
- // vr1025 = copy vr1024
- // ..
- // BB2:
- // vr1024 = op
- // = vr1025
- // Even though vr1025 is copied from vr1024, it's not safe to
- // coalesce them since the live range of vr1025 intersects the
- // def of vr1024. This happens because vr1025 is assigned the
- // value of the previous iteration of vr1024.
- return false;
- EliminatedLHSVals.push_back(LHSIt->valno);
-
- // We know this entire LHS live range is okay, so skip it now. 
- if (++LHSIt == LHSEnd) break; } } } - - if (++RHSIt == RHSEnd) break; - } - } - - // If we got here, we know that the coalescing will be successful and that - // the value numbers in EliminatedLHSVals will all be merged together. Since - // the most common case is that EliminatedLHSVals has a single number, we - // optimize for it: if there is more than one value, we merge them all into - // the lowest numbered one, then handle the interval as if we were merging - // with one value number. - VNInfo *LHSValNo = NULL; - if (EliminatedLHSVals.size() > 1) { - // Loop through all the equal value numbers merging them into the smallest - // one. - VNInfo *Smallest = EliminatedLHSVals[0]; - for (unsigned i = 1, e = EliminatedLHSVals.size(); i != e; ++i) { - if (EliminatedLHSVals[i]->id < Smallest->id) { - // Merge the current notion of the smallest into the smaller one. - LHS.MergeValueNumberInto(Smallest, EliminatedLHSVals[i]); - Smallest = EliminatedLHSVals[i]; - } else { - // Merge into the smallest. - LHS.MergeValueNumberInto(EliminatedLHSVals[i], Smallest); - } } - LHSValNo = Smallest; - } else if (EliminatedLHSVals.empty()) { - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) && - *tri_->getSuperRegisters(LHS.reg)) - // Imprecise sub-register information. Can't handle it. - return false; - llvm_unreachable("No copies from the RHS?"); - } else { - LHSValNo = EliminatedLHSVals[0]; - } - - // Okay, now that there is a single LHS value number that we're merging the - // RHS into, update the value number info for the LHS to indicate that the - // value number is defined where the RHS value number was. - const VNInfo *VNI = RHS.getValNumInfo(0); - LHSValNo->def = VNI->def; - LHSValNo->setCopy(VNI->getCopy()); - - // Okay, the final step is to loop over the RHS live intervals, adding them to - // the LHS. - if (VNI->hasPHIKill()) - LHSValNo->setHasPHIKill(true); - LHS.addKills(LHSValNo, VNI->kills); - LHS.MergeRangesInAsValue(RHS, LHSValNo); - - LHS.ComputeJoinedWeight(RHS); - - // Update regalloc hint if both are virtual registers. - if (TargetRegisterInfo::isVirtualRegister(LHS.reg) && - TargetRegisterInfo::isVirtualRegister(RHS.reg)) { - std::pair<unsigned, unsigned> RHSPref = mri_->getRegAllocationHint(RHS.reg); - std::pair<unsigned, unsigned> LHSPref = mri_->getRegAllocationHint(LHS.reg); - if (RHSPref != LHSPref) - mri_->setRegAllocationHint(LHS.reg, RHSPref.first, RHSPref.second); } - // Update the liveintervals of sub-registers. - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg)) - for (const unsigned *AS = tri_->getSubRegisters(LHS.reg); *AS; ++AS) - li_->getOrCreateInterval(*AS).MergeInClobberRanges(*li_, LHS, - li_->getVNInfoAllocator()); - - return true; -} - -/// JoinIntervals - Attempt to join these two intervals. On failure, this -/// returns false. Otherwise, if one of the intervals being joined is a -/// physreg, this method always canonicalizes LHS to be it. The output -/// "RHS" will not have been modified, so we can use this information -/// below to update aliases. -bool -SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, - bool &Swapped) { // Compute the final value assignment, assuming that the live ranges can be // coalesced. 
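// [Editorial illustration, assumed register numbers] What the mapping loops
// in the new code below record: if the LHS interval (say %reg1025) contains
//
//   %reg1025<def> = COPY %reg1024   ; defines LHS value number #1
//
// and the RHS is %reg1024, LHSValsDefinedFromRHS maps LHS valno #1 to
// whichever RHS value number is live just before the def
// (def.getPrevSlot()). ComputeUltimateVN then folds each such pair into a
// single entry of NewVNInfo so that LiveInterval::join() can renumber both
// intervals consistently.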
SmallVector<int, 16> LHSValNoAssignments; @@ -2192,203 +1349,87 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, DenseMap<VNInfo*, VNInfo*> RHSValsDefinedFromLHS; SmallVector<VNInfo*, 16> NewVNInfo; - // If a live interval is a physical register, conservatively check if any - // of its sub-registers is overlapping the live interval of the virtual - // register. If so, do not coalesce. - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) && - *tri_->getSubRegisters(LHS.reg)) { - // If it's coalescing a virtual register to a physical register, estimate - // its live interval length. This is the *cost* of scanning an entire live - // interval. If the cost is low, we'll do an exhaustive check instead. - - // If this is something like this: - // BB1: - // v1024 = op - // ... - // BB2: - // ... - // RAX = v1024 - // - // That is, the live interval of v1024 crosses a bb. Then we can't rely on - // less conservative check. It's possible a sub-register is defined before - // v1024 (or live in) and live out of BB1. - if (RHS.containsOneValue() && - li_->intervalIsInOneMBB(RHS) && - li_->getApproximateInstructionCount(RHS) <= 10) { - // Perform a more exhaustive check for some common cases. - if (li_->conflictsWithSubPhysRegRef(RHS, LHS.reg, true, JoinedCopies)) - return false; - } else { - for (const unsigned* SR = tri_->getSubRegisters(LHS.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; - } - } - } else if (TargetRegisterInfo::isPhysicalRegister(RHS.reg) && - *tri_->getSubRegisters(RHS.reg)) { - if (LHS.containsOneValue() && - li_->getApproximateInstructionCount(LHS) <= 10) { - // Perform a more exhaustive check for some common cases. - if (li_->conflictsWithSubPhysRegRef(LHS, RHS.reg, false, JoinedCopies)) - return false; - } else { - for (const unsigned* SR = tri_->getSubRegisters(RHS.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && LHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; - } - } - } + LiveInterval &LHS = li_->getOrCreateInterval(CP.getDstReg()); + DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), tri_); dbgs() << "\n"; }); - // Compute ultimate value numbers for the LHS and RHS values. - if (RHS.containsOneValue()) { - // Copies from a liveinterval with a single value are simple to handle and - // very common, handle the special case here. This is important, because - // often RHS is small and LHS is large (e.g. a physreg). - - // Find out if the RHS is defined as a copy from some value in the LHS. - int RHSVal0DefinedFromLHS = -1; - int RHSValID = -1; - VNInfo *RHSValNoInfo = NULL; - VNInfo *RHSValNoInfo0 = RHS.getValNumInfo(0); - unsigned RHSSrcReg = li_->getVNInfoSourceReg(RHSValNoInfo0); - if (RHSSrcReg == 0 || RHSSrcReg != LHS.reg) { - // If RHS is not defined as a copy from the LHS, we can use simpler and - // faster checks to see if the live ranges are coalescable. This joiner - // can't swap the LHS/RHS intervals though. - if (!TargetRegisterInfo::isPhysicalRegister(RHS.reg)) { - return SimpleJoin(LHS, RHS); - } else { - RHSValNoInfo = RHSValNoInfo0; - } - } else { - // It was defined as a copy from the LHS, find out what value # it is. 
- RHSValNoInfo = - LHS.getLiveRangeContaining(RHSValNoInfo0->def.getPrevSlot())->valno; - RHSValID = RHSValNoInfo->id; - RHSVal0DefinedFromLHS = RHSValID; - } + // Loop over the value numbers of the LHS, seeing if any are defined from + // the RHS. + for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? + continue; - LHSValNoAssignments.resize(LHS.getNumValNums(), -1); - RHSValNoAssignments.resize(RHS.getNumValNums(), -1); - NewVNInfo.resize(LHS.getNumValNums(), NULL); - - // Okay, *all* of the values in LHS that are defined as a copy from RHS - // should now get updated. - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (unsigned LHSSrcReg = li_->getVNInfoSourceReg(VNI)) { - if (LHSSrcReg != RHS.reg) { - // If this is not a copy from the RHS, its value number will be - // unmodified by the coalescing. - NewVNInfo[VN] = VNI; - LHSValNoAssignments[VN] = VN; - } else if (RHSValID == -1) { - // Otherwise, it is a copy from the RHS, and we don't already have a - // value# for it. Keep the current value number, but remember it. - LHSValNoAssignments[VN] = RHSValID = VN; - NewVNInfo[VN] = RHSValNoInfo; - LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0; - } else { - // Otherwise, use the specified value #. - LHSValNoAssignments[VN] = RHSValID; - if (VN == (unsigned)RHSValID) { // Else this val# is dead. - NewVNInfo[VN] = RHSValNoInfo; - LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0; - } - } - } else { - NewVNInfo[VN] = VNI; - LHSValNoAssignments[VN] = VN; - } - } + // Never join with a register that has EarlyClobber redefs. + if (VNI->hasRedefByEC()) + return false; - assert(RHSValID != -1 && "Didn't find value #?"); - RHSValNoAssignments[0] = RHSValID; - if (RHSVal0DefinedFromLHS != -1) { - // This path doesn't go through ComputeUltimateVN so just set - // it to anything. - RHSValsDefinedFromLHS[RHSValNoInfo0] = (VNInfo*)1; - } - } else { - // Loop over the value numbers of the LHS, seeing if any are defined from - // the RHS. - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? - continue; + // DstReg is known to be a register in the LHS interval. If the src is + // from the RHS interval, we can use its value #. + if (!CP.isCoalescable(VNI->getCopy())) + continue; - // DstReg is known to be a register in the LHS interval. If the src is - // from the RHS interval, we can use its value #. - if (li_->getVNInfoSourceReg(VNI) != RHS.reg) - continue; + // Figure out the value # from the RHS. + LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + // The copy could be to an aliased physreg. + if (!lr) continue; + LHSValsDefinedFromRHS[VNI] = lr->valno; + } - // Figure out the value # from the RHS. - LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); - assert(lr && "Cannot find live range"); - LHSValsDefinedFromRHS[VNI] = lr->valno; - } + // Loop over the value numbers of the RHS, seeing if any are defined from + // the LHS. + for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? + continue; - // Loop over the value numbers of the RHS, seeing if any are defined from - // the LHS. 
- for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? - continue; + // Never join with a register that has EarlyClobber redefs. + if (VNI->hasRedefByEC()) + return false; - // DstReg is known to be a register in the RHS interval. If the src is - // from the LHS interval, we can use its value #. - if (li_->getVNInfoSourceReg(VNI) != LHS.reg) - continue; + // DstReg is known to be a register in the RHS interval. If the src is + // from the LHS interval, we can use its value #. + if (!CP.isCoalescable(VNI->getCopy())) + continue; - // Figure out the value # from the LHS. - LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); - assert(lr && "Cannot find live range"); - RHSValsDefinedFromLHS[VNI] = lr->valno; - } + // Figure out the value # from the LHS. + LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + // The copy could be to an aliased physreg. + if (!lr) continue; + RHSValsDefinedFromLHS[VNI] = lr->valno; + } - LHSValNoAssignments.resize(LHS.getNumValNums(), -1); - RHSValNoAssignments.resize(RHS.getNumValNums(), -1); - NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums()); + LHSValNoAssignments.resize(LHS.getNumValNums(), -1); + RHSValNoAssignments.resize(RHS.getNumValNums(), -1); + NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums()); - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused()) - continue; - ComputeUltimateVN(VNI, NewVNInfo, - LHSValsDefinedFromRHS, RHSValsDefinedFromLHS, - LHSValNoAssignments, RHSValNoAssignments); + for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + unsigned VN = VNI->id; + if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused()) + continue; + ComputeUltimateVN(VNI, NewVNInfo, + LHSValsDefinedFromRHS, RHSValsDefinedFromLHS, + LHSValNoAssignments, RHSValNoAssignments); + } + for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + unsigned VN = VNI->id; + if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused()) + continue; + // If this value number isn't a copy from the LHS, it's a new number. + if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) { + NewVNInfo.push_back(VNI); + RHSValNoAssignments[VN] = NewVNInfo.size()-1; + continue; } - for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused()) - continue; - // If this value number isn't a copy from the LHS, it's a new number. - if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) { - NewVNInfo.push_back(VNI); - RHSValNoAssignments[VN] = NewVNInfo.size()-1; - continue; - } - ComputeUltimateVN(VNI, NewVNInfo, - RHSValsDefinedFromLHS, LHSValsDefinedFromRHS, - RHSValNoAssignments, LHSValNoAssignments); - } + ComputeUltimateVN(VNI, NewVNInfo, + RHSValsDefinedFromLHS, LHSValsDefinedFromRHS, + RHSValNoAssignments, LHSValNoAssignments); } // Armed with the mappings of LHS/RHS values to ultimate values, walk the @@ -2399,15 +1440,17 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, LiveInterval::const_iterator JE = RHS.end(); // Skip ahead until the first place of potential sharing. 
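// [Editorial note] The "I != IE && J != JE" guards added below appear to be
// needed because the LHS is now obtained with getOrCreateInterval() and may
// be empty (e.g. a physreg destination with no interval yet); the old loop
// assumed both intervals had at least one range. The -1 sentinels pushed
// into the assignment vectors further down guard the same case.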
- if (I->start < J->start) {
- I = std::upper_bound(I, IE, J->start);
- if (I != LHS.begin()) --I;
- } else if (J->start < I->start) {
- J = std::upper_bound(J, JE, I->start);
- if (J != RHS.begin()) --J;
+ if (I != IE && J != JE) {
+ if (I->start < J->start) {
+ I = std::upper_bound(I, IE, J->start);
+ if (I != LHS.begin()) --I;
+ } else if (J->start < I->start) {
+ J = std::upper_bound(J, JE, I->start);
+ if (J != RHS.begin()) --J;
+ }
 }

- while (1) {
+ while (I != IE && J != JE) {
 // Determine if these two live ranges overlap.
 bool Overlaps;
 if (I->start < J->start) {
@@ -2429,13 +1472,10 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
 return false;
 }

- if (I->end < J->end) {
+ if (I->end < J->end)
 ++I;
- if (I == IE) break;
- } else {
+ else
 ++J;
- if (J == JE) break;
- }
 }

 // Update kill info. Some live ranges are extended due to copy coalescing.
@@ -2443,10 +1483,8 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
 E = LHSValsDefinedFromRHS.end(); I != E; ++I) {
 VNInfo *VNI = I->first;
 unsigned LHSValID = LHSValNoAssignments[VNI->id];
- NewVNInfo[LHSValID]->removeKill(VNI->def);
 if (VNI->hasPHIKill())
 NewVNInfo[LHSValID]->setHasPHIKill(true);
- RHS.addKills(NewVNInfo[LHSValID], VNI->kills);
 }

 // Update kill info. Some live ranges are extended due to copy coalescing.
@@ -2454,25 +1492,19 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
 E = RHSValsDefinedFromLHS.end(); I != E; ++I) {
 VNInfo *VNI = I->first;
 unsigned RHSValID = RHSValNoAssignments[VNI->id];
- NewVNInfo[RHSValID]->removeKill(VNI->def);
 if (VNI->hasPHIKill())
 NewVNInfo[RHSValID]->setHasPHIKill(true);
- LHS.addKills(NewVNInfo[RHSValID], VNI->kills);
 }

+ if (LHSValNoAssignments.empty())
+ LHSValNoAssignments.push_back(-1);
+ if (RHSValNoAssignments.empty())
+ RHSValNoAssignments.push_back(-1);
+
 // If we get here, we know that we can coalesce the live ranges. Ask the
 // intervals to coalesce themselves now.
- if ((RHS.ranges.size() > LHS.ranges.size() &&
- TargetRegisterInfo::isVirtualRegister(LHS.reg)) ||
- TargetRegisterInfo::isPhysicalRegister(RHS.reg)) {
- RHS.join(LHS, &RHSValNoAssignments[0], &LHSValNoAssignments[0], NewVNInfo,
- mri_);
- Swapped = true;
- } else {
- LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo,
- mri_);
- Swapped = false;
- }
+ LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo,
+ mri_);

 return true;
 }
@@ -2513,15 +1545,10 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,

 // If this isn't a copy nor an extract_subreg, we can't join intervals. 
unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; bool isInsUndef = false; - if (Inst->isExtractSubreg()) { + if (Inst->isCopy()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(1).getReg(); - } else if (Inst->isInsertSubreg()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(2).getReg(); - if (Inst->getOperand(1).isUndef()) - isInsUndef = true; - } else if (Inst->isInsertSubreg() || Inst->isSubregToReg()) { + } else if (Inst->isSubregToReg()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(2).getReg(); } else if (!tii_->isMoveInstr(*Inst, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) @@ -2650,6 +1677,8 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, E = mri_->use_nodbg_end(); I != E; ++I) { MachineOperand &Use = I.getOperand(); MachineInstr *UseMI = Use.getParent(); + if (UseMI->isIdentityCopy()) + continue; unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && SrcReg == DstReg && SrcSubIdx == DstSubIdx) @@ -2680,7 +1709,8 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, // Ignore identity copies. unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (!(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && + if (!MI->isIdentityCopy() && + !(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && SrcReg == DstReg && SrcSubIdx == DstSubIdx)) for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { MachineOperand &Use = MI->getOperand(i); @@ -2750,10 +1780,9 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // Delete all coalesced copies. bool DoDelete = true; if (!tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - assert((MI->isExtractSubreg() || MI->isInsertSubreg() || - MI->isSubregToReg()) && "Unrecognized copy instruction"); - DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + assert(MI->isCopyLike() && "Unrecognized copy instruction"); + SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) // Do not delete extract_subreg, insert_subreg of physical // registers unless the definition is dead. e.g. // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1 @@ -2762,7 +1791,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { DoDelete = false; } if (MI->allDefsAreDead()) { - LiveInterval &li = li_->getInterval(DstReg); + LiveInterval &li = li_->getInterval(SrcReg); if (!ShortenDeadCopySrcLiveRange(li, MI)) ShortenDeadCopyLiveRange(li, MI); DoDelete = true; @@ -2812,12 +1841,13 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // If the move will be an identity move delete it bool isMove= tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx); - if (isMove && SrcReg == DstReg && SrcSubIdx == DstSubIdx) { + if (MI->isIdentityCopy() || + (isMove && SrcReg == DstReg && SrcSubIdx == DstSubIdx)) { if (li_->hasInterval(SrcReg)) { LiveInterval &RegInt = li_->getInterval(SrcReg); // If def of this move instruction is dead, remove its live range - // from the dstination register's live interval. - if (MI->registerDefIsDead(DstReg)) { + // from the destination register's live interval. 
+ if (MI->allDefsAreDead()) { if (!ShortenDeadCopySrcLiveRange(RegInt, MI)) ShortenDeadCopyLiveRange(RegInt, MI); } @@ -2832,17 +1862,13 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // Check for now unnecessary kill flags. if (li_->isNotInMIMap(MI)) continue; - SlotIndex UseIdx = li_->getInstructionIndex(MI).getUseIndex(); + SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isKill()) continue; unsigned reg = MO.getReg(); if (!reg || !li_->hasInterval(reg)) continue; - LiveInterval &LI = li_->getInterval(reg); - const LiveRange *LR = LI.getLiveRangeContaining(UseIdx); - if (!LR || - (!LR->valno->isKill(UseIdx.getDefIndex()) && - LR->valno->def != UseIdx.getDefIndex())) + if (!li_->getInterval(reg).killedAt(DefIdx)) MO.setIsKill(false); } } diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h index 1be04f3..e154da6 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.h +++ b/lib/CodeGen/SimpleRegisterCoalescing.h @@ -105,21 +105,12 @@ namespace llvm { /// possible to coalesce this interval, but it may be possible if other /// things get coalesced, then it returns true by reference in 'Again'. bool JoinCopy(CopyRec &TheCopy, bool &Again); - + /// JoinIntervals - Attempt to join these two intervals. On failure, this - /// returns false. Otherwise, if one of the intervals being joined is a - /// physreg, this method always canonicalizes DestInt to be it. The output - /// "SrcInt" will not have been modified, so we can use this information - /// below to update aliases. - bool JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, bool &Swapped); - - /// SimpleJoin - Attempt to join the specified interval into this one. The - /// caller of this method must guarantee that the RHS only contains a single - /// value number and that the RHS is not defined by a copy from this - /// interval. This returns false if the intervals are not joinable, or it - /// joins them and returns true. - bool SimpleJoin(LiveInterval &LHS, LiveInterval &RHS); - + /// returns false. The output "SrcInt" will not have been modified, so we can + /// use this information below to update aliases. + bool JoinIntervals(CoalescerPair &CP); + /// Return true if the two specified registers belong to different register /// classes. The registers may be either phys or virt regs. bool differingRegisterClasses(unsigned RegA, unsigned RegB) const; @@ -128,8 +119,7 @@ namespace llvm { /// the source value number is defined by a copy from the destination reg /// see if we can merge these two destination reg valno# into a single /// value number, eliminating a copy. - bool AdjustCopiesBackFrom(LiveInterval &IntA, LiveInterval &IntB, - MachineInstr *CopyMI); + bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); /// HasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. @@ -140,8 +130,7 @@ namespace llvm { /// If the source value number is defined by a commutable instruction and /// its other operand is coalesced to the copy dest register, see if we /// can transform the copy into a noop by commuting the definition. 
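/// For example (an illustrative sketch; A/B number the values of the two
/// intervals):
///   A3 = op A2 B0<kill>
///     ...
///   B1 = A3         <- this copy
///     ...
///       = op A3     <- more uses
/// ==>
///   B2 = op B0 A2<kill>
///     ...
///   B1 = B2         <- now an identity copy
///     ...
///       = op B2     <- more uses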
- bool RemoveCopyByCommutingDef(LiveInterval &IntA, LiveInterval &IntB,
- MachineInstr *CopyMI);
+ bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
/// TrimLiveIntervalToLastUse - If there is a last use in the same basic
/// block as the copy instruction, trim the live interval to the last use
@@ -155,28 +144,6 @@ namespace llvm {
bool ReMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg,
unsigned DstSubIdx, MachineInstr *CopyMI);
- /// CanCoalesceWithImpDef - Returns true if the specified copy instruction
- /// from an implicit def to another register can be coalesced away.
- bool CanCoalesceWithImpDef(MachineInstr *CopyMI,
- LiveInterval &li, LiveInterval &ImpLi) const;
-
- /// TurnCopiesFromValNoToImpDefs - The specified value# is defined by an
- /// implicit_def and it is being removed. Turn all copies from this value#
- /// into implicit_defs.
- void TurnCopiesFromValNoToImpDefs(LiveInterval &li, VNInfo *VNI);
-
- /// isWinToJoinVRWithSrcPhysReg - Return true if it's worth while to join a
- /// a virtual destination register with physical source register.
- bool isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI,
- MachineBasicBlock *CopyMBB,
- LiveInterval &DstInt, LiveInterval &SrcInt);
-
- /// isWinToJoinVRWithDstPhysReg - Return true if it's worth while to join a
- /// copy from a virtual source register to a physical destination register.
- bool isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI,
- MachineBasicBlock *CopyMBB,
- LiveInterval &DstInt, LiveInterval &SrcInt);
-
/// isWinToJoinCrossClass - Return true if it's profitable to coalesce
/// two virtual registers from different register classes.
bool isWinToJoinCrossClass(unsigned SrcReg,
@@ -185,43 +152,12 @@ namespace llvm {
const TargetRegisterClass *DstRC,
const TargetRegisterClass *NewRC);
- /// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual
- /// register with a physical register, check if any of the virtual register
- /// operand is a sub-register use or def. If so, make sure it won't result
- /// in an illegal extract_subreg or insert_subreg instruction.
- bool HasIncompatibleSubRegDefUse(MachineInstr *CopyMI,
- unsigned VirtReg, unsigned PhysReg);
-
- /// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce
- /// an extract_subreg where dst is a physical register, e.g.
- /// cl = EXTRACT_SUBREG reg1024, 1
- bool CanJoinExtractSubRegToPhysReg(unsigned DstReg, unsigned SrcReg,
- unsigned SubIdx, unsigned &RealDstReg);
-
- /// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce
- /// an insert_subreg where src is a physical register, e.g.
- /// reg1024 = INSERT_SUBREG reg1024, c1, 0
- bool CanJoinInsertSubRegToPhysReg(unsigned DstReg, unsigned SrcReg,
- unsigned SubIdx, unsigned &RealDstReg);
-
- /// ValueLiveAt - Return true if the LiveRange pointed to by the given
- /// iterator, or any subsequent range with the same value number,
- /// is live at the given point.
- bool ValueLiveAt(LiveInterval::iterator LRItr, LiveInterval::iterator LREnd,
- SlotIndex defPoint) const;
-
- /// RangeIsDefinedByCopyFromReg - Return true if the specified live range of
- /// the specified live interval is defined by a copy from the specified
- /// register.
- bool RangeIsDefinedByCopyFromReg(LiveInterval &li, LiveRange *LR,
- unsigned Reg);
-
/// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
/// update the subregister number if it is not zero.
If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. - void UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx); + void UpdateRegDefsUses(const CoalescerPair &CP); /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy. /// Return true if live interval is removed. @@ -238,6 +174,10 @@ namespace llvm { /// it as well. bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI); + /// RemoveCopyFlag - If DstReg is no longer defined by CopyMI, clear the + /// VNInfo copy flag for DstReg and all aliases. + void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI); + /// lastRegisterUse - Returns the last use of the specific register between /// cycles Start and End or NULL if there are no uses. MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End, diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 059e8d6..e90869d 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -46,6 +46,8 @@ namespace { Constant *UnregisterFn; Constant *BuiltinSetjmpFn; Constant *FrameAddrFn; + Constant *StackAddrFn; + Constant *StackRestoreFn; Constant *LSDAAddrFn; Value *PersonalityFn; Constant *SelectorFn; @@ -69,7 +71,7 @@ namespace { void insertCallSiteStore(Instruction *I, int Number, Value *CallSite); void markInvokeCallSite(InvokeInst *II, int InvokeNo, Value *CallSite, SwitchInst *CatchSwitch); - void splitLiveRangesLiveAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes); + void splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes); bool insertSjLjEHSupport(Function &F); }; } // end anonymous namespace @@ -107,6 +109,8 @@ bool SjLjEHPass::doInitialization(Module &M) { PointerType::getUnqual(FunctionContextTy), (Type *)0); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); + StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); + StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); SelectorFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_selector); @@ -175,8 +179,10 @@ static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) { /// we spill into a stack location, guaranteeing that there is nothing live /// across the unwind edge. This process also splits all critical edges /// coming out of invoke's. +/// FIXME: Move this function to a common utility file (Local.cpp?) so +/// both SjLj and LowerInvoke can use it. void SjLjEHPass:: -splitLiveRangesLiveAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) { +splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) { // First step, split all critical edges from invoke instructions. for (unsigned i = 0, e = Invokes.size(); i != e; ++i) { InvokeInst *II = Invokes[i]; @@ -198,16 +204,33 @@ splitLiveRangesLiveAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) { ++AfterAllocaInsertPt; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { - // This is always a no-op cast because we're casting AI to AI->getType() so - // src and destination types are identical. BitCast is the only possibility. 
- CastInst *NC = new BitCastInst( - AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); - AI->replaceAllUsesWith(NC); - // Normally its is forbidden to replace a CastInst's operand because it - // could cause the opcode to reflect an illegal conversion. However, we're - // replacing it here with the same value it was constructed with to simply - // make NC its user. - NC->setOperand(0, AI); + const Type *Ty = AI->getType(); + // Aggregate types can't be cast, but are legal argument types, so we have + // to handle them differently. We use an extract/insert pair as a + // lightweight method to achieve the same goal. + if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { + Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt); + Instruction *NI = InsertValueInst::Create(AI, EI, 0); + NI->insertAfter(EI); + AI->replaceAllUsesWith(NI); + // Set the operand of the instructions back to the AllocaInst. + EI->setOperand(0, AI); + NI->setOperand(0, AI); + } else { + // This is always a no-op cast because we're casting AI to AI->getType() + // so src and destination types are identical. BitCast is the only + // possibility. + CastInst *NC = new BitCastInst( + AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); + AI->replaceAllUsesWith(NC); + // Set the operand of the cast instruction back to the AllocaInst. + // Normally it's forbidden to replace a CastInst's operand because it + // could cause the opcode to reflect an illegal conversion. However, + // we're replacing it here with the same value it was constructed with. + // We do this because the above replaceAllUsesWith() clobbered the + // operand, but we want this one to remain. + NC->setOperand(0, AI); + } } // Finally, scan the code looking for instructions with bad live ranges. @@ -266,6 +289,9 @@ splitLiveRangesLiveAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) { } // If we decided we need a spill, do it. + // FIXME: Spilling this way is overkill, as it forces all uses of + // the value to be reloaded from the stack slot, even those that aren't + // in the unwind blocks. We should be more selective. if (NeedsSpill) { ++NumSpilled; DemoteRegToStack(*Inst, true); @@ -294,22 +320,34 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // If we don't have any invokes or unwinds, there's nothing to do. if (Unwinds.empty() && Invokes.empty()) return false; - // Find the eh.selector.* and eh.exception calls. We'll use the first - // eh.selector to determine the right personality function to use. For - // SJLJ, we always use the same personality for the whole function, - // not on a per-selector basis. + // Find the eh.selector.*, eh.exception and alloca calls. + // + // Remember any allocas() that aren't in the entry block, as the + // jmpbuf saved SP will need to be updated for them. + // + // We'll use the first eh.selector to determine the right personality + // function to use. For SJLJ, we always use the same personality for the + // whole function, not on a per-selector basis. // FIXME: That's a bit ugly. Better way? SmallVector<CallInst*,16> EH_Selectors; SmallVector<CallInst*,16> EH_Exceptions; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + SmallVector<Instruction*,16> JmpbufUpdatePoints; + // Note: Skip the entry block since there's nothing there that interests + // us. eh.selector and eh.exception shouldn't ever be there, and we + // want to disregard any allocas that are there. 
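// (Note on the loop form below: the pre-increment in the for-condition is
// what skips the entry block -- BB is advanced past F.begin() before the
// body first runs. An equivalent, more explicit sketch, assuming F has at
// least one basic block:
//   for (Function::iterator BB = llvm::next(F.begin()), E = F.end();
//        BB != E; ++BB) { ... })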
+ for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
if (CallInst *CI = dyn_cast<CallInst>(I)) {
if (CI->getCalledFunction() == SelectorFn) {
- if (!PersonalityFn) PersonalityFn = CI->getOperand(2);
+ if (!PersonalityFn) PersonalityFn = CI->getArgOperand(1);
EH_Selectors.push_back(CI);
} else if (CI->getCalledFunction() == ExceptionFn) {
EH_Exceptions.push_back(CI);
+ } else if (CI->getCalledFunction() == StackRestoreFn) {
+ JmpbufUpdatePoints.push_back(CI);
}
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ JmpbufUpdatePoints.push_back(AI);
}
}
}
@@ -329,7 +367,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// we spill into a stack location, guaranteeing that there is nothing live
// across the unwind edge. This process also splits all critical edges
// coming out of invoke's.
- splitLiveRangesLiveAcrossInvokes(Invokes);
+ splitLiveRangesAcrossInvokes(Invokes);
BasicBlock *EntryBB = F.begin();
// Create an alloca for the incoming jump buffer ptr and the new jump buffer
@@ -419,7 +457,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// Populate the Function Context
// 1. LSDA address
// 2. Personality function address
- // 3. jmpbuf (save FP and call eh.sjlj.setjmp)
+ // 3. jmpbuf (save SP, FP and call eh.sjlj.setjmp)
// LSDA address
Idxs[0] = Zero;
@@ -440,31 +478,41 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
new StoreInst(PersonalityFn, PersonalityFieldPtr, true,
EntryBB->getTerminator());
- // Save the frame pointer.
+ // Save the frame pointer.
Idxs[1] = ConstantInt::get(Int32Ty, 5);
- Value *FieldPtr
+ Value *JBufPtr
= GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
"jbuf_gep",
EntryBB->getTerminator());
Idxs[1] = ConstantInt::get(Int32Ty, 0);
- Value *ElemPtr =
- GetElementPtrInst::Create(FieldPtr, Idxs, Idxs+2, "jbuf_fp_gep",
+ Value *FramePtr =
+ GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_fp_gep",
EntryBB->getTerminator());
Value *Val = CallInst::Create(FrameAddrFn,
ConstantInt::get(Int32Ty, 0),
"fp",
EntryBB->getTerminator());
- new StoreInst(Val, ElemPtr, true, EntryBB->getTerminator());
- // Call the setjmp instrinsic. It fills in the rest of the jmpbuf
+ new StoreInst(Val, FramePtr, true, EntryBB->getTerminator());
+
+ // Save the stack pointer.
+ Idxs[1] = ConstantInt::get(Int32Ty, 2);
+ Value *StackPtr =
+ GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_sp_gep",
+ EntryBB->getTerminator());
+
+ Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator());
+ new StoreInst(Val, StackPtr, true, EntryBB->getTerminator());
+
+ // Call the setjmp intrinsic. It fills in the rest of the jmpbuf.
Value *SetjmpArg =
- CastInst::Create(Instruction::BitCast, FieldPtr,
+ CastInst::Create(Instruction::BitCast, JBufPtr,
Type::getInt8PtrTy(F.getContext()), "",
EntryBB->getTerminator());
Value *DispatchVal = CallInst::Create(BuiltinSetjmpFn, SetjmpArg,
"dispatch",
EntryBB->getTerminator());
- // check the return value of the setjmp. non-zero goes to dispatcher
+ // check the return value of the setjmp. non-zero goes to dispatcher.
Value *IsNormal = new ICmpInst(EntryBB->getTerminator(),
ICmpInst::ICMP_EQ, DispatchVal, Zero,
"notunwind");
@@ -509,6 +557,16 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
Unwinds[i]->eraseFromParent();
}
+ // Following any allocas not in the entry block, update the saved SP
+ // in the jmpbuf to the new value.
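// (Why the update is needed, as a sketch: a dynamic alloca or a stackrestore
// moves SP after the entry-block setjmp, so a longjmp back into this frame
// would otherwise restore a stale SP and the unwind path could clobber that
// stack memory. In IR terms each recorded update point gets:
//   %sp = call i8* @llvm.stacksave()
//   store volatile i8* %sp, i8** %jbuf_sp_gep
// which is what the loop below emits via StackAddrFn and StackPtr.)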
+ for (unsigned i = 0, e = JmpbufUpdatePoints.size(); i != e; ++i) { + Instruction *AI = JmpbufUpdatePoints[i]; + Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp"); + StackAddr->insertAfter(AI); + Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true); + StoreStackAddr->insertAfter(StackAddr); + } + // Finally, for any returns from this function, if this function contains an // invoke, add a call to unregister the function context. for (unsigned i = 0, e = Returns.size(); i != e; ++i) diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 6110ef5..7a227cf 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -213,9 +213,11 @@ void SlotIndexes::dump() const { // Print a SlotIndex to a raw_ostream. void SlotIndex::print(raw_ostream &os) const { - os << getIndex(); + os << entry().getIndex(); if (isPHI()) os << "*"; + else + os << "LudS"[getSlot()]; } // Dump a SlotIndex to stderr. diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp index a7b2efe..56bcb28 100644 --- a/lib/CodeGen/Spiller.cpp +++ b/lib/CodeGen/Spiller.cpp @@ -14,18 +14,20 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <set> using namespace llvm; namespace { - enum SpillerName { trivial, standard, splitting }; + enum SpillerName { trivial, standard, splitting, inline_ }; } static cl::opt<SpillerName> @@ -35,6 +37,7 @@ spillerOpt("spiller", cl::values(clEnumVal(trivial, "trivial spiller"), clEnumVal(standard, "default spiller"), clEnumVal(splitting, "splitting spiller"), + clEnumValN(inline_, "inline", "inline spiller"), clEnumValEnd), cl::init(standard)); @@ -53,8 +56,8 @@ protected: const TargetInstrInfo *tii; const TargetRegisterInfo *tri; VirtRegMap *vrm; - - /// Construct a spiller base. + + /// Construct a spiller base. SpillerBase(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) : mf(mf), lis(lis), vrm(vrm) { @@ -67,7 +70,8 @@ protected: /// Add spill ranges for every use/def of the live interval, inserting loads /// immediately before each use, and stores after each def. No folding or /// remat is attempted. - std::vector<LiveInterval*> trivialSpillEverywhere(LiveInterval *li) { + void trivialSpillEverywhere(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals) { DEBUG(dbgs() << "Spilling everywhere " << *li << "\n"); assert(li->weight != HUGE_VALF && @@ -78,8 +82,6 @@ protected: DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n"); - std::vector<LiveInterval*> added; - const TargetRegisterClass *trc = mri->getRegClass(li->reg); unsigned ss = vrm->assignVirt2StackSlot(li->reg); @@ -96,7 +98,7 @@ protected: do { ++regItr; } while (regItr != mri->reg_end() && (&*regItr == mi)); - + // Collect uses & defs for this instr. SmallVector<unsigned, 2> indices; bool hasUse = false; @@ -116,7 +118,7 @@ protected: vrm->assignVirt2StackSlot(newVReg, ss); LiveInterval *newLI = &lis->getOrCreateInterval(newVReg); newLI->weight = HUGE_VALF; - + // Update the reg operands & kill flags. 
for (unsigned i = 0; i < indices.size(); ++i) { unsigned mopIdx = indices[i]; @@ -136,10 +138,10 @@ protected: MachineInstr *loadInstr(prior(miItr)); SlotIndex loadIndex = lis->InsertMachineInstrInMaps(loadInstr).getDefIndex(); + vrm->addSpillSlotUse(ss, loadInstr); SlotIndex endIndex = loadIndex.getNextIndex(); VNInfo *loadVNI = newLI->getNextValue(loadIndex, 0, true, lis->getVNInfoAllocator()); - loadVNI->addKill(endIndex); newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI)); } @@ -150,17 +152,15 @@ protected: MachineInstr *storeInstr(llvm::next(miItr)); SlotIndex storeIndex = lis->InsertMachineInstrInMaps(storeInstr).getDefIndex(); + vrm->addSpillSlotUse(ss, storeInstr); SlotIndex beginIndex = storeIndex.getPrevIndex(); VNInfo *storeVNI = newLI->getNextValue(beginIndex, 0, true, lis->getVNInfoAllocator()); - storeVNI->addKill(storeIndex); newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI)); } - added.push_back(newLI); + newIntervals.push_back(newLI); } - - return added; } }; @@ -176,11 +176,12 @@ public: TrivialSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) : SpillerBase(mf, lis, vrm) {} - std::vector<LiveInterval*> spill(LiveInterval *li, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex*) { + void spill(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &, + SlotIndex*) { // Ignore spillIs - we don't use it. - return trivialSpillEverywhere(li); + trivialSpillEverywhere(li, newIntervals); } }; @@ -200,10 +201,13 @@ public: : lis(lis), loopInfo(loopInfo), vrm(vrm) {} /// Falls back on LiveIntervals::addIntervalsForSpills. - std::vector<LiveInterval*> spill(LiveInterval *li, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex*) { - return lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm); + void spill(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs, + SlotIndex*) { + std::vector<LiveInterval*> added = + lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm); + newIntervals.insert(newIntervals.end(), added.begin(), added.end()); } }; @@ -214,7 +218,7 @@ namespace { /// When a call to spill is placed this spiller will first try to break the /// interval up into its component values (one new interval per value). /// If this fails, or if a call is placed to spill a previously split interval -/// then the spiller falls back on the standard spilling mechanism. +/// then the spiller falls back on the standard spilling mechanism. 
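// (A minimal caller-side sketch of the spill interface these classes now
// implement, assuming a Spiller *spiller obtained from createSpiller();
// variable names are illustrative:
//   std::vector<LiveInterval*> NewIntervals;
//   SmallVector<LiveInterval*, 8> SpillIs; // intervals already being spilled
//   spiller->spill(LI, NewIntervals, SpillIs);
//   // NewIntervals now holds whatever intervals the strategy created.
// See the updated Spiller.h interface later in this patch.)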
class SplittingSpiller : public StandardSpiller {
public:
SplittingSpiller(MachineFunction *mf, LiveIntervals *lis,
@@ -226,22 +230,21 @@ public:
tri = mf->getTarget().getRegisterInfo();
}
- std::vector<LiveInterval*> spill(LiveInterval *li,
- SmallVectorImpl<LiveInterval*> &spillIs,
- SlotIndex *earliestStart) {
-
- if (worthTryingToSplit(li)) {
- return tryVNISplit(li, earliestStart);
- }
- // else
- return StandardSpiller::spill(li, spillIs, earliestStart);
+ void spill(LiveInterval *li,
+ std::vector<LiveInterval*> &newIntervals,
+ SmallVectorImpl<LiveInterval*> &spillIs,
+ SlotIndex *earliestStart) {
+ if (worthTryingToSplit(li))
+ tryVNISplit(li, earliestStart);
+ else
+ StandardSpiller::spill(li, newIntervals, spillIs, earliestStart);
}
private:
MachineRegisterInfo *mri;
const TargetInstrInfo *tii;
- const TargetRegisterInfo *tri;
+ const TargetRegisterInfo *tri;
DenseSet<LiveInterval*> alreadySplit;
bool worthTryingToSplit(LiveInterval *li) const {
@@ -258,18 +261,18 @@ private:
SmallVector<VNInfo*, 4> vnis;
std::copy(li->vni_begin(), li->vni_end(), std::back_inserter(vnis));
-
+
for (SmallVectorImpl<VNInfo*>::iterator vniItr = vnis.begin(),
vniEnd = vnis.end(); vniItr != vniEnd; ++vniItr) {
VNInfo *vni = *vniItr;
-
- // Skip unused VNIs, or VNIs with no kills.
- if (vni->isUnused() || vni->kills.empty())
+
+ // Skip unused VNIs.
+ if (vni->isUnused())
continue;
DEBUG(dbgs() << "  Extracted Val #" << vni->id << " as ");
LiveInterval *splitInterval = extractVNI(li, vni);
-
+
if (splitInterval != 0) {
DEBUG(dbgs() << *splitInterval << "\n");
added.push_back(splitInterval);
@@ -281,12 +284,12 @@ private:
} else {
DEBUG(dbgs() << "0\n");
}
- }
+ }
DEBUG(dbgs() << "Original LI: " << *li << "\n");
// If the original interval still contains some live ranges
- // add it to added and alreadySplit.
+ // add it to added and alreadySplit.
if (!li->empty()) {
added.push_back(li);
alreadySplit.insert(li);
@@ -302,16 +305,15 @@ private:
/// Extract the given value number from the interval.
LiveInterval* extractVNI(LiveInterval *li, VNInfo *vni) const {
assert(vni->isDefAccurate() || vni->isPHIDef());
- assert(!vni->kills.empty());
- // Create a new vreg and live interval, copy VNI kills & ranges over.
+ // Create a new vreg and live interval, copy VNI ranges over.
const TargetRegisterClass *trc = mri->getRegClass(li->reg);
unsigned newVReg = mri->createVirtualRegister(trc);
vrm->grow();
LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
VNInfo *newVNI = newLI->createValueCopy(vni, lis->getVNInfoAllocator());
- // Start by copying all live ranges in the VN to the new interval.
+ // Start by copying all live ranges in the VN to the new interval.
for (LiveInterval::iterator rItr = li->begin(), rEnd = li->end();
rItr != rEnd; ++rItr) {
if (rItr->valno == vni) {
@@ -319,7 +321,7 @@ private:
}
}
- // Erase the old VNI & ranges.
+ // Erase the old VNI & ranges.
li->removeValNo(vni);
// Collect all current uses of the register belonging to the given VNI.
@@ -336,15 +338,13 @@ private:
// Insert a copy at the start of the MBB. The range preceding the
// copy will be attached to the original LiveInterval.
MachineBasicBlock *defMBB = lis->getMBBFromIndex(newVNI->def); - tii->copyRegToReg(*defMBB, defMBB->begin(), newVReg, li->reg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = defMBB->begin(); - copyMI->addRegisterKilled(li->reg, tri); + MachineInstr *copyMI = BuildMI(*defMBB, defMBB->begin(), DebugLoc(), + tii->get(TargetOpcode::COPY), newVReg) + .addReg(li->reg, RegState::Kill); SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); VNInfo *phiDefVNI = li->getNextValue(lis->getMBBStartIdx(defMBB), 0, false, lis->getVNInfoAllocator()); phiDefVNI->setIsPHIDef(true); - phiDefVNI->addKill(copyIdx.getDefIndex()); li->addRange(LiveRange(phiDefVNI->def, copyIdx.getDefIndex(), phiDefVNI)); LiveRange *oldPHIDefRange = newLI->getLiveRangeContaining(lis->getMBBStartIdx(defMBB)); @@ -367,8 +367,8 @@ private: newVNI->setIsPHIDef(false); // not a PHI def anymore. newVNI->setIsDefAccurate(true); } else { - // non-PHI def. Rename the def. If it's two-addr that means renaming the use - // and inserting a new copy too. + // non-PHI def. Rename the def. If it's two-addr that means renaming the + // use and inserting a new copy too. MachineInstr *defInst = lis->getInstructionFromIndex(newVNI->def); // We'll rename this now, so we can remove it from uses. uses.erase(defInst); @@ -384,38 +384,26 @@ private: twoAddrUseIsUndef = true; } } - + SlotIndex defIdx = lis->getInstructionIndex(defInst); newVNI->def = defIdx.getDefIndex(); if (isTwoAddr && !twoAddrUseIsUndef) { MachineBasicBlock *defMBB = defInst->getParent(); - tii->copyRegToReg(*defMBB, defInst, newVReg, li->reg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = prior(MachineBasicBlock::iterator(defInst)); + MachineInstr *copyMI = BuildMI(*defMBB, defInst, DebugLoc(), + tii->get(TargetOpcode::COPY), newVReg) + .addReg(li->reg, RegState::Kill); SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); - copyMI->addRegisterKilled(li->reg, tri); LiveRange *origUseRange = li->getLiveRangeContaining(newVNI->def.getUseIndex()); - VNInfo *origUseVNI = origUseRange->valno; origUseRange->end = copyIdx.getDefIndex(); - bool updatedKills = false; - for (unsigned k = 0; k < origUseVNI->kills.size(); ++k) { - if (origUseVNI->kills[k] == defIdx.getDefIndex()) { - origUseVNI->kills[k] = copyIdx.getDefIndex(); - updatedKills = true; - break; - } - } - assert(updatedKills && "Failed to update VNI kill list."); VNInfo *copyVNI = newLI->getNextValue(copyIdx.getDefIndex(), copyMI, true, lis->getVNInfoAllocator()); - copyVNI->addKill(defIdx.getDefIndex()); LiveRange copyRange(copyIdx.getDefIndex(),defIdx.getDefIndex(),copyVNI); newLI->addRange(copyRange); - } + } } - + for (std::set<MachineInstr*>::iterator usesItr = uses.begin(), usesEnd = uses.end(); usesItr != usesEnd; ++usesItr) { @@ -435,7 +423,7 @@ private: // Check if this instr is two address. unsigned useOpIdx = useInst->findRegisterUseOperandIdx(li->reg); bool isTwoAddress = useInst->isRegTiedToDefOperand(useOpIdx); - + // Rename uses (and defs for two-address instrs). for (unsigned i = 0; i < useInst->getNumOperands(); ++i) { MachineOperand &mo = useInst->getOperand(i); @@ -451,10 +439,9 @@ private: // reg. 
MachineBasicBlock *useMBB = useInst->getParent();
MachineBasicBlock::iterator useItr(useInst);
- tii->copyRegToReg(*useMBB, llvm::next(useItr), li->reg, newVReg, trc, trc,
- DebugLoc());
- MachineInstr *copyMI = llvm::next(useItr);
- copyMI->addRegisterKilled(newVReg, tri);
+ MachineInstr *copyMI = BuildMI(*useMBB, llvm::next(useItr), DebugLoc(),
+ tii->get(TargetOpcode::COPY), newVReg)
+ .addReg(li->reg, RegState::Kill);
SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
// Change the old two-address defined range & vni to start at
@@ -470,56 +457,44 @@ private:
VNInfo *copyVNI = newLI->getNextValue(useIdx.getDefIndex(), 0, true,
lis->getVNInfoAllocator());
- copyVNI->addKill(copyIdx.getDefIndex());
LiveRange copyRange(useIdx.getDefIndex(),copyIdx.getDefIndex(),copyVNI);
newLI->addRange(copyRange);
}
}
-
- // Iterate over any PHI kills - we'll need to insert new copies for them.
- for (VNInfo::KillSet::iterator
- killItr = newVNI->kills.begin(), killEnd = newVNI->kills.end();
- killItr != killEnd; ++killItr) {
- SlotIndex killIdx(*killItr);
- if (killItr->isPHI()) {
- MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx);
- LiveRange *oldKillRange =
- newLI->getLiveRangeContaining(killIdx);
-
- assert(oldKillRange != 0 && "No kill range?");
-
- tii->copyRegToReg(*killMBB, killMBB->getFirstTerminator(),
- li->reg, newVReg, trc, trc,
- DebugLoc());
- MachineInstr *copyMI = prior(killMBB->getFirstTerminator());
- copyMI->addRegisterKilled(newVReg, tri);
- SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
- // Save the current end. We may need it to add a new range if the
- // current range runs of the end of the MBB.
- SlotIndex newKillRangeEnd = oldKillRange->end;
- oldKillRange->end = copyIdx.getDefIndex();
+ // Iterate over any PHI kills - we'll need to insert new copies for them.
+ for (LiveInterval::iterator LRI = newLI->begin(), LRE = newLI->end();
+ LRI != LRE; ++LRI) {
+ if (LRI->valno != newVNI || LRI->end.isPHI())
+ continue;
+ SlotIndex killIdx = LRI->end;
+ MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx);
+ MachineInstr *copyMI = BuildMI(*killMBB, killMBB->getFirstTerminator(),
+ DebugLoc(), tii->get(TargetOpcode::COPY),
+ li->reg)
+ .addReg(newVReg, RegState::Kill);
+ SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
- if (newKillRangeEnd != lis->getMBBEndIdx(killMBB)) {
- assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB) &&
- "PHI kill range doesn't reach kill-block end. Not sane.");
- newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB),
- newKillRangeEnd, newVNI));
- }
+ // Save the current end. We may need it to add a new range if the
+ // current range runs off the end of the MBB.
+ SlotIndex newKillRangeEnd = LRI->end;
+ LRI->end = copyIdx.getDefIndex();
- *killItr = oldKillRange->end;
- VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(),
- copyMI, true,
- lis->getVNInfoAllocator());
- newKillVNI->addKill(lis->getMBBTerminatorGap(killMBB));
- newKillVNI->setHasPHIKill(true);
- li->addRange(LiveRange(copyIdx.getDefIndex(),
- lis->getMBBEndIdx(killMBB),
- newKillVNI));
+ if (newKillRangeEnd != lis->getMBBEndIdx(killMBB)) {
+ assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB) &&
+ "PHI kill range doesn't reach kill-block end.
Not sane."); + newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB), + newKillRangeEnd, newVNI)); } + VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(), + copyMI, true, + lis->getVNInfoAllocator()); + newKillVNI->setHasPHIKill(true); + li->addRange(LiveRange(copyIdx.getDefIndex(), + lis->getMBBEndIdx(killMBB), + newKillVNI)); } - newVNI->setHasPHIKill(false); return newLI; @@ -530,6 +505,13 @@ private: } // end anonymous namespace +namespace llvm { +Spiller *createInlineSpiller(MachineFunction*, + LiveIntervals*, + const MachineLoopInfo*, + VirtRegMap*); +} + llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis, const MachineLoopInfo *loopInfo, VirtRegMap *vrm) { @@ -538,5 +520,6 @@ llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis, case trivial: return new TrivialSpiller(mf, lis, vrm); case standard: return new StandardSpiller(lis, loopInfo, vrm); case splitting: return new SplittingSpiller(mf, lis, loopInfo, vrm); + case inline_: return createInlineSpiller(mf, lis, loopInfo, vrm); } } diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h index dda52e8..450447b 100644 --- a/lib/CodeGen/Spiller.h +++ b/lib/CodeGen/Spiller.h @@ -33,11 +33,19 @@ namespace llvm { public: virtual ~Spiller() = 0; - /// Spill the given live range. The method used will depend on the Spiller - /// implementation selected. - virtual std::vector<LiveInterval*> spill(LiveInterval *li, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex *earliestIndex = 0) = 0; + /// spill - Spill the given live interval. The method used will depend on + /// the Spiller implementation selected. + /// + /// @param li The live interval to be spilled. + /// @param spillIs A list of intervals that are about to be spilled, + /// and so cannot be used for remat etc. + /// @param newIntervals The newly created intervals will be appended here. + /// @param earliestIndex The earliest point for splitting. (OK, it's another + /// pointer to the allocator guts). + virtual void spill(LiveInterval *li, + std::vector<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs, + SlotIndex *earliestIndex = 0) = 0; }; diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 8a6a727..ca5c28c 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -136,7 +136,7 @@ bool StackProtector::RequiresStackProtector() const { bool StackProtector::InsertStackProtectors() { BasicBlock *FailBB = 0; // The basic block to jump to if check fails. AllocaInst *AI = 0; // Place on stack that stores the stack guard. - Constant *StackGuardVar = 0; // The stack guard variable. + Value *StackGuardVar = 0; // The stack guard variable. 
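// (The guard value can now come from two places, per the hunk below: a fixed
// offset in some address space reported by TLI->getStackCookieLocation() --
// e.g. a TLS-segment slot on x86 -- materialized as an inttoptr constant, or
// else the traditional __stack_chk_guard global. A sketch of the constant
// form, assuming AddressSpace and Offset came from that target hook:
//   Constant *Off = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
//   StackGuardVar = ConstantExpr::getIntToPtr(
//       Off, PointerType::get(Type::getInt8PtrTy(Ctx), AddressSpace));
// )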
for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
BasicBlock *BB = I++;
@@ -153,9 +161,17 @@ bool StackProtector::InsertStackProtectors() {
// StackGuard = load __stack_chk_guard
// call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
//
- PointerType *PtrTy = PointerType::getUnqual(
- Type::getInt8Ty(RI->getContext()));
- StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+ const PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+ unsigned AddressSpace, Offset;
+ if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
+ Constant *OffsetVal =
+ ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+
+ StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
+ PointerType::get(PtrTy, AddressSpace));
+ } else {
+ StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+ }
BasicBlock &Entry = F->getEntryBlock();
Instruction *InsPt = &Entry.front();
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 7f3b452..eff3c33 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -19,6 +19,7 @@
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -508,8 +509,7 @@ bool StackSlotColoring::PropagateBackward(MachineBasicBlock::iterator MII,
// Abort if the use is actually a sub-register def. We don't have enough
// information to figure out if it is really legal.
- if (MO.getSubReg() || MII->isExtractSubreg() ||
- MII->isInsertSubreg() || MII->isSubregToReg())
+ if (MO.getSubReg() || MII->isSubregToReg())
return false;
const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI);
@@ -571,7 +571,7 @@ bool StackSlotColoring::PropagateForward(MachineBasicBlock::iterator MII,
// Abort if the use is actually a sub-register use. We don't have enough
// information to figure out if it is really legal.
- if (MO.getSubReg() || MII->isExtractSubreg()) + if (MO.getSubReg()) return false; const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI); @@ -610,8 +610,8 @@ StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI, DEBUG(MI->dump()); ++NumLoadElim; } else { - TII->copyRegToReg(*MBB, MI, DstReg, Reg, RC, RC, - MI->getDebugLoc()); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), + DstReg).addReg(Reg); ++NumRegRepl; } @@ -627,8 +627,8 @@ StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI, DEBUG(MI->dump()); ++NumStoreElim; } else { - TII->copyRegToReg(*MBB, MI, Reg, SrcReg, RC, RC, - MI->getDebugLoc()); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(SrcReg); ++NumRegRepl; } diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp index 142398c..59315cf 100644 --- a/lib/CodeGen/StrongPHIElimination.cpp +++ b/lib/CodeGen/StrongPHIElimination.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterCoalescer.h" @@ -695,9 +696,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, // Insert copy from curr.second to a temporary at // the Phi defining curr.second MachineBasicBlock::iterator PI = MRI.getVRegDef(curr.second); - TII->copyRegToReg(*PI->getParent(), PI, t, - curr.second, RC, RC, DebugLoc()); - + BuildMI(*PI->getParent(), PI, DebugLoc(), TII->get(TargetOpcode::COPY), + t).addReg(curr.second); DEBUG(dbgs() << "Inserted copy from " << curr.second << " to " << t << "\n"); @@ -712,8 +712,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, } // Insert copy from map[curr.first] to curr.second - TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), curr.second, - map[curr.first], RC, RC, DebugLoc()); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), curr.second).addReg(map[curr.first]); map[curr.first] = curr.second; DEBUG(dbgs() << "Inserted copy from " << curr.first << " to " << curr.second << "\n"); @@ -761,8 +761,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, // Insert a copy from dest to a new temporary t at the end of b unsigned t = MF->getRegInfo().createVirtualRegister(RC); - TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), t, - curr.second, RC, RC, DebugLoc()); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), t).addReg(curr.second); map[curr.second] = t; MachineBasicBlock::iterator TI = MBB->getFirstTerminator(); @@ -830,9 +830,6 @@ void StrongPHIElimination::InsertCopies(MachineDomTreeNode* MDTN, LiveInterval& Int = LI.getInterval(I->getOperand(i).getReg()); VNInfo* FirstVN = *Int.vni_begin(); FirstVN->setHasPHIKill(false); - if (I->getOperand(i).isKill()) - FirstVN->addKill(LI.getInstructionIndex(I).getUseIndex()); - LiveRange LR (LI.getMBBStartIdx(I->getParent()), LI.getInstructionIndex(I).getUseIndex().getNextSlot(), FirstVN); @@ -959,9 +956,8 @@ bool StrongPHIElimination::runOnMachineFunction(MachineFunction &Fn) { } else { // Insert a last-minute copy if a conflict was detected. 
const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); - const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(I->first); - TII->copyRegToReg(*SI->second, SI->second->getFirstTerminator(), - I->first, SI->first, RC, RC, DebugLoc()); + BuildMI(*SI->second, SI->second->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), I->first).addReg(SI->first); LI.renumber(); diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index f2e2a76..075db80 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/Target/TargetInstrInfo.h" @@ -559,11 +560,9 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, } MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { - const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first); - TII->copyRegToReg(*PredBB, Loc, CopyInfos[i].first, - CopyInfos[i].second, RC,RC, DebugLoc()); - MachineInstr *CopyMI = prior(Loc); - Copies.push_back(CopyMI); + Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(), + TII->get(TargetOpcode::COPY), + CopyInfos[i].first).addReg(CopyInfos[i].second)); } NumInstrDups += TailBB->size() - 1; // subtract one for removed branch @@ -618,11 +617,10 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, } MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { - const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first); - TII->copyRegToReg(*PrevBB, Loc, CopyInfos[i].first, - CopyInfos[i].second, RC, RC, DebugLoc()); - MachineInstr *CopyMI = prior(Loc); - Copies.push_back(CopyMI); + Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(), + TII->get(TargetOpcode::COPY), + CopyInfos[i].first) + .addReg(CopyInfos[i].second)); } } else { // No PHIs to worry about, just splice the instructions over. diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 0ad6619..cdacb98 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/SmallVector.h" @@ -21,11 +22,34 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PostRAHazardRecognizer.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything +/// after it, replacing it with an unconditional branch to NewDest. +void +TargetInstrInfoImpl::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + + // Remove all the old successors of MBB from the CFG. 
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_begin());
+
+ // Remove all the dead instructions from the end of MBB.
+ MBB->erase(Tail, MBB->end());
+
+ // If NewDest doesn't immediately follow MBB, insert a branch to it.
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
+ InsertBranch(*MBB, NewDest, 0, SmallVector<MachineOperand, 0>(),
+ Tail->getDebugLoc());
+ MBB->addSuccessor(NewDest);
+}
+
// commuteInstruction - The default implementation of this method just exchanges
// the two operands returned by findCommutedOpIndices.
MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
@@ -136,17 +160,9 @@ void TargetInstrInfoImpl::reMaterialize(MachineBasicBlock &MBB,
unsigned DestReg,
unsigned SubIdx,
const MachineInstr *Orig,
- const TargetRegisterInfo *TRI) const {
+ const TargetRegisterInfo &TRI) const {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
- MachineOperand &MO = MI->getOperand(0);
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
- MO.setReg(DestReg);
- MO.setSubReg(SubIdx);
- } else if (SubIdx) {
- MO.setReg(TRI->getSubReg(DestReg, SubIdx));
- } else {
- MO.setReg(DestReg);
- }
+ MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
MBB.insert(I, MI);
}
@@ -175,6 +191,47 @@ TargetInstrInfoImpl::GetFunctionSizeInBytes(const MachineFunction &MF) const {
return FnSize;
}
+// If the COPY instruction in MI can be folded to a stack operation, return
+// the register class to use.
+static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
+ unsigned FoldIdx) {
+ assert(MI->isCopy() && "MI must be a COPY instruction");
+ if (MI->getNumOperands() != 2)
+ return 0;
+ assert(FoldIdx<2 && "FoldIdx refers to a nonexistent operand");
+
+ const MachineOperand &FoldOp = MI->getOperand(FoldIdx);
+ const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx);
+
+ if (FoldOp.getSubReg() || LiveOp.getSubReg())
+ return 0;
+
+ unsigned FoldReg = FoldOp.getReg();
+ unsigned LiveReg = LiveOp.getReg();
+
+ assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
+ "Cannot fold physregs");
+
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
+
+ if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
+ return RC->contains(LiveOp.getReg()) ? RC : 0;
+
+ const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg);
+ if (RC == LiveRC || RC->hasSubClass(LiveRC))
+ return RC;
+
+ // FIXME: Allow folding when register classes are memory compatible.
+ return 0;
+}
+
+bool TargetInstrInfoImpl::
+canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]);
+}
+
/// foldMemoryOperand - Attempt to fold a load or store of the specified stack
/// slot into the specified machine instruction for the specified operand(s).
/// If this is possible, a new instruction is returned with the specified
/// operand folded, otherwise NULL is returned. The client is responsible for
/// removing the old instruction and adding the new one in the instruction
/// stream.
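/// For a plain COPY the fold can degenerate into a direct stack operation
/// (see the fallback at the end of this function); an illustrative sketch:
///   %vreg1 = COPY %vreg2   with Ops = {0}  ==>  store %vreg2 to the slot
///   %vreg1 = COPY %vreg2   with Ops = {1}  ==>  load %vreg1 from the slot
/// i.e. folding the def side yields a spill, folding the use side a reload.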
MachineInstr* -TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, - MachineInstr* MI, +TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { + int FI) const { unsigned Flags = 0; for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (MI->getOperand(Ops[i]).isDef()) @@ -193,34 +249,56 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, else Flags |= MachineMemOperand::MOLoad; + MachineBasicBlock *MBB = MI->getParent(); + assert(MBB && "foldMemoryOperand needs an inserted instruction"); + MachineFunction &MF = *MBB->getParent(); + // Ask the target to do the actual folding. - MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); - if (!NewMI) return 0; + if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) { + // Add a memory operand, foldMemoryOperandImpl doesn't do that. + assert((!(Flags & MachineMemOperand::MOStore) || + NewMI->getDesc().mayStore()) && + "Folded a def to a non-store!"); + assert((!(Flags & MachineMemOperand::MOLoad) || + NewMI->getDesc().mayLoad()) && + "Folded a use to a non-load!"); + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + assert(MFI.getObjectOffset(FI) != -1); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI), + Flags, /*Offset=*/0, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + NewMI->addMemOperand(MF, MMO); - assert((!(Flags & MachineMemOperand::MOStore) || - NewMI->getDesc().mayStore()) && - "Folded a def to a non-store!"); - assert((!(Flags & MachineMemOperand::MOLoad) || - NewMI->getDesc().mayLoad()) && - "Folded a use to a non-load!"); - const MachineFrameInfo &MFI = *MF.getFrameInfo(); - assert(MFI.getObjectOffset(FrameIndex) != -1); - MachineMemOperand *MMO = - MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIndex), - Flags, /*Offset=*/0, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); - NewMI->addMemOperand(MF, MMO); + // FIXME: change foldMemoryOperandImpl semantics to also insert NewMI. + return MBB->insert(MI, NewMI); + } - return NewMI; + // Straight COPY may fold as load/store. + if (!MI->isCopy() || Ops.size() != 1) + return 0; + + const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]); + if (!RC) + return 0; + + const MachineOperand &MO = MI->getOperand(1-Ops[0]); + MachineBasicBlock::iterator Pos = MI; + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + + if (Flags == MachineMemOperand::MOStore) + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI); + else + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI); + return --Pos; } /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. MachineInstr* -TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, - MachineInstr* MI, +TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, const SmallVectorImpl<unsigned> &Ops, MachineInstr* LoadMI) const { assert(LoadMI->getDesc().canFoldAsLoad() && "LoadMI isn't foldable!"); @@ -228,11 +306,15 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, for (unsigned i = 0, e = Ops.size(); i != e; ++i) assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!"); #endif + MachineBasicBlock &MBB = *MI->getParent(); + MachineFunction &MF = *MBB.getParent(); // Ask the target to do the actual folding. 
MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI); if (!NewMI) return 0; + NewMI = MBB.insert(MI, NewMI); + // Copy the memoperands from the load to the folded instruction. NewMI->setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end()); @@ -240,11 +322,9 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, return NewMI; } -bool -TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr * - MI, - AliasAnalysis * - AA) const { +bool TargetInstrInfo:: +isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, + AliasAnalysis *AA) const { const MachineFunction &MF = *MI->getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetMachine &TM = MF.getTarget(); @@ -324,3 +404,31 @@ TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr * // Everything checked out. return true; } + +/// isSchedulingBoundary - Test if the given instruction should be +/// considered a scheduling boundary. This primarily includes labels +/// and terminators. +bool TargetInstrInfoImpl::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const{ + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + const TargetLowering &TLI = *MF.getTarget().getTargetLowering(); + if (MI->definesRegister(TLI.getStackPointerRegisterToSaveRestore())) + return true; + + return false; +} + +// Default implementation of CreateTargetPostRAHazardRecognizer. 
+ScheduleHazardRecognizer *TargetInstrInfoImpl:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { + return (ScheduleHazardRecognizer *)new PostRAHazardRecognizer(II); +} diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 71ad3fb..a80cfc4 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -825,32 +825,32 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, TargetLoweringObjectFile::Initialize(Ctx, TM); TextSection = getContext().getCOFFSection(".text", - MCSectionCOFF::IMAGE_SCN_CNT_CODE | - MCSectionCOFF::IMAGE_SCN_MEM_EXECUTE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_CODE | + COFF::IMAGE_SCN_MEM_EXECUTE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getText()); DataSection = getContext().getCOFFSection(".data", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); ReadOnlySection = getContext().getCOFFSection(".rdata", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); StaticCtorSection = getContext().getCOFFSection(".ctors", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); StaticDtorSection = getContext().getCOFFSection(".dtors", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); // FIXME: We're emitting LSDA info into a readonly section on COFF, even @@ -859,76 +859,76 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, // adjusted or this should be a data section. LSDASection = getContext().getCOFFSection(".gcc_except_table", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); EHFrameSection = getContext().getCOFFSection(".eh_frame", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); // Debug info. 
DwarfAbbrevSection = getContext().getCOFFSection(".debug_abbrev", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfInfoSection = getContext().getCOFFSection(".debug_info", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLineSection = getContext().getCOFFSection(".debug_line", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfFrameSection = getContext().getCOFFSection(".debug_frame", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfPubNamesSection = getContext().getCOFFSection(".debug_pubnames", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfPubTypesSection = getContext().getCOFFSection(".debug_pubtypes", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfStrSection = getContext().getCOFFSection(".debug_str", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLocSection = getContext().getCOFFSection(".debug_loc", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfARangesSection = getContext().getCOFFSection(".debug_aranges", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfRangesSection = getContext().getCOFFSection(".debug_ranges", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfMacroInfoSection = getContext().getCOFFSection(".debug_macinfo", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DrectveSection = getContext().getCOFFSection(".drectve", - MCSectionCOFF::IMAGE_SCN_LNK_INFO, + COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()); } @@ -936,27 +936,27 @@ static unsigned getCOFFSectionFlags(SectionKind K) { unsigned Flags = 0; - if (!K.isMetadata()) + if (K.isMetadata()) Flags |= - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE; + COFF::IMAGE_SCN_MEM_DISCARDABLE; else if (K.isText()) Flags |= - MCSectionCOFF::IMAGE_SCN_MEM_EXECUTE | - MCSectionCOFF::IMAGE_SCN_CNT_CODE; + COFF::IMAGE_SCN_MEM_EXECUTE | + COFF::IMAGE_SCN_CNT_CODE; else if (K.isBSS ()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE; + COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE; else if (K.isReadOnly()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - 
MCSectionCOFF::IMAGE_SCN_MEM_READ; + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ; else if (K.isWriteable()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE; + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE; return Flags; } @@ -995,10 +995,10 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, unsigned Characteristics = getCOFFSectionFlags(Kind); - Characteristics |= MCSectionCOFF::IMAGE_SCN_LNK_COMDAT; + Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; return getContext().getCOFFSection(Name.str(), Characteristics, - MCSectionCOFF::IMAGE_COMDAT_SELECT_EXACT_MATCH, Kind); + COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH, Kind); } if (Kind.isText()) diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d10dc1..5649143 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -381,7 +382,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, DstReg = 0; unsigned SrcSubIdx, DstSubIdx; if (!TII->isMoveInstr(MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (MI.isExtractSubreg()) { + if (MI.isCopy()) { DstReg = MI.getOperand(0).getReg(); SrcReg = MI.getOperand(1).getReg(); } else if (MI.isInsertSubreg()) { @@ -897,6 +898,108 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, } } } + + // If this is an instruction with a load folded into it, try unfolding + // the load, e.g. avoid this: + // movq %rdx, %rcx + // addq (%rax), %rcx + // in favor of this: + // movq (%rax), %rcx + // addq %rdx, %rcx + // because it's preferable to schedule a load than a register copy. + if (TID.mayLoad() && !regBKilled) { + // Determine if a load can be unfolded. + unsigned LoadRegIndex; + unsigned NewOpc = + TII->getOpcodeAfterMemoryUnfold(mi->getOpcode(), + /*UnfoldLoad=*/true, + /*UnfoldStore=*/false, + &LoadRegIndex); + if (NewOpc != 0) { + const TargetInstrDesc &UnfoldTID = TII->get(NewOpc); + if (UnfoldTID.getNumDefs() == 1) { + MachineFunction &MF = *mbbi->getParent(); + + // Unfold the load. + DEBUG(dbgs() << "2addr: UNFOLDING: " << *mi); + const TargetRegisterClass *RC = + UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI); + unsigned Reg = MRI->createVirtualRegister(RC); + SmallVector<MachineInstr *, 2> NewMIs; + if (!TII->unfoldMemoryOperand(MF, mi, Reg, + /*UnfoldLoad=*/true,/*UnfoldStore=*/false, + NewMIs)) { + DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); + return false; + } + assert(NewMIs.size() == 2 && + "Unfolded a load into multiple instructions!"); + // The load was previously folded, so this is the only use. + NewMIs[1]->addRegisterKilled(Reg, TRI); + + // Tentatively insert the instructions into the block so that they + // look "normal" to the transformation logic. + mbbi->insert(mi, NewMIs[0]); + mbbi->insert(mi, NewMIs[1]); + + DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0] + << "2addr: NEW INST: " << *NewMIs[1]); + + // Transform the instruction, now that it no longer has a load. 
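// A condensed sketch, not part of the patch, of the commit-or-roll-back
// idiom the unfolding code above and below follows. tryTransform() is a
// hypothetical stand-in for the recursive TryInstructionTransform call
// that appears in the next hunk.
static bool unfoldAndRetry(const TargetInstrInfo *TII, MachineFunction &MF,
                           MachineBasicBlock *MBB, MachineInstr *MI,
                           unsigned Reg) {
  SmallVector<MachineInstr*, 2> NewMIs;
  if (!TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad=*/true,
                                /*UnfoldStore=*/false, NewMIs))
    return false;                  // Target declined; nothing has changed.
  MBB->insert(MI, NewMIs[0]);      // The hoisted load.
  MBB->insert(MI, NewMIs[1]);      // The operation, now register-only.
  if (tryTransform(NewMIs[1])) {   // Hypothetical recursive attempt.
    MI->eraseFromParent();         // Commit: keep the unfolded pair.
    return true;
  }
  NewMIs[0]->eraseFromParent();    // Roll back: keep the original.
  NewMIs[1]->eraseFromParent();
  return false;
}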
+ unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA); + unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB); + MachineBasicBlock::iterator NewMI = NewMIs[1]; + bool TransformSuccess = + TryInstructionTransform(NewMI, mi, mbbi, + NewSrcIdx, NewDstIdx, Dist); + if (TransformSuccess || + NewMIs[1]->getOperand(NewSrcIdx).isKill()) { + // Success, or at least we made an improvement. Keep the unfolded + // instructions and discard the original. + if (LV) { + for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { + MachineOperand &MO = mi->getOperand(i); + if (MO.isReg() && MO.getReg() != 0 && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isUse()) { + if (MO.isKill()) { + if (NewMIs[0]->killsRegister(MO.getReg())) + LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[0]); + else { + assert(NewMIs[1]->killsRegister(MO.getReg()) && + "Kill missing after load unfold!"); + LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[1]); + } + } + } else if (LV->removeVirtualRegisterDead(MO.getReg(), mi)) { + if (NewMIs[1]->registerDefIsDead(MO.getReg())) + LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]); + else { + assert(NewMIs[0]->registerDefIsDead(MO.getReg()) && + "Dead flag missing after load unfold!"); + LV->addVirtualRegisterDead(MO.getReg(), NewMIs[0]); + } + } + } + } + LV->addVirtualRegisterKilled(Reg, NewMIs[1]); + } + mi->eraseFromParent(); + mi = NewMIs[1]; + if (TransformSuccess) + return true; + } else { + // Transforming didn't eliminate the tie and didn't lead to an + // improvement. Clean up the unfolded instructions and keep the + // original. + DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); + NewMIs[0]->eraseFromParent(); + NewMIs[1]->eraseFromParent(); + } + } + } + } + return false; } @@ -1047,14 +1150,12 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){ DEBUG(dbgs() << "2addr: REMATTING : " << *DefMI << "\n"); unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg(); - TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, TRI); + TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, *TRI); ReMatRegs.set(regB); ++NumReMats; } else { - bool Emitted = TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc, - mi->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Unable to issue a copy instruction!\n"); + BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY), + regA).addReg(regB); } MachineBasicBlock::iterator prevMI = prior(mi); @@ -1104,12 +1205,30 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } } } - + + // Schedule the source copy / remat inserted to form two-address + // instruction. FIXME: Does it matter the distance map may not be + // accurate after it's scheduled? + TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + MadeChange = true; DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); } + // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. + if (mi->isInsertSubreg()) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->RemoveOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->RemoveOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); + } + // Clear TiedOperands here instead of at the top of the loop // since most instructions do not have tied operands. 
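// The INSERT_SUBREG lowering just above is pure operand surgery; a sketch,
// not part of the patch, of the same rewrite in isolation:
//   Before: %reg = INSERT_SUBREG %reg, %subreg, subidx   (four operands)
//   After:  %reg:subidx = COPY %subreg                   (two operands)
static void lowerInsertSubreg(MachineInstr *MI, const TargetInstrInfo *TII) {
  unsigned SubIdx = MI->getOperand(3).getImm();
  MI->RemoveOperand(3);                        // Drop the index operand.
  MI->getOperand(0).setSubReg(SubIdx);         // Fold it into the def.
  MI->RemoveOperand(1);                        // Drop the tied full-reg use.
  MI->setDesc(TII->get(TargetOpcode::COPY));   // Retag as a plain COPY.
}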
TiedOperands.clear(); @@ -1136,14 +1255,13 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { static void UpdateRegSequenceSrcs(unsigned SrcReg, unsigned DstReg, unsigned SubIdx, - MachineRegisterInfo *MRI) { + MachineRegisterInfo *MRI, + const TargetRegisterInfo &TRI) { for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), RE = MRI->reg_end(); RI != RE; ) { MachineOperand &MO = RI.getOperand(); ++RI; - MO.setReg(DstReg); - assert(MO.getSubReg() == 0); - MO.setSubReg(SubIdx); + MO.substVirtReg(DstReg, SubIdx, TRI); } } @@ -1165,55 +1283,102 @@ TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, if (!Seen.insert(SrcReg)) continue; - // If there are no other uses than extract_subreg which feed into + // Check that the instructions are all in the same basic block. + MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg); + MachineInstr *DstDefMI = MRI->getVRegDef(DstReg); + if (SrcDefMI->getParent() != DstDefMI->getParent()) + continue; + + // If there are no other uses than copies which feed into // the reg_sequence, then we might be able to coalesce them. bool CanCoalesce = true; - SmallVector<unsigned, 4> SubIndices; + SmallVector<unsigned, 4> SrcSubIndices, DstSubIndices; for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineInstr *UseMI = &*UI; - if (!UseMI->isExtractSubreg() || - UseMI->getOperand(0).getReg() != DstReg) { + if (!UseMI->isCopy() || UseMI->getOperand(0).getReg() != DstReg) { CanCoalesce = false; break; } - SubIndices.push_back(UseMI->getOperand(2).getImm()); + SrcSubIndices.push_back(UseMI->getOperand(1).getSubReg()); + DstSubIndices.push_back(UseMI->getOperand(0).getSubReg()); } - if (!CanCoalesce || SubIndices.size() < 2) + if (!CanCoalesce || SrcSubIndices.size() < 2) continue; - std::sort(SubIndices.begin(), SubIndices.end()); - unsigned NewSubIdx = 0; - if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices, - NewSubIdx)) { - bool Proceed = true; - if (NewSubIdx) - for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), - RE = MRI->reg_end(); RI != RE; ) { - MachineOperand &MO = RI.getOperand(); - ++RI; - // FIXME: If the sub-registers do not combine to the whole - // super-register, i.e. NewSubIdx != 0, and any of the use has a - // sub-register index, then abort the coalescing attempt. - if (MO.getSubReg()) { - Proceed = false; - break; - } - MO.setReg(DstReg); - MO.setSubReg(NewSubIdx); - } - if (Proceed) - for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), - RE = MRI->reg_end(); RI != RE; ) { - MachineOperand &MO = RI.getOperand(); - ++RI; - MO.setReg(DstReg); - if (NewSubIdx) - MO.setSubReg(NewSubIdx); - } + // Check that the source subregisters can be combined. + std::sort(SrcSubIndices.begin(), SrcSubIndices.end()); + unsigned NewSrcSubIdx = 0; + if (!TRI->canCombineSubRegIndices(MRI->getRegClass(SrcReg), SrcSubIndices, + NewSrcSubIdx)) + continue; + + // Check that the destination subregisters can also be combined. + std::sort(DstSubIndices.begin(), DstSubIndices.end()); + unsigned NewDstSubIdx = 0; + if (!TRI->canCombineSubRegIndices(MRI->getRegClass(DstReg), DstSubIndices, + NewDstSubIdx)) + continue; + + // If neither source nor destination can be combined to the full register, + // just give up. This could be improved if it ever matters. 
+ if (NewSrcSubIdx != 0 && NewDstSubIdx != 0) + continue; + + // Now that we know that all the uses are copies and that those + // subregs can somehow be combined, scan all the copies again to + // make sure the subregs are in the right order and can be composed. + MachineInstr *SomeMI = 0; + CanCoalesce = true; + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineInstr *UseMI = &*UI; + assert(UseMI->isCopy()); + unsigned DstSubIdx = UseMI->getOperand(0).getSubReg(); + unsigned SrcSubIdx = UseMI->getOperand(1).getSubReg(); + assert(DstSubIdx != 0 && "missing subreg from RegSequence elimination"); + if ((NewDstSubIdx == 0 && + TRI->composeSubRegIndices(NewSrcSubIdx, DstSubIdx) != SrcSubIdx) || + (NewSrcSubIdx == 0 && + TRI->composeSubRegIndices(NewDstSubIdx, SrcSubIdx) != DstSubIdx)) { + CanCoalesce = false; + break; + } + // Keep track of one of the uses. + SomeMI = UseMI; + } + if (!CanCoalesce) + continue; + + // Insert a copy to replace the original. + MachineBasicBlock::iterator InsertLoc = SomeMI; + MachineInstr *CopyMI = BuildMI(*SomeMI->getParent(), SomeMI, + SomeMI->getDebugLoc(), + TII->get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, NewDstSubIdx) + .addReg(SrcReg, 0, NewSrcSubIdx); + + // Remove all the old copy instructions. + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), + UE = MRI->use_nodbg_end(); UI != UE; ) { + MachineInstr *UseMI = &*UI; + ++UI; + if (UseMI == CopyMI) + continue; + assert(UseMI->isCopy()); + // Move any kills to the new copy instruction. + if (UseMI->getOperand(1).isKill()) { + CopyMI->getOperand(1).setIsKill(); + if (LV) + // Update live variables + LV->replaceKillInstruction(SrcReg, UseMI, &*CopyMI); } + UseMI->eraseFromParent(); + } } } @@ -1268,15 +1433,13 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { } IsImpDef = false; - // Remember EXTRACT_SUBREG sources. These might be candidate for - // coalescing. - if (DefMI->isExtractSubreg()) + // Remember COPY sources. These might be candidates for coalescing. + if (DefMI->isCopy() && DefMI->getOperand(1).getSubReg()) RealSrcs.push_back(DefMI->getOperand(1).getReg()); - if (!Seen.insert(SrcReg) || - MI->getParent() != DefMI->getParent() || - !MI->getOperand(i).isKill() || - HasOtherRegSequenceUses(SrcReg, MI, MRI)) { + bool isKill = MI->getOperand(i).isKill(); + if (!Seen.insert(SrcReg) || MI->getParent() != DefMI->getParent() || + !isKill || HasOtherRegSequenceUses(SrcReg, MI, MRI)) { // REG_SEQUENCE cannot have duplicated operands, add a copy. // Also add a copy if the source is live-in to the block. We don't want // to end up with a partial-redef of a livein, e.g. @@ -1292,30 +1455,23 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { // If the REG_SEQUENCE doesn't kill its source, keeping live variables // correctly up to date becomes very difficult. Insert a copy.
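// The composition test in the loop above is the heart of the new check:
// each copy's subregister index must be recoverable by composing the
// combined index from the other side. In isolation (a sketch, not part of
// the patch, with the semantics the hunk relies on):
static bool copyComposes(const TargetRegisterInfo *TRI,
                         unsigned NewSrcSub, unsigned NewDstSub,
                         unsigned SrcSub, unsigned DstSub) {
  if (NewDstSub == 0)   // Coalescing onto the full destination register.
    return TRI->composeSubRegIndices(NewSrcSub, DstSub) == SrcSub;
  // Otherwise NewSrcSub == 0: coalescing onto the full source register.
  return TRI->composeSubRegIndices(NewDstSub, SrcSub) == DstSub;
}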
// - const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); - unsigned NewReg = MRI->createVirtualRegister(RC); MachineBasicBlock::iterator InsertLoc = MI; - bool Emitted = - TII->copyRegToReg(*MI->getParent(), InsertLoc, NewReg, SrcReg, RC, RC, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Unable to issue a copy instruction!\n"); - MI->getOperand(i).setReg(NewReg); - if (MI->getOperand(i).isKill()) { - MachineBasicBlock::iterator CopyMI = prior(InsertLoc); - MachineOperand *KillMO = CopyMI->findRegisterUseOperand(SrcReg); - KillMO->setIsKill(); - if (LV) - // Update live variables - LV->replaceKillInstruction(SrcReg, MI, &*CopyMI); - } + MachineInstr *CopyMI = BuildMI(*MI->getParent(), InsertLoc, + MI->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, MI->getOperand(i+1).getImm()) + .addReg(SrcReg, getKillRegState(isKill)); + MI->getOperand(i).setReg(0); + if (LV && isKill) + LV->replaceKillInstruction(SrcReg, MI, CopyMI); + DEBUG(dbgs() << "Inserted: " << *CopyMI); } } for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) { unsigned SrcReg = MI->getOperand(i).getReg(); + if (!SrcReg) continue; unsigned SubIdx = MI->getOperand(i+1).getImm(); - UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI); + UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI, *TRI); } if (IsImpDef) { @@ -1328,8 +1484,11 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { MI->eraseFromParent(); } - // Try coalescing some EXTRACT_SUBREG instructions. - CoalesceExtSubRegs(RealSrcs, DstReg); + // Try coalescing some EXTRACT_SUBREG instructions. This can create + // INSERT_SUBREG instructions that must have <undef> flags added by + // LiveIntervalAnalysis, so only run it when LiveVariables is available. + if (LV) + CoalesceExtSubRegs(RealSrcs, DstReg); } RegSequences.clear(); diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index 871d836..57a1500 100644 --- a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -667,8 +667,7 @@ static void ReMaterialize(MachineBasicBlock &MBB, assert(TID.getNumDefs() == 1 && "Don't know how to remat instructions that define > 1 values!"); #endif - TII->reMaterialize(MBB, MII, DestReg, - ReMatDefMI->getOperand(0).getSubReg(), ReMatDefMI, TRI); + TII->reMaterialize(MBB, MII, DestReg, 0, ReMatDefMI, *TRI); MachineInstr *NewMI = prior(MII); for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { MachineOperand &MO = NewMI->getOperand(i); @@ -769,7 +768,7 @@ void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB, I = PhysRegsAvailable.begin(), E = PhysRegsAvailable.end(); I != E; ++I) { unsigned Reg = I->first; - const TargetRegisterClass* RC = TRI->getPhysicalRegisterRegClass(Reg); + const TargetRegisterClass* RC = TRI->getMinimalPhysRegClass(Reg); // FIXME: A temporary workaround. We can't reuse available value if it's // not safe to move the def of the virtual register's class. e.g. // X86::RFP* register classes. Do not add it as a live-in. 
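// The register-class queries in this file change in two ways, sketched
// here (not part of the patch): getMinimalPhysRegClass(Reg) returns the
// smallest register class containing a physical register, and where only
// membership mattered the class lookup disappears entirely, as in the
// FindFreeRegister hunk that follows.
static bool isUsableKill(const TargetRegisterClass *RC, unsigned Kill) {
  // Replaces TRI->getPhysicalRegisterRegClass(Kill) == RC. Deliberately
  // weaker: any register belonging to RC qualifies, not only registers
  // whose looked-up class was exactly RC.
  return RC->contains(Kill);
}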
@@ -1022,7 +1021,7 @@ static unsigned FindFreeRegister(MachineBasicBlock::iterator MII, for (unsigned i = 0, e = Kills.size(); i != e; ++i) { unsigned Kill = Kills[i]; if (!Defs[Kill] && !Uses[Kill] && - TRI->getPhysicalRegisterRegClass(Kill) == RC) + RC->contains(Kill)) return Kill; } for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { @@ -1410,25 +1409,25 @@ OptimizeByUnfold(MachineBasicBlock::iterator &MII, if (TII->unfoldMemoryOperand(MF, &MI, UnfoldVR, false, false, NewMIs)) { assert(NewMIs.size() == 1); MachineInstr *NewMI = NewMIs.back(); + MBB->insert(MII, NewMI); NewMIs.clear(); int Idx = NewMI->findRegisterUseOperandIdx(VirtReg, false); assert(Idx != -1); SmallVector<unsigned, 1> Ops; Ops.push_back(Idx); - MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, NewMI, Ops, SS); + MachineInstr *FoldedMI = TII->foldMemoryOperand(NewMI, Ops, SS); + NewMI->eraseFromParent(); if (FoldedMI) { VRM->addSpillSlotUse(SS, FoldedMI); if (!VRM->hasPhys(UnfoldVR)) VRM->assignVirt2Phys(UnfoldVR, UnfoldPR); VRM->virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef); - MII = MBB->insert(MII, FoldedMI); + MII = FoldedMI; InvalidateKills(MI, TRI, RegKills, KillOps); VRM->RemoveMachineInstrFromMaps(&MI); MBB->erase(&MI); - MF.DeleteMachineInstr(NewMI); return true; } - MF.DeleteMachineInstr(NewMI); } } @@ -1480,7 +1479,6 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, if (MII == MBB->begin() || !MII->killsRegister(SrcReg)) return false; - MachineFunction &MF = *MBB->getParent(); MachineInstr &MI = *MII; MachineBasicBlock::iterator DefMII = prior(MII); MachineInstr *DefMI = DefMII; @@ -1511,11 +1509,12 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, MachineInstr *CommutedMI = TII->commuteInstruction(DefMI, true); if (!CommutedMI) return false; + MBB->insert(MII, CommutedMI); SmallVector<unsigned, 1> Ops; Ops.push_back(NewDstIdx); - MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, CommutedMI, Ops, SS); + MachineInstr *FoldedMI = TII->foldMemoryOperand(CommutedMI, Ops, SS); // Not needed since foldMemoryOperand returns new MI. - MF.DeleteMachineInstr(CommutedMI); + CommutedMI->eraseFromParent(); if (!FoldedMI) return false; @@ -1528,7 +1527,7 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, MachineInstr *StoreMI = MII; VRM->addSpillSlotUse(SS, StoreMI); VRM->virtFolded(VirtReg, StoreMI, VirtRegMap::isMod); - MII = MBB->insert(MII, FoldedMI); // Update MII to backtrack. + MII = FoldedMI; // Update MII to backtrack. // Delete all 3 old instructions. InvalidateKills(*ReloadMI, TRI, RegKills, KillOps); @@ -1704,7 +1703,7 @@ bool LocalRewriter::InsertEmergencySpills(MachineInstr *MI) { std::vector<unsigned> &EmSpills = VRM->getEmergencySpills(MI); for (unsigned i = 0, e = EmSpills.size(); i != e; ++i) { unsigned PhysReg = EmSpills[i]; - const TargetRegisterClass *RC = TRI->getPhysicalRegisterRegClass(PhysReg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); assert(RC && "Unable to determine register class!"); int SS = VRM->getEmergencySpillSlot(RC); if (UsedSS.count(SS)) @@ -1759,7 +1758,6 @@ bool LocalRewriter::InsertRestores(MachineInstr *MI, bool DoReMat = VRM->isReMaterialized(VirtReg); int SSorRMId = DoReMat ? 
VRM->getReMatId(VirtReg) : VRM->getStackSlot(VirtReg); - const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId); if (InReg == Phys) { // If the value is already available in the expected register, save @@ -1793,20 +1791,16 @@ bool LocalRewriter::InsertRestores(MachineInstr *MI, MachineBasicBlock::iterator InsertLoc = ComputeReloadLoc(MII, MBB->begin(), Phys, TRI, DoReMat, SSorRMId, TII, *MBB->getParent()); - - TII->copyRegToReg(*MBB, InsertLoc, Phys, InReg, RC, RC, - MI->getDebugLoc()); + MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), Phys) + .addReg(InReg, RegState::Kill); // This invalidates Phys. Spills.ClobberPhysReg(Phys); // Remember it's available. Spills.addAvailable(SSorRMId, Phys); - // Mark is killed. - MachineInstr *CopyMI = prior(InsertLoc); CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse); - MachineOperand *KillOpnd = CopyMI->findRegisterUseOperand(InReg); - KillOpnd->setIsKill(); UpdateKills(*CopyMI, TRI, RegKills, KillOps); DEBUG(dbgs() << '\t' << *CopyMI); @@ -2013,7 +2007,7 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // = EXTRACT_SUBREG fi#1 // fi#1 is available in EDI, but it cannot be reused because it's not in // the right register file. - if (PhysReg && !AvoidReload && (SubIdx || MI.isExtractSubreg())) { + if (PhysReg && !AvoidReload && SubIdx) { const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); if (!RC->contains(PhysReg)) PhysReg = 0; @@ -2034,6 +2028,18 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, CanReuse = !ReusedOperands.isClobbered(PhysReg) && Spills.canClobberPhysReg(PhysReg); } + // If this is an asm, and PhysReg is used elsewhere as an earlyclobber + // operand, we can't also use it as an input. (Outputs always come + // before inputs, so we can stop looking at i.) + if (MI.isInlineAsm()) { + for (unsigned k=0; k<i; ++k) { + MachineOperand &MOk = MI.getOperand(k); + if (MOk.isReg() && MOk.getReg()==PhysReg && MOk.isEarlyClobber()) { + CanReuse = false; + break; + } + } + } if (CanReuse) { // If this stack slot value is already available, reuse it! @@ -2104,6 +2110,8 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // To avoid this problem, and to avoid doing a load right after a store, // we emit a copy from PhysReg into the designated register for this // operand. + // + // This case also applies to an earlyclobber'd PhysReg. 
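// Why an earlyclobber operand blocks reuse, illustrated at the C++ level
// (a hedged example, not part of the patch; x86-64 assumed):
static long earlyClobberDemo(long in) {
  long out;
  // "=&r" marks 'out' as earlyclobber: the asm writes it before the last
  // read of 'in', so 'in' (or a reload of it) must not share its register.
  asm("movq %1, %0\n\taddq %1, %0" : "=&r"(out) : "r"(in));
  return out;   // out == 2 * in
}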
unsigned DesignatedReg = VRM->getPhys(VirtReg); assert(DesignatedReg && "Must map virtreg to physreg!"); @@ -2136,7 +2144,6 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, continue; } - const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); MRI->setPhysRegUsed(DesignatedReg); ReusedOperands.markClobbered(DesignatedReg); @@ -2144,11 +2151,9 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, MachineBasicBlock::iterator InsertLoc = ComputeReloadLoc(&MI, MBB->begin(), PhysReg, TRI, DoReMat, SSorRMId, TII, MF); - - TII->copyRegToReg(*MBB, InsertLoc, DesignatedReg, PhysReg, RC, RC, - MI.getDebugLoc()); - - MachineInstr *CopyMI = prior(InsertLoc); + MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI.getDebugLoc(), + TII->get(TargetOpcode::COPY), + DesignatedReg).addReg(PhysReg); CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse); UpdateKills(*CopyMI, TRI, RegKills, KillOps); @@ -2269,27 +2274,16 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, if (unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SS)) { DEBUG(dbgs() << "Promoted Load To Copy: " << MI); if (DestReg != InReg) { - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); - TII->copyRegToReg(*MBB, &MI, DestReg, InReg, RC, RC, - MI.getDebugLoc()); MachineOperand *DefMO = MI.findRegisterDefOperand(DestReg); - unsigned SubIdx = DefMO->getSubReg(); + MachineInstr *CopyMI = BuildMI(*MBB, &MI, MI.getDebugLoc(), + TII->get(TargetOpcode::COPY)) + .addReg(DestReg, RegState::Define, DefMO->getSubReg()) + .addReg(InReg, RegState::Kill); // Revisit the copy so we make sure to notice the effects of the // operation on the destreg (either needing to RA it if it's // virtual or needing to clobber any values if it's physical). - NextMII = &MI; - --NextMII; // backtrack to the copy. + NextMII = CopyMI; NextMII->setAsmPrinterFlag(MachineInstr::ReloadReuse); - // Propagate the sub-register index over. - if (SubIdx) { - DefMO = NextMII->findRegisterDefOperand(DestReg); - DefMO->setSubReg(SubIdx); - } - - // Mark is killed. - MachineOperand *KillOpnd = NextMII->findRegisterUseOperand(InReg); - KillOpnd->setIsKill(); - BackTracked = true; } else { DEBUG(dbgs() << "Removing now-noop copy: " << MI); @@ -2430,6 +2424,24 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // Also check if it's copying from an "undef", if so, we can't // eliminate this or else the undef marker is lost and it will // confuses the scavenger. This is extremely rare. + if (MI.isIdentityCopy() && !MI.getOperand(1).isUndef() && + MI.getNumOperands() == 2) { + ++NumDCE; + DEBUG(dbgs() << "Removing now-noop copy: " << MI); + SmallVector<unsigned, 2> KillRegs; + InvalidateKills(MI, TRI, RegKills, KillOps, &KillRegs); + if (MO.isDead() && !KillRegs.empty()) { + // Source register or an implicit super/sub-register use is killed. + assert(TRI->regsOverlap(KillRegs[0], MI.getOperand(0).getReg())); + // Last def is now dead. + TransferDeadness(MI.getOperand(1).getReg(), RegKills, KillOps); + } + VRM->RemoveMachineInstrFromMaps(&MI); + MBB->erase(&MI); + Erased = true; + Spills.disallowClobberPhysReg(VirtReg); + goto ProcessNextInst; + } unsigned Src, Dst, SrcSR, DstSR; if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && Src == Dst && SrcSR == DstSR && @@ -2519,6 +2531,16 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // Check to see if this is a noop copy. If so, eliminate the // instruction before considering the dest reg to be changed. 
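// isIdentityCopy() gates the two new dead-copy eliminations (in the hunk
// above and in the store path that follows). A sketch of the predicate as
// these uses assume it; the real query lives on MachineInstr:
static bool isNoopCopy(const MachineInstr &MI) {
  return MI.isCopy() &&
         MI.getOperand(0).getReg()    == MI.getOperand(1).getReg() &&
         MI.getOperand(0).getSubReg() == MI.getOperand(1).getSubReg();
}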
+ if (MI.isIdentityCopy()) { + ++NumDCE; + DEBUG(dbgs() << "Removing now-noop copy: " << MI); + InvalidateKills(MI, TRI, RegKills, KillOps); + VRM->RemoveMachineInstrFromMaps(&MI); + MBB->erase(&MI); + Erased = true; + UpdateKills(*LastStore, TRI, RegKills, KillOps); + goto ProcessNextInst; + } { unsigned Src, Dst, SrcSR, DstSR; if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && diff --git a/lib/CompilerDriver/Tool.cpp b/lib/CompilerDriver/Tool.cpp index 5e558ca..c8488b2 100644 --- a/lib/CompilerDriver/Tool.cpp +++ b/lib/CompilerDriver/Tool.cpp @@ -85,7 +85,8 @@ StrVector Tool::SortArgs(ArgsVector& Args) const { StrVector Out; // HACK: this won't be needed when we'll migrate away from CommandLine. - std::stable_sort(Args.begin(), Args.end(), &CompareFirst<unsigned, std::string>); + std::stable_sort(Args.begin(), Args.end(), + &CompareFirst<unsigned, std::string>); for (ArgsVector::iterator B = Args.begin(), E = Args.end(); B != E; ++B) { Out.push_back(B->second); } diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index 0748b54..59ebe6e 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -591,7 +591,7 @@ void Interpreter::popStackAndReturnValueToCaller(const Type *RetTy, ECStack.pop_back(); if (ECStack.empty()) { // Finished main. Put result into exit code... - if (RetTy && RetTy->isIntegerTy()) { // Nonvoid return type? + if (RetTy && !RetTy->isVoidTy()) { // Nonvoid return type? ExitValue = Result; // Capture the exit value of the program } else { memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped)); diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 26a53b5..57d1260 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -266,7 +266,7 @@ GenericValue Interpreter::callExternalFunction(Function *F, RawFn = (RawFunc)(intptr_t) sys::DynamicLibrary::SearchForAddressOfSymbol(F->getName()); if (!RawFn) - RawFn = (RawFunc)(intptr_t)getPointerToGlobalIfAvailable(F); + RawFn = (RawFunc)(intptr_t)getPointerToGlobalIfAvailable(F); if (RawFn != 0) RawFunctions->insert(std::make_pair(F, RawFn)); // Cache for later } else { diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index 546d2b2..67bd3ed 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -626,10 +626,7 @@ void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) { void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) { assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!"); - // JIT the function - isAlreadyCodeGenerating = true; - jitstate->getPM(locked).run(*F); - isAlreadyCodeGenerating = false; + jitTheFunction(F, locked); // If the function referred to another function that had not yet been // read from bitcode, and we are jitting non-lazily, emit it now. @@ -640,10 +637,7 @@ void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) { assert(!PF->hasAvailableExternallyLinkage() && "Externally-defined function should not be in pending list."); - // JIT the function - isAlreadyCodeGenerating = true; - jitstate->getPM(locked).run(*PF); - isAlreadyCodeGenerating = false; + jitTheFunction(PF, locked); // Now that the function has been jitted, ask the JITEmitter to rewrite // the stub with real address of the function. 
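// Lifecycle of the new basic-block address support, pieced together from
// the hunks below (a summary sketch, not part of the patch): the emitter
// records an address for every address-taken block as it is emitted, the
// out-of-memory retry path clears stale entries, and lookups force
// compilation of the parent function first. resolveBlockAddress() is a
// hypothetical wrapper:
static void *resolveBlockAddress(JIT &TheJIT, BasicBlock *BB) {
  // Compiles BB's parent if needed, then consults the map; asserts if the
  // block was optimized away.
  return TheJIT.getPointerToBasicBlock(BB);
}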
@@ -651,6 +645,15 @@ void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) { } } +void JIT::jitTheFunction(Function *F, const MutexGuard &locked) { + isAlreadyCodeGenerating = true; + jitstate->getPM(locked).run(*F); + isAlreadyCodeGenerating = false; + + // clear basic block addresses after this function is done + getBasicBlockAddressMap(locked).clear(); +} + /// getPointerToFunction - This method is used to get the address of the /// specified function, compiling it if necessary. /// @@ -687,6 +690,41 @@ void *JIT::getPointerToFunction(Function *F) { return Addr; } +void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) { + MutexGuard locked(lock); + + BasicBlockAddressMapTy::iterator I = + getBasicBlockAddressMap(locked).find(BB); + if (I == getBasicBlockAddressMap(locked).end()) { + getBasicBlockAddressMap(locked)[BB] = Addr; + } else { + // ignore repeats: some BBs can be split into a few MBBs? + } +} + +void JIT::clearPointerToBasicBlock(const BasicBlock *BB) { + MutexGuard locked(lock); + getBasicBlockAddressMap(locked).erase(BB); +} + +void *JIT::getPointerToBasicBlock(BasicBlock *BB) { + // make sure its function is compiled by the JIT + (void)getPointerToFunction(BB->getParent()); + + // resolve basic block address + MutexGuard locked(lock); + + BasicBlockAddressMapTy::iterator I = + getBasicBlockAddressMap(locked).find(BB); + if (I != getBasicBlockAddressMap(locked).end()) { + return I->second; + } else { + assert(0 && "JIT does not have BB address for address-of-label, was" + " it eliminated by optimizer?"); + return 0; + } +} + /// getOrEmitGlobalVariable - Return the address of the specified global /// variable, possibly emitting it to memory if needed. This is used by the /// Emitter. diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index edae719..1d1763e 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -51,6 +51,10 @@ public: class JIT : public ExecutionEngine { + /// types + typedef ValueMap<const BasicBlock *, void *> + BasicBlockAddressMapTy; + /// data TargetMachine &TM; // The current target we are compiling to TargetJITInfo &TJI; // The JITInfo for the target we are compiling to JITCodeEmitter *JCE; // JCE object @@ -67,6 +71,12 @@ class JIT : public ExecutionEngine { JITState *jitstate; + /// BasicBlockAddressMap - A mapping between LLVM basic blocks and their + /// actualized versions, only filled for basic blocks that have their address + /// taken. + BasicBlockAddressMapTy BasicBlockAddressMap; + + JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool AllocateGVsWithCode); @@ -90,9 +100,9 @@ public: CodeGenOpt::Level OptLevel = CodeGenOpt::Default, bool GVsWithCode = true, - CodeModel::Model CMM = CodeModel::Default) { + CodeModel::Model CMM = CodeModel::Default) { return ExecutionEngine::createJIT(M, Err, JMM, OptLevel, GVsWithCode, - CMM); + CMM); } virtual void addModule(Module *M); @@ -127,10 +137,15 @@ public: /// void *getPointerToFunction(Function *F); - void *getPointerToBasicBlock(BasicBlock *BB) { - assert(0 && "JIT does not support address-of-label yet!"); - return 0; - } + /// addPointerToBasicBlock - Adds the address of the specified basic block. + void addPointerToBasicBlock(const BasicBlock *BB, void *Addr); + + /// clearPointerToBasicBlock - Removes the address of the specified basic block.
+ void clearPointerToBasicBlock(const BasicBlock *BB); + + /// getPointerToBasicBlock - This returns the address of the specified basic + /// block, assuming function is compiled. + void *getPointerToBasicBlock(BasicBlock *BB); /// getOrEmitGlobalVariable - Return the address of the specified global /// variable, possibly emitting it to memory if needed. This is used by the @@ -197,11 +212,18 @@ public: const JITEvent_EmittedFunctionDetails &Details); void NotifyFreeingMachineCode(void *OldPtr); + BasicBlockAddressMapTy & + getBasicBlockAddressMap(const MutexGuard &) { + return BasicBlockAddressMap; + } + + private: static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM, TargetMachine &tm); void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked); void updateFunctionStub(Function *F); + void jitTheFunction(Function *F, const MutexGuard &locked); protected: diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index e3855b2..28d79da 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -435,6 +435,9 @@ namespace { if (MBBLocations.size() <= (unsigned)MBB->getNumber()) MBBLocations.resize((MBB->getNumber()+1)*2); MBBLocations[MBB->getNumber()] = getCurrentPCValue(); + if (MBB->hasAddressTaken()) + TheJIT->addPointerToBasicBlock(MBB->getBasicBlock(), + (void*)getCurrentPCValue()); DEBUG(dbgs() << "JIT: Emitting BB" << MBB->getNumber() << " at [" << (void*) getCurrentPCValue() << "]\n"); } @@ -442,7 +445,7 @@ namespace { virtual uintptr_t getConstantPoolEntryAddress(unsigned Entry) const; virtual uintptr_t getJumpTableEntryAddress(unsigned Entry) const; - virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const { + virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const{ assert(MBBLocations.size() > (unsigned)MBB->getNumber() && MBBLocations[MBB->getNumber()] && "MBB not emitted!"); return MBBLocations[MBB->getNumber()]; @@ -1310,6 +1313,11 @@ void JITEmitter::retryWithMoreMemory(MachineFunction &F) { deallocateMemForFunction(F.getFunction()); // Try again with at least twice as much free space. 
SizeEstimate = (uintptr_t)(2 * (BufferEnd - BufferBegin)); + + for (MachineFunction::iterator MBB = F.begin(), E = F.end(); MBB != E; ++MBB){ + if (MBB->hasAddressTaken()) + TheJIT->clearPointerToBasicBlock(MBB->getBasicBlock()); + } } /// deallocateMemForFunction - Deallocate all memory for the specified diff --git a/lib/Linker/LinkItems.cpp b/lib/Linker/LinkItems.cpp index 2c22550..1be2bec 100644 --- a/lib/Linker/LinkItems.cpp +++ b/lib/Linker/LinkItems.cpp @@ -160,27 +160,26 @@ bool Linker::LinkInFile(const sys::Path &File, bool &is_native) { // Check for a file of name "-", which means "read standard input" if (File.str() == "-") { std::auto_ptr<Module> M; - MemoryBuffer *Buffer = MemoryBuffer::getSTDIN(); - if (!Buffer->getBufferSize()) { - delete Buffer; - Error = "standard input is empty"; - } else { - M.reset(ParseBitcodeFile(Buffer, Context, &Error)); - delete Buffer; - if (M.get()) - if (!LinkInModule(M.get(), &Error)) - return false; + if (MemoryBuffer *Buffer = MemoryBuffer::getSTDIN(&Error)) { + if (!Buffer->getBufferSize()) { + delete Buffer; + Error = "standard input is empty"; + } else { + M.reset(ParseBitcodeFile(Buffer, Context, &Error)); + delete Buffer; + if (M.get()) + if (!LinkInModule(M.get(), &Error)) + return false; + } } return error("Cannot link stdin: " + Error); } - // Make sure we can at least read the file - if (!File.canRead()) + // Determine what variety of file it is. + std::string Magic; + if (!File.getMagicNumber(Magic, 64)) return error("Cannot find linker input '" + File.str() + "'"); - // If its an archive, try to link it in - std::string Magic; - File.getMagicNumber(Magic, 64); switch (sys::IdentifyFileType(Magic.c_str(), 64)) { default: llvm_unreachable("Bad file type identification"); case sys::Unknown_FileType: diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index 5e8a3b6..fc4f3c6 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_library(LLVMMC MCLoggingStreamer.cpp MCMachOStreamer.cpp MCNullStreamer.cpp + MCObjectStreamer.cpp MCObjectWriter.cpp MCSection.cpp MCSectionCOFF.cpp @@ -23,5 +24,7 @@ add_llvm_library(LLVMMC MCSymbol.cpp MCValue.cpp MachObjectWriter.cpp + WinCOFFStreamer.cpp + WinCOFFObjectWriter.cpp TargetAsmBackend.cpp ) diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 57b2bcc..e272b60 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -275,19 +275,20 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, case MCSA_Global: // .globl/.global OS << MAI.getGlobalDirective(); break; - case MCSA_Hidden: OS << ".hidden "; break; - case MCSA_IndirectSymbol: OS << ".indirect_symbol "; break; - case MCSA_Internal: OS << ".internal "; break; - case MCSA_LazyReference: OS << ".lazy_reference "; break; - case MCSA_Local: OS << ".local "; break; - case MCSA_NoDeadStrip: OS << ".no_dead_strip "; break; - case MCSA_PrivateExtern: OS << ".private_extern "; break; - case MCSA_Protected: OS << ".protected "; break; - case MCSA_Reference: OS << ".reference "; break; - case MCSA_Weak: OS << ".weak "; break; - case MCSA_WeakDefinition: OS << ".weak_definition "; break; + case MCSA_Hidden: OS << "\t.hidden\t"; break; + case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break; + case MCSA_Internal: OS << "\t.internal\t"; break; + case MCSA_LazyReference: OS << "\t.lazy_reference\t"; break; + case MCSA_Local: OS << "\t.local\t"; break; + case MCSA_NoDeadStrip: OS << "\t.no_dead_strip\t"; break; + case MCSA_PrivateExtern: OS << "\t.private_extern\t"; break; 
+ case MCSA_Protected: OS << "\t.protected\t"; break; + case MCSA_Reference: OS << "\t.reference\t"; break; + case MCSA_Weak: OS << "\t.weak\t"; break; + case MCSA_WeakDefinition: OS << "\t.weak_definition\t"; break; // .weak_reference case MCSA_WeakReference: OS << MAI.getWeakRefDirective(); break; + case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break; } OS << *Symbol; @@ -693,7 +694,6 @@ void MCAsmStreamer::EmitRawText(StringRef String) { } void MCAsmStreamer::Finish() { - OS.flush(); } MCStreamer *llvm::createAsmStreamer(MCContext &Context, diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 5936656..7d84554 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -308,24 +308,23 @@ static bool isScatteredFixupFullyResolved(const MCAssembler &Asm, return !B_Base && BaseSymbol == A_Base; } -bool MCAssembler::isSymbolLinkerVisible(const MCSymbolData *SD) const { +bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const { // Non-temporary labels should always be visible to the linker. - if (!SD->getSymbol().isTemporary()) + if (!Symbol.isTemporary()) return true; // Absolute temporary labels are never visible. - if (!SD->getFragment()) + if (!Symbol.isInSection()) return false; // Otherwise, check if the section requires symbols even for temporary labels. - return getBackend().doesSectionRequireSymbols( - SD->getFragment()->getParent()->getSection()); + return getBackend().doesSectionRequireSymbols(Symbol.getSection()); } const MCSymbolData *MCAssembler::getAtom(const MCAsmLayout &Layout, const MCSymbolData *SD) const { // Linker visible symbols define atoms. - if (isSymbolLinkerVisible(SD)) + if (isSymbolLinkerVisible(SD->getSymbol())) return SD; // Absolute and undefined symbols have no defining atom. @@ -685,12 +684,8 @@ void MCAssembler::Finish() { for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { // Create dummy fragments to eliminate any empty sections, this simplifies // layout. - if (it->getFragmentList().empty()) { - unsigned ValueSize = 1; - if (getBackend().isVirtualSection(it->getSection())) - ValueSize = 1; + if (it->getFragmentList().empty()) new MCFillFragment(0, 1, 0, it); - } it->setOrdinal(SectionIndex++); } @@ -759,7 +754,6 @@ void MCAssembler::Finish() { // Write the object file. Writer->WriteObject(*this, Layout); - OS.flush(); stats::ObjectBytes += OS.tell() - StartOffset; } diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 53ffc94..1137064 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -27,6 +27,10 @@ MCContext::MCContext(const MCAsmInfo &mai) : MAI(mai), NextUniqueID(0) { MachOUniquingMap = 0; ELFUniquingMap = 0; COFFUniquingMap = 0; + + SecureLogFile = getenv("AS_SECURE_LOG_FILE"); + SecureLog = 0; + SecureLogUsed = false; } MCContext::~MCContext() { @@ -37,6 +41,9 @@ MCContext::~MCContext() { delete (MachOUniqueMapTy*)MachOUniquingMap; delete (ELFUniqueMapTy*)ELFUniquingMap; delete (COFFUniqueMapTy*)COFFUniquingMap; + + // If the stream for the .secure_log_unique directive was created free it. 
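// The three fields initialized above support the Darwin-style
// .secure_log_unique directive: SecureLogFile caches the AS_SECURE_LOG_FILE
// environment variable and SecureLog is opened lazily on first use. A
// hypothetical sketch of that lazy open; the accessor name and the
// raw_fd_ostream construction are assumptions, not part of this diff:
raw_ostream *MCContext::getSecureLog() {
  if (!SecureLog && SecureLogFile) {
    std::string Err;
    SecureLog = new raw_fd_ostream(SecureLogFile, Err);   // Assumed ctor.
  }
  return (raw_ostream *)SecureLog;
}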
+ delete (raw_ostream*)SecureLog; } //===----------------------------------------------------------------------===// @@ -90,14 +97,14 @@ MCSymbol *MCContext::CreateDirectionalLocalSymbol(int64_t LocalLabelVal) { return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) + Twine(LocalLabelVal) + "\2" + - Twine(NextInstance(LocalLabelVal))); + Twine(NextInstance(LocalLabelVal))); } MCSymbol *MCContext::GetDirectionalLocalSymbol(int64_t LocalLabelVal, int bORf) { return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) + Twine(LocalLabelVal) + "\2" + - Twine(GetInstance(LocalLabelVal) + bORf)); + Twine(GetInstance(LocalLabelVal) + bORf)); } MCSymbol *MCContext::LookupSymbol(StringRef Name) const { diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index c000dd7..343f334 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -40,7 +40,7 @@ void MCExpr::print(raw_ostream &OS) const { const MCSymbol &Sym = SRE.getSymbol(); if (SRE.getKind() == MCSymbolRefExpr::VK_ARM_HI16 || - SRE.getKind() == MCSymbolRefExpr::VK_ARM_LO16) + SRE.getKind() == MCSymbolRefExpr::VK_ARM_LO16) OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); // Parenthesize names that start with $ so that they don't look like @@ -51,8 +51,8 @@ void MCExpr::print(raw_ostream &OS) const { OS << Sym; if (SRE.getKind() != MCSymbolRefExpr::VK_None && - SRE.getKind() != MCSymbolRefExpr::VK_ARM_HI16 && - SRE.getKind() != MCSymbolRefExpr::VK_ARM_LO16) + SRE.getKind() != MCSymbolRefExpr::VK_ARM_HI16 && + SRE.getKind() != MCSymbolRefExpr::VK_ARM_LO16) OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); return; diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 27e4e98..44bc267 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCMachOSymbolFlags.h" @@ -25,21 +26,13 @@ using namespace llvm; namespace { -class MCMachOStreamer : public MCStreamer { - -private: - MCAssembler Assembler; - MCSectionData *CurSectionData; - - /// Track the current atom for each section. - DenseMap<const MCSectionData*, MCSymbolData*> CurrentAtomMap; - +class MCMachOStreamer : public MCObjectStreamer { private: MCFragment *getCurrentFragment() const { - assert(CurSectionData && "No current section!"); + assert(getCurrentSectionData() && "No current section!"); - if (!CurSectionData->empty()) - return &CurSectionData->getFragmentList().back(); + if (!getCurrentSectionData()->empty()) + return &getCurrentSectionData()->getFragmentList().back(); return 0; } @@ -49,28 +42,17 @@ private: MCDataFragment *getOrCreateDataFragment() const { MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); if (!F) - F = createDataFragment(); + F = new MCDataFragment(getCurrentSectionData()); return F; } - /// Create a new data fragment in the current section. 
- MCDataFragment *createDataFragment() const { - MCDataFragment *DF = new MCDataFragment(CurSectionData); - DF->setAtom(CurrentAtomMap.lookup(CurSectionData)); - return DF; - } - void EmitInstToFragment(const MCInst &Inst); void EmitInstToData(const MCInst &Inst); public: MCMachOStreamer(MCContext &Context, TargetAsmBackend &TAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter) - : MCStreamer(Context), Assembler(Context, TAB, *_Emitter, _OS), - CurSectionData(0) {} - ~MCMachOStreamer() {} - - MCAssembler &getAssembler() { return Assembler; } + raw_ostream &OS, MCCodeEmitter *Emitter) + : MCObjectStreamer(Context, TAB, OS, Emitter) {} const MCExpr *AddValueSymbols(const MCExpr *Value) { switch (Value->getKind()) { @@ -86,7 +68,7 @@ public: } case MCExpr::SymbolRef: - Assembler.getOrCreateSymbolData( + getAssembler().getOrCreateSymbolData( cast<MCSymbolRefExpr>(Value)->getSymbol()); break; @@ -101,7 +83,6 @@ public: /// @name MCStreamer Interface /// @{ - virtual void SwitchSection(const MCSection *Section); virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); @@ -152,6 +133,7 @@ public: } virtual void EmitInstruction(const MCInst &Inst); + virtual void Finish(); /// @} @@ -159,38 +141,25 @@ public: } // end anonymous namespace. -void MCMachOStreamer::SwitchSection(const MCSection *Section) { - assert(Section && "Cannot switch to a null section!"); - - // If already in this section, then this is a noop. - if (Section == CurSection) return; - - CurSection = Section; - CurSectionData = &Assembler.getOrCreateSectionData(*Section); -} - void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); assert(CurSection && "Cannot emit before setting section!"); - MCSymbolData &SD = Assembler.getOrCreateSymbolData(*Symbol); + Symbol->setSection(*CurSection); - // Update the current atom map, if necessary. - bool MustCreateFragment = false; - if (Assembler.isSymbolLinkerVisible(&SD)) { - CurrentAtomMap[CurSectionData] = &SD; + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - // We have to create a new fragment, fragments cannot span atoms. - MustCreateFragment = true; - } + // We have to create a new fragment if this is an atom defining symbol, + // fragments cannot span atoms. + if (getAssembler().isSymbolLinkerVisible(SD.getSymbol())) + new MCDataFragment(getCurrentSectionData()); // FIXME: This is wasteful, we don't necessarily need to create a data // fragment. Instead, we should mark the symbol as pointing into the data // fragment if it exists, otherwise we should just queue the label and set its // fragment pointer when we emit the next fragment. - MCDataFragment *F = - MustCreateFragment ? createDataFragment() : getOrCreateDataFragment(); + MCDataFragment *F = getOrCreateDataFragment(); assert(!SD.getFragment() && "Unexpected fragment on symbol data!"); SD.setFragment(F); SD.setOffset(F->getContents().size()); @@ -203,14 +172,12 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { // FIXME: Cleanup this code, these bits should be emitted based on semantic // properties, not on the order of definition, etc. 
SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask); - - Symbol->setSection(*CurSection); } void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { switch (Flag) { case MCAF_SubsectionsViaSymbols: - Assembler.setSubsectionsViaSymbols(true); + getAssembler().setSubsectionsViaSymbols(true); return; } @@ -219,7 +186,7 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { // FIXME: Lift context changes into super class. - Assembler.getOrCreateSymbolData(*Symbol); + getAssembler().getOrCreateSymbolData(*Symbol); Symbol->setVariableValue(AddValueSymbols(Value)); } @@ -232,15 +199,15 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol, // important for matching the string table that 'as' generates. IndirectSymbolData ISD; ISD.Symbol = Symbol; - ISD.SectionData = CurSectionData; - Assembler.getIndirectSymbols().push_back(ISD); + ISD.SectionData = getCurrentSectionData(); + getAssembler().getIndirectSymbols().push_back(ISD); return; } // Adding a symbol attribute always introduces the symbol, note that an // important side effect of calling getOrCreateSymbolData here is to register // the symbol with the assembler. - MCSymbolData &SD = Assembler.getOrCreateSymbolData(*Symbol); + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); // The implementation of symbol attributes is designed to match 'as', but it // leaves much to desired. It doesn't really make sense to arbitrarily add and @@ -306,6 +273,10 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol, // it has to be in a coalesced section, but this isn't enforced. SD.setFlags(SD.getFlags() | SF_WeakDefinition); break; + + case MCSA_WeakDefAutoPrivate: + SD.setFlags(SD.getFlags() | SF_WeakDefinition | SF_WeakReference); + break; } } @@ -313,7 +284,8 @@ void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { // Encode the 'desc' value into the lowest implementation defined bits. assert(DescValue == (DescValue & SF_DescFlagsMask) && "Invalid .desc value!"); - Assembler.getOrCreateSymbolData(*Symbol).setFlags(DescValue&SF_DescFlagsMask); + getAssembler().getOrCreateSymbolData(*Symbol).setFlags( + DescValue & SF_DescFlagsMask); } void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, @@ -321,14 +293,14 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself. assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - MCSymbolData &SD = Assembler.getOrCreateSymbolData(*Symbol); + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); SD.setExternal(true); SD.setCommon(Size, ByteAlignment); } void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, unsigned Size, unsigned ByteAlignment) { - MCSectionData &SectData = Assembler.getOrCreateSectionData(*Section); + MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section); // The symbol may not be present, which only creates the section. if (!Symbol) @@ -338,7 +310,7 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - MCSymbolData &SD = Assembler.getOrCreateSymbolData(*Symbol); + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); // Emit an align fragment if necessary. 
if (ByteAlignment != 1) @@ -346,8 +318,6 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, MCFragment *F = new MCFillFragment(0, 0, Size, &SectData); SD.setFragment(F); - if (Assembler.isSymbolLinkerVisible(&SD)) - F->setAtom(&SD); Symbol->setSection(*Section); @@ -391,13 +361,12 @@ void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit) { if (MaxBytesToEmit == 0) MaxBytesToEmit = ByteAlignment; - MCFragment *F = new MCAlignFragment(ByteAlignment, Value, ValueSize, - MaxBytesToEmit, CurSectionData); - F->setAtom(CurrentAtomMap.lookup(CurSectionData)); + new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit, + getCurrentSectionData()); // Update the maximum alignment on the current section if necessary. - if (ByteAlignment > CurSectionData->getAlignment()) - CurSectionData->setAlignment(ByteAlignment); + if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); } void MCMachOStreamer::EmitCodeAlignment(unsigned ByteAlignment, @@ -405,24 +374,21 @@ void MCMachOStreamer::EmitCodeAlignment(unsigned ByteAlignment, if (MaxBytesToEmit == 0) MaxBytesToEmit = ByteAlignment; MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit, - CurSectionData); + getCurrentSectionData()); F->setEmitNops(true); - F->setAtom(CurrentAtomMap.lookup(CurSectionData)); // Update the maximum alignment on the current section if necessary. - if (ByteAlignment > CurSectionData->getAlignment()) - CurSectionData->setAlignment(ByteAlignment); + if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); } void MCMachOStreamer::EmitValueToOffset(const MCExpr *Offset, unsigned char Value) { - MCFragment *F = new MCOrgFragment(*Offset, Value, CurSectionData); - F->setAtom(CurrentAtomMap.lookup(CurSectionData)); + new MCOrgFragment(*Offset, Value, getCurrentSectionData()); } void MCMachOStreamer::EmitInstToFragment(const MCInst &Inst) { - MCInstFragment *IF = new MCInstFragment(Inst, CurSectionData); - IF->setAtom(CurrentAtomMap.lookup(CurSectionData)); + MCInstFragment *IF = new MCInstFragment(Inst, getCurrentSectionData()); // Add the fixups and data. // @@ -431,7 +397,7 @@ void MCMachOStreamer::EmitInstToFragment(const MCInst &Inst) { SmallVector<MCFixup, 4> Fixups; SmallString<256> Code; raw_svector_ostream VecOS(Code); - Assembler.getEmitter().EncodeInstruction(Inst, VecOS, Fixups); + getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups); VecOS.flush(); IF->getCode() = Code; @@ -444,7 +410,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) { SmallVector<MCFixup, 4> Fixups; SmallString<256> Code; raw_svector_ostream VecOS(Code); - Assembler.getEmitter().EncodeInstruction(Inst, VecOS, Fixups); + getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups); VecOS.flush(); // Add the fixups and data. @@ -461,21 +427,21 @@ void MCMachOStreamer::EmitInstruction(const MCInst &Inst) { if (Inst.getOperand(i).isExpr()) AddValueSymbols(Inst.getOperand(i).getExpr()); - CurSectionData->setHasInstructions(true); + getCurrentSectionData()->setHasInstructions(true); // If this instruction doesn't need relaxation, just emit it as data. - if (!Assembler.getBackend().MayNeedRelaxation(Inst)) { + if (!getAssembler().getBackend().MayNeedRelaxation(Inst)) { EmitInstToData(Inst); return; } // Otherwise, if we are relaxing everything, relax the instruction as much as // possible and emit it as data. 
- if (Assembler.getRelaxAll()) { + if (getAssembler().getRelaxAll()) { MCInst Relaxed; - Assembler.getBackend().RelaxInstruction(Inst, Relaxed); - while (Assembler.getBackend().MayNeedRelaxation(Relaxed)) - Assembler.getBackend().RelaxInstruction(Relaxed, Relaxed); + getAssembler().getBackend().RelaxInstruction(Inst, Relaxed); + while (getAssembler().getBackend().MayNeedRelaxation(Relaxed)) + getAssembler().getBackend().RelaxInstruction(Relaxed, Relaxed); EmitInstToData(Relaxed); return; } @@ -485,7 +451,36 @@ void MCMachOStreamer::EmitInstruction(const MCInst &Inst) { } void MCMachOStreamer::Finish() { - Assembler.Finish(); + // We have to set the fragment atom associations so we can relax properly for + // Mach-O. + + // First, scan the symbol table to build a lookup table from fragments to + // defining symbols. + DenseMap<const MCFragment*, MCSymbolData*> DefiningSymbolMap; + for (MCAssembler::symbol_iterator it = getAssembler().symbol_begin(), + ie = getAssembler().symbol_end(); it != ie; ++it) { + if (getAssembler().isSymbolLinkerVisible(it->getSymbol()) && + it->getFragment()) { + // An atom defining symbol should never be internal to a fragment. + assert(it->getOffset() == 0 && "Invalid offset in atom defining symbol!"); + DefiningSymbolMap[it->getFragment()] = it; + } + } + + // Set the fragment atom associations by tracking the last seen atom defining + // symbol. + for (MCAssembler::iterator it = getAssembler().begin(), + ie = getAssembler().end(); it != ie; ++it) { + MCSymbolData *CurrentAtom = 0; + for (MCSectionData::iterator it2 = it->begin(), + ie2 = it->end(); it2 != ie2; ++it2) { + if (MCSymbolData *SD = DefiningSymbolMap.lookup(it2)) + CurrentAtom = SD; + it2->setAtom(CurrentAtom); + } + } + + this->MCObjectStreamer::Finish(); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, TargetAsmBackend &TAB, diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp new file mode 100644 index 0000000..d3f7f77 --- /dev/null +++ b/lib/MC/MCObjectStreamer.cpp @@ -0,0 +1,39 @@ +//===- lib/MC/MCObjectStreamer.cpp - Object File MCStreamer Interface -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCObjectStreamer.h" + +#include "llvm/MC/MCAssembler.h" +using namespace llvm; + +MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB, + raw_ostream &_OS, MCCodeEmitter *_Emitter) + : MCStreamer(Context), Assembler(new MCAssembler(Context, TAB, + *_Emitter, _OS)), + CurSectionData(0) +{ +} + +MCObjectStreamer::~MCObjectStreamer() { + delete Assembler; +} + +void MCObjectStreamer::SwitchSection(const MCSection *Section) { + assert(Section && "Cannot switch to a null section!"); + + // If already in this section, then this is a noop. 
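// SwitchSection() (continued in the next hunk lines) and Finish() are the
// machinery MCObjectStreamer now centralizes for all object-file
// streamers. A sketch, not part of the patch, of a minimal subclass; the
// class name is hypothetical and the format-specific hooks are elided:
class MyObjectStreamer : public MCObjectStreamer {
public:
  MyObjectStreamer(MCContext &Ctx, TargetAsmBackend &TAB, raw_ostream &OS,
                   MCCodeEmitter *Emitter)
      : MCObjectStreamer(Ctx, TAB, OS, Emitter) {}
  // Section switching, assembler ownership, and Finish() are inherited;
  // only the format-specific emission methods remain to implement.
};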
+ if (Section == CurSection) return; + + CurSection = Section; + CurSectionData = &getAssembler().getOrCreateSectionData(*Section); +} + +void MCObjectStreamer::Finish() { + getAssembler().Finish(); +} diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index 1cbe09a..465d983 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -23,7 +23,6 @@ using namespace llvm; AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { CurBuf = NULL; CurPtr = NULL; - TokStart = 0; } AsmLexer::~AsmLexer() { @@ -40,10 +39,6 @@ void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { TokStart = 0; } -SMLoc AsmLexer::getLoc() const { - return SMLoc::getFromPointer(TokStart); -} - /// ReturnError - Set the error to the specified string at the specified /// location. This is defined to always return AsmToken::Error. AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { @@ -229,7 +224,7 @@ StringRef AsmLexer::LexUntilEndOfStatement() { TokStart = CurPtr; while (!isAtStartOfComment(*CurPtr) && // Start of line comment. - *CurPtr != ';' && // End of statement marker. + *CurPtr != ';' && // End of statement marker. *CurPtr != '\n' && *CurPtr != '\r' && (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 4523eab..793f3c7 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -18,34 +18,85 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetAsmParser.h" using namespace llvm; +namespace { + +/// \brief Generic implementations of directive handling, etc., which are +/// shared (or the default, at least) for all assembler parsers. +class GenericAsmParser : public MCAsmParserExtension { +public: + GenericAsmParser() {} + + virtual void Initialize(MCAsmParser &Parser) { + // Call the base implementation. + this->MCAsmParserExtension::Initialize(Parser); + + // Debugging directives.
+ Parser.AddDirectiveHandler(this, ".file", MCAsmParser::DirectiveHandler( + &GenericAsmParser::ParseDirectiveFile)); + Parser.AddDirectiveHandler(this, ".line", MCAsmParser::DirectiveHandler( + &GenericAsmParser::ParseDirectiveLine)); + Parser.AddDirectiveHandler(this, ".loc", MCAsmParser::DirectiveHandler( + &GenericAsmParser::ParseDirectiveLoc)); + } + + bool ParseDirectiveFile(StringRef, SMLoc DirectiveLoc); // ".file" + bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc); // ".line" + bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc); // ".loc" +}; + +} + +namespace llvm { + +extern MCAsmParserExtension *createDarwinAsmParser(); +extern MCAsmParserExtension *createELFAsmParser(); + +} enum { DEFAULT_ADDRSPACE = 0 }; -AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, - const MCAsmInfo &_MAI) - : Lexer(_MAI), Ctx(_Ctx), Out(_Out), SrcMgr(_SM), TargetParser(0), - CurBuffer(0) { +AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx, + MCStreamer &_Out, const MCAsmInfo &_MAI) + : Lexer(_MAI), Ctx(_Ctx), Out(_Out), SrcMgr(_SM), + GenericParser(new GenericAsmParser), PlatformParser(0), + TargetParser(0), CurBuffer(0) { Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); - - // Debugging directives. - AddDirectiveHandler(".file", &AsmParser::ParseDirectiveFile); - AddDirectiveHandler(".line", &AsmParser::ParseDirectiveLine); - AddDirectiveHandler(".loc", &AsmParser::ParseDirectiveLoc); -} + // Initialize the generic parser. + GenericParser->Initialize(*this); + // Initialize the platform / file format parser. + // + // FIXME: This is a hack, we need to (majorly) cleanup how these objects are + // created. + if (_MAI.hasSubsectionsViaSymbols()) { + PlatformParser = createDarwinAsmParser(); + PlatformParser->Initialize(*this); + } else { + PlatformParser = createELFAsmParser(); + PlatformParser->Initialize(*this); + } +} AsmParser::~AsmParser() { + delete PlatformParser; + delete GenericParser; +} + +void AsmParser::setTargetParser(TargetAsmParser &P) { + assert(!TargetParser && "Target parser is already initialized!"); + TargetParser = &P; + TargetParser->Initialize(*this); } void AsmParser::Warning(SMLoc L, const Twine &Msg) { @@ -57,11 +108,6 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg) { return true; } -bool AsmParser::TokError(const char *Msg) { - PrintMessage(Lexer.getLoc(), Msg, "error"); - return true; -} - void AsmParser::PrintMessage(SMLoc Loc, const std::string &Msg, const char *Type) const { SrcMgr.PrintMessage(Loc, Msg, Type); @@ -163,11 +209,6 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { return false; } -MCSymbol *AsmParser::CreateSymbol(StringRef Name) { - // FIXME: Inline into callers. - return Ctx.GetOrCreateSymbol(Name); -} - /// ParsePrimaryExpr - Parse a primary expression and return it. /// primaryexpr ::= (parenexpr /// primaryexpr ::= symbol @@ -188,7 +229,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { case AsmToken::Identifier: { // This is a symbol reference. std::pair<StringRef, StringRef> Split = getTok().getIdentifier().split('@'); - MCSymbol *Sym = CreateSymbol(Split.first); + MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first); // Mark the symbol as used in an expression. Sym->setUsedInExpr(true); @@ -454,8 +495,8 @@ bool AsmParser::ParseStatement() { IDVal = getTok().getString(); Lex(); // Consume the integer token to be used as an identifier token. 
if (Lexer.getKind() != AsmToken::Colon) { - if (!TheCondState.Ignore) - return TokError("unexpected token at start of statement"); + if (!TheCondState.Ignore) + return TokError("unexpected token at start of statement"); } } } @@ -498,7 +539,7 @@ bool AsmParser::ParseStatement() { // implicitly marked as external. MCSymbol *Sym; if (LocalLabelVal == -1) - Sym = CreateSymbol(IDVal); + Sym = getContext().GetOrCreateSymbol(IDVal); else Sym = Ctx.CreateDirectionalLocalSymbol(LocalLabelVal); if (!Sym->isUndefined() || Sym->isVariable()) @@ -530,158 +571,6 @@ bool AsmParser::ParseStatement() { // Otherwise, we have a normal instruction or directive. if (IDVal[0] == '.') { - // FIXME: This should be driven based on a hash lookup and callback. - if (IDVal == ".section") - return ParseDirectiveDarwinSection(); - if (IDVal == ".text") - // FIXME: This changes behavior based on the -static flag to the - // assembler. - return ParseDirectiveSectionSwitch("__TEXT", "__text", - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS); - if (IDVal == ".const") - return ParseDirectiveSectionSwitch("__TEXT", "__const"); - if (IDVal == ".static_const") - return ParseDirectiveSectionSwitch("__TEXT", "__static_const"); - if (IDVal == ".cstring") - return ParseDirectiveSectionSwitch("__TEXT","__cstring", - MCSectionMachO::S_CSTRING_LITERALS); - if (IDVal == ".literal4") - return ParseDirectiveSectionSwitch("__TEXT", "__literal4", - MCSectionMachO::S_4BYTE_LITERALS, - 4); - if (IDVal == ".literal8") - return ParseDirectiveSectionSwitch("__TEXT", "__literal8", - MCSectionMachO::S_8BYTE_LITERALS, - 8); - if (IDVal == ".literal16") - return ParseDirectiveSectionSwitch("__TEXT","__literal16", - MCSectionMachO::S_16BYTE_LITERALS, - 16); - if (IDVal == ".constructor") - return ParseDirectiveSectionSwitch("__TEXT","__constructor"); - if (IDVal == ".destructor") - return ParseDirectiveSectionSwitch("__TEXT","__destructor"); - if (IDVal == ".fvmlib_init0") - return ParseDirectiveSectionSwitch("__TEXT","__fvmlib_init0"); - if (IDVal == ".fvmlib_init1") - return ParseDirectiveSectionSwitch("__TEXT","__fvmlib_init1"); - - // FIXME: The assembler manual claims that this has the self modify code - // flag, at least on x86-32, but that does not appear to be correct. - if (IDVal == ".symbol_stub") - return ParseDirectiveSectionSwitch("__TEXT","__symbol_stub", - MCSectionMachO::S_SYMBOL_STUBS | - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - // FIXME: Different on PPC and ARM. - 0, 16); - // FIXME: PowerPC only? - if (IDVal == ".picsymbol_stub") - return ParseDirectiveSectionSwitch("__TEXT","__picsymbol_stub", - MCSectionMachO::S_SYMBOL_STUBS | - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - 0, 26); - if (IDVal == ".data") - return ParseDirectiveSectionSwitch("__DATA", "__data"); - if (IDVal == ".static_data") - return ParseDirectiveSectionSwitch("__DATA", "__static_data"); - - // FIXME: The section names of these two are misspelled in the assembler - // manual. 
- if (IDVal == ".non_lazy_symbol_pointer") - return ParseDirectiveSectionSwitch("__DATA", "__nl_symbol_ptr", - MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, - 4); - if (IDVal == ".lazy_symbol_pointer") - return ParseDirectiveSectionSwitch("__DATA", "__la_symbol_ptr", - MCSectionMachO::S_LAZY_SYMBOL_POINTERS, - 4); - - if (IDVal == ".dyld") - return ParseDirectiveSectionSwitch("__DATA", "__dyld"); - if (IDVal == ".mod_init_func") - return ParseDirectiveSectionSwitch("__DATA", "__mod_init_func", - MCSectionMachO::S_MOD_INIT_FUNC_POINTERS, - 4); - if (IDVal == ".mod_term_func") - return ParseDirectiveSectionSwitch("__DATA", "__mod_term_func", - MCSectionMachO::S_MOD_TERM_FUNC_POINTERS, - 4); - if (IDVal == ".const_data") - return ParseDirectiveSectionSwitch("__DATA", "__const"); - - - if (IDVal == ".objc_class") - return ParseDirectiveSectionSwitch("__OBJC", "__class", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_meta_class") - return ParseDirectiveSectionSwitch("__OBJC", "__meta_class", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_cat_cls_meth") - return ParseDirectiveSectionSwitch("__OBJC", "__cat_cls_meth", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_cat_inst_meth") - return ParseDirectiveSectionSwitch("__OBJC", "__cat_inst_meth", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_protocol") - return ParseDirectiveSectionSwitch("__OBJC", "__protocol", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_string_object") - return ParseDirectiveSectionSwitch("__OBJC", "__string_object", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_cls_meth") - return ParseDirectiveSectionSwitch("__OBJC", "__cls_meth", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_inst_meth") - return ParseDirectiveSectionSwitch("__OBJC", "__inst_meth", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_cls_refs") - return ParseDirectiveSectionSwitch("__OBJC", "__cls_refs", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP | - MCSectionMachO::S_LITERAL_POINTERS, - 4); - if (IDVal == ".objc_message_refs") - return ParseDirectiveSectionSwitch("__OBJC", "__message_refs", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP | - MCSectionMachO::S_LITERAL_POINTERS, - 4); - if (IDVal == ".objc_symbols") - return ParseDirectiveSectionSwitch("__OBJC", "__symbols", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_category") - return ParseDirectiveSectionSwitch("__OBJC", "__category", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_class_vars") - return ParseDirectiveSectionSwitch("__OBJC", "__class_vars", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_instance_vars") - return ParseDirectiveSectionSwitch("__OBJC", "__instance_vars", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_module_info") - return ParseDirectiveSectionSwitch("__OBJC", "__module_info", - MCSectionMachO::S_ATTR_NO_DEAD_STRIP); - if (IDVal == ".objc_class_names") - return ParseDirectiveSectionSwitch("__TEXT", "__cstring", - MCSectionMachO::S_CSTRING_LITERALS); - if (IDVal == ".objc_meth_var_types") - return ParseDirectiveSectionSwitch("__TEXT", "__cstring", - MCSectionMachO::S_CSTRING_LITERALS); - if (IDVal == ".objc_meth_var_names") - return ParseDirectiveSectionSwitch("__TEXT", "__cstring", - MCSectionMachO::S_CSTRING_LITERALS); - if (IDVal == ".objc_selector_strs") - return ParseDirectiveSectionSwitch("__OBJC", "__selector_strs", - MCSectionMachO::S_CSTRING_LITERALS); - - if (IDVal == ".tdata") - return 
ParseDirectiveSectionSwitch("__DATA", "__thread_data", - MCSectionMachO::S_THREAD_LOCAL_REGULAR); - if (IDVal == ".tlv") - return ParseDirectiveSectionSwitch("__DATA", "__thread_vars", - MCSectionMachO::S_THREAD_LOCAL_VARIABLES); - if (IDVal == ".thread_init_func") - return ParseDirectiveSectionSwitch("__DATA", "__thread_init", - MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS); - // Assembler features if (IDVal == ".set") return ParseDirectiveSet(); @@ -756,36 +645,25 @@ bool AsmParser::ParseStatement() { return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition); if (IDVal == ".weak_reference") return ParseDirectiveSymbolAttribute(MCSA_WeakReference); + if (IDVal == ".weak_def_can_be_hidden") + return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate); if (IDVal == ".comm") return ParseDirectiveComm(/*IsLocal=*/false); if (IDVal == ".lcomm") return ParseDirectiveComm(/*IsLocal=*/true); - if (IDVal == ".zerofill") - return ParseDirectiveDarwinZerofill(); - if (IDVal == ".desc") - return ParseDirectiveDarwinSymbolDesc(); - if (IDVal == ".lsym") - return ParseDirectiveDarwinLsym(); - if (IDVal == ".tbss") - return ParseDirectiveDarwinTBSS(); - - if (IDVal == ".subsections_via_symbols") - return ParseDirectiveDarwinSubsectionsViaSymbols(); + if (IDVal == ".abort") return ParseDirectiveAbort(); if (IDVal == ".include") return ParseDirectiveInclude(); - if (IDVal == ".dump") - return ParseDirectiveDarwinDumpOrLoad(IDLoc, /*IsDump=*/true); - if (IDVal == ".load") - return ParseDirectiveDarwinDumpOrLoad(IDLoc, /*IsLoad=*/false); - - // Look up the handler in the handler table, - bool(AsmParser::*Handler)(StringRef, SMLoc) = DirectiveMap[IDVal]; - if (Handler) - return (this->*Handler)(IDVal, IDLoc); - + + // Look up the handler in the handler table. + std::pair<MCAsmParserExtension*, DirectiveHandler> Handler = + DirectiveMap.lookup(IDVal); + if (Handler.first) + return (Handler.first->*Handler.second)(IDVal, IDLoc); + // Target hook for parsing target specific directives. if (!getTargetParser().ParseDirective(ID)) return false; @@ -839,7 +717,6 @@ bool AsmParser::ParseAssignment(const StringRef &Name) { SMLoc EqualLoc = Lexer.getLoc(); const MCExpr *Value; - SMLoc StartLoc = Lexer.getLoc(); if (ParseExpression(Value)) return true; @@ -867,7 +744,7 @@ bool AsmParser::ParseAssignment(const StringRef &Name) { return Error(EqualLoc, "invalid reassignment of non-absolute variable '" + Name + "'"); } else - Sym = CreateSymbol(Name); + Sym = getContext().GetOrCreateSymbol(Name); // FIXME: Handle '.'. @@ -902,90 +779,15 @@ bool AsmParser::ParseDirectiveSet() { if (ParseIdentifier(Name)) return TokError("expected identifier after '.set' directive"); - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.set'"); Lex(); return ParseAssignment(Name); } -/// ParseDirectiveSection: -/// ::= .section identifier (',' identifier)* -/// FIXME: This should actually parse out the segment, section, attributes and -/// sizeof_stub fields. -bool AsmParser::ParseDirectiveDarwinSection() { - SMLoc Loc = Lexer.getLoc(); - - StringRef SectionName; - if (ParseIdentifier(SectionName)) - return Error(Loc, "expected identifier after '.section' directive"); - - // Verify there is a following comma. - if (!Lexer.is(AsmToken::Comma)) - return TokError("unexpected token in '.section' directive"); - - std::string SectionSpec = SectionName; - SectionSpec += ","; - - // Add all the tokens until the end of the line, ParseSectionSpecifier will - // handle this. 
- StringRef EOL = Lexer.LexUntilEndOfStatement(); - SectionSpec.append(EOL.begin(), EOL.end()); - - Lex(); - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.section' directive"); - Lex(); - - - StringRef Segment, Section; - unsigned TAA, StubSize; - std::string ErrorStr = - MCSectionMachO::ParseSectionSpecifier(SectionSpec, Segment, Section, - TAA, StubSize); - - if (!ErrorStr.empty()) - return Error(Loc, ErrorStr.c_str()); - - // FIXME: Arch specific. - bool isText = Segment == "__TEXT"; // FIXME: Hack. - Out.SwitchSection(Ctx.getMachOSection(Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); - return false; -} - -/// ParseDirectiveSectionSwitch - -bool AsmParser::ParseDirectiveSectionSwitch(const char *Segment, - const char *Section, - unsigned TAA, unsigned Align, - unsigned StubSize) { - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in section switching directive"); - Lex(); - - // FIXME: Arch specific. - bool isText = StringRef(Segment) == "__TEXT"; // FIXME: Hack. - Out.SwitchSection(Ctx.getMachOSection(Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); - - // Set the implicit alignment, if any. - // - // FIXME: This isn't really what 'as' does; I think it just uses the implicit - // alignment on the section (e.g., if one manually inserts bytes into the - // section, then just issueing the section switch directive will not realign - // the section. However, this is arguably more reasonable behavior, and there - // is no good reason for someone to intentionally emit incorrectly sized - // values into the implicitly aligned sections. - if (Align) - Out.EmitValueToAlignment(Align, 0, 1, 0); - - return false; -} - bool AsmParser::ParseEscapedString(std::string &Data) { - assert(Lexer.is(AsmToken::String) && "Unexpected current token!"); + assert(getLexer().is(AsmToken::String) && "Unexpected current token!"); Data = ""; StringRef Str = getTok().getStringContents(); @@ -1045,25 +847,25 @@ bool AsmParser::ParseEscapedString(std::string &Data) { /// ParseDirectiveAscii: /// ::= ( .ascii | .asciz ) [ "string" ( , "string" )* ] bool AsmParser::ParseDirectiveAscii(bool ZeroTerminated) { - if (Lexer.isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { - if (Lexer.isNot(AsmToken::String)) + if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '.ascii' or '.asciz' directive"); - + std::string Data; if (ParseEscapedString(Data)) return true; - - Out.EmitBytes(Data, DEFAULT_ADDRSPACE); + + getStreamer().EmitBytes(Data, DEFAULT_ADDRSPACE); if (ZeroTerminated) - Out.EmitBytes(StringRef("\0", 1), DEFAULT_ADDRSPACE); - + getStreamer().EmitBytes(StringRef("\0", 1), DEFAULT_ADDRSPACE); + Lex(); - - if (Lexer.is(AsmToken::EndOfStatement)) + + if (getLexer().is(AsmToken::EndOfStatement)) break; - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.ascii' or '.asciz' directive"); Lex(); } @@ -1076,24 +878,24 @@ bool AsmParser::ParseDirectiveAscii(bool ZeroTerminated) { /// ParseDirectiveValue /// ::= (.byte | .short | ... 
) [ expression (, expression)* ] bool AsmParser::ParseDirectiveValue(unsigned Size) { - if (Lexer.isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; - SMLoc ATTRIBUTE_UNUSED StartLoc = Lexer.getLoc(); + SMLoc ATTRIBUTE_UNUSED StartLoc = getLexer().getLoc(); if (ParseExpression(Value)) return true; // Special case constant expressions to match code generator. if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) - Out.EmitIntValue(MCE->getValue(), Size, DEFAULT_ADDRSPACE); + getStreamer().EmitIntValue(MCE->getValue(), Size, DEFAULT_ADDRSPACE); else - Out.EmitValue(Value, Size, DEFAULT_ADDRSPACE); + getStreamer().EmitValue(Value, Size, DEFAULT_ADDRSPACE); - if (Lexer.is(AsmToken::EndOfStatement)) + if (getLexer().is(AsmToken::EndOfStatement)) break; // FIXME: Improve diagnostic. - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); } @@ -1111,18 +913,15 @@ bool AsmParser::ParseDirectiveSpace() { return true; int64_t FillExpr = 0; - bool HasFillExpr = false; - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.space' directive"); Lex(); if (ParseAbsoluteExpression(FillExpr)) return true; - HasFillExpr = true; - - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.space' directive"); } @@ -1132,7 +931,7 @@ bool AsmParser::ParseDirectiveSpace() { return TokError("invalid number of bytes in '.space' directive"); // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0. - Out.EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE); + getStreamer().EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE); return false; } @@ -1144,7 +943,7 @@ bool AsmParser::ParseDirectiveFill() { if (ParseAbsoluteExpression(NumValues)) return true; - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.fill' directive"); Lex(); @@ -1152,7 +951,7 @@ bool AsmParser::ParseDirectiveFill() { if (ParseAbsoluteExpression(FillSize)) return true; - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.fill' directive"); Lex(); @@ -1160,7 +959,7 @@ bool AsmParser::ParseDirectiveFill() { if (ParseAbsoluteExpression(FillExpr)) return true; - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.fill' directive"); Lex(); @@ -1169,7 +968,7 @@ bool AsmParser::ParseDirectiveFill() { return TokError("invalid '.fill' size, expected 1, 2, 4, or 8"); for (uint64_t i = 0, e = NumValues; i != e; ++i) - Out.EmitIntValue(FillExpr, FillSize, DEFAULT_ADDRSPACE); + getStreamer().EmitIntValue(FillExpr, FillSize, DEFAULT_ADDRSPACE); return false; } @@ -1178,21 +977,20 @@ bool AsmParser::ParseDirectiveFill() { /// ::= .org expression [ , expression ] bool AsmParser::ParseDirectiveOrg() { const MCExpr *Offset; - SMLoc StartLoc = Lexer.getLoc(); if (ParseExpression(Offset)) return true; // Parse optional fill expression. 
int64_t FillExpr = 0; - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.org' directive"); Lex(); if (ParseAbsoluteExpression(FillExpr)) return true; - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.org' directive"); } @@ -1200,7 +998,7 @@ bool AsmParser::ParseDirectiveOrg() { // FIXME: Only limited forms of relocatable expressions are accepted here, it // has to be relative to the current section. - Out.EmitValueToOffset(Offset, FillExpr); + getStreamer().EmitValueToOffset(Offset, FillExpr); return false; } @@ -1208,7 +1006,7 @@ bool AsmParser::ParseDirectiveOrg() { /// ParseDirectiveAlign /// ::= {.align, ...} expression [ , expression [ , expression ]] bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { - SMLoc AlignmentLoc = Lexer.getLoc(); + SMLoc AlignmentLoc = getLexer().getLoc(); int64_t Alignment; if (ParseAbsoluteExpression(Alignment)) return true; @@ -1217,30 +1015,30 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { bool HasFillExpr = false; int64_t FillExpr = 0; int64_t MaxBytesToFill = 0; - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); // The fill expression can be omitted while specifying a maximum number of // alignment bytes, e.g: // .align 3,,4 - if (Lexer.isNot(AsmToken::Comma)) { + if (getLexer().isNot(AsmToken::Comma)) { HasFillExpr = true; if (ParseAbsoluteExpression(FillExpr)) return true; } - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); - MaxBytesLoc = Lexer.getLoc(); + MaxBytesLoc = getLexer().getLoc(); if (ParseAbsoluteExpression(MaxBytesToFill)) return true; - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); } } @@ -1282,14 +1080,14 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { // FIXME: This should be using a target hook. bool UseCodeAlign = false; if (const MCSectionMachO *S = dyn_cast<MCSectionMachO>( - Out.getCurrentSection())) + getStreamer().getCurrentSection())) UseCodeAlign = S->hasAttribute(MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS); if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) && ValueSize == 1 && UseCodeAlign) { - Out.EmitCodeAlignment(Alignment, MaxBytesToFill); + getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill); } else { // FIXME: Target specific behavior about how the "extra" bytes are filled. - Out.EmitValueToAlignment(Alignment, FillExpr, ValueSize, MaxBytesToFill); + getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize, MaxBytesToFill); } return false; @@ -1298,21 +1096,21 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { /// ParseDirectiveSymbolAttribute /// ::= { ".globl", ".weak", ... 
} [ identifier ( , identifier )* ] bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) { - if (Lexer.isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { StringRef Name; if (ParseIdentifier(Name)) return TokError("expected identifier in directive"); - MCSymbol *Sym = CreateSymbol(Name); + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - Out.EmitSymbolAttribute(Sym, Attr); + getStreamer().EmitSymbolAttribute(Sym, Attr); - if (Lexer.is(AsmToken::EndOfStatement)) + if (getLexer().is(AsmToken::EndOfStatement)) break; - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); } @@ -1330,20 +1128,20 @@ bool AsmParser::ParseDirectiveELFType() { return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. - MCSymbol *Sym = CreateSymbol(Name); + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.type' directive"); Lex(); - if (Lexer.isNot(AsmToken::At)) + if (getLexer().isNot(AsmToken::At)) return TokError("expected '@' before type"); Lex(); StringRef Type; SMLoc TypeLoc; - TypeLoc = Lexer.getLoc(); + TypeLoc = getLexer().getLoc(); if (ParseIdentifier(Type)) return TokError("expected symbol type in directive"); @@ -1358,42 +1156,12 @@ bool AsmParser::ParseDirectiveELFType() { if (Attr == MCSA_Invalid) return Error(TypeLoc, "unsupported attribute in '.type' directive"); - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.type' directive"); Lex(); - Out.EmitSymbolAttribute(Sym, Attr); - - return false; -} - -/// ParseDirectiveDarwinSymbolDesc -/// ::= .desc identifier , expression -bool AsmParser::ParseDirectiveDarwinSymbolDesc() { - StringRef Name; - if (ParseIdentifier(Name)) - return TokError("expected identifier in directive"); - - // Handle the identifier as the key symbol. - MCSymbol *Sym = CreateSymbol(Name); - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in '.desc' directive"); - Lex(); - - SMLoc DescLoc = Lexer.getLoc(); - int64_t DescValue; - if (ParseAbsoluteExpression(DescValue)) - return true; - - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.desc' directive"); - - Lex(); - - // Set the n_desc field of this Symbol to this DescValue - Out.EmitSymbolDesc(Sym, DescValue); + getStreamer().EmitSymbolAttribute(Sym, Attr); return false; } @@ -1401,28 +1169,28 @@ bool AsmParser::ParseDirectiveDarwinSymbolDesc() { /// ParseDirectiveComm /// ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ] bool AsmParser::ParseDirectiveComm(bool IsLocal) { - SMLoc IDLoc = Lexer.getLoc(); + SMLoc IDLoc = getLexer().getLoc(); StringRef Name; if (ParseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. 
- MCSymbol *Sym = CreateSymbol(Name); + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - if (Lexer.isNot(AsmToken::Comma)) + if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); int64_t Size; - SMLoc SizeLoc = Lexer.getLoc(); + SMLoc SizeLoc = getLexer().getLoc(); if (ParseAbsoluteExpression(Size)) return true; int64_t Pow2Alignment = 0; SMLoc Pow2AlignmentLoc; - if (Lexer.is(AsmToken::Comma)) { + if (getLexer().is(AsmToken::Comma)) { Lex(); - Pow2AlignmentLoc = Lexer.getLoc(); + Pow2AlignmentLoc = getLexer().getLoc(); if (ParseAbsoluteExpression(Pow2Alignment)) return true; @@ -1434,7 +1202,7 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) { } } - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.comm' or '.lcomm' directive"); Lex(); @@ -1458,168 +1226,14 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) { // '.lcomm' is equivalent to '.zerofill'. // Create the Symbol as a common or local common with Size and Pow2Alignment if (IsLocal) { - Out.EmitZerofill(Ctx.getMachOSection("__DATA", "__bss", - MCSectionMachO::S_ZEROFILL, 0, - SectionKind::getBSS()), - Sym, Size, 1 << Pow2Alignment); + getStreamer().EmitZerofill(Ctx.getMachOSection( + "__DATA", "__bss", MCSectionMachO::S_ZEROFILL, + 0, SectionKind::getBSS()), + Sym, Size, 1 << Pow2Alignment); return false; } - Out.EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment); - return false; -} - -/// ParseDirectiveDarwinZerofill -/// ::= .zerofill segname , sectname [, identifier , size_expression [ -/// , align_expression ]] -bool AsmParser::ParseDirectiveDarwinZerofill() { - StringRef Segment; - if (ParseIdentifier(Segment)) - return TokError("expected segment name after '.zerofill' directive"); - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - StringRef Section; - if (ParseIdentifier(Section)) - return TokError("expected section name after comma in '.zerofill' " - "directive"); - - // If this is the end of the line all that was wanted was to create the - // the section but with no symbol. - if (Lexer.is(AsmToken::EndOfStatement)) { - // Create the zerofill section but no symbol - Out.EmitZerofill(Ctx.getMachOSection(Segment, Section, - MCSectionMachO::S_ZEROFILL, 0, - SectionKind::getBSS())); - return false; - } - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - SMLoc IDLoc = Lexer.getLoc(); - StringRef IDStr; - if (ParseIdentifier(IDStr)) - return TokError("expected identifier in directive"); - - // handle the identifier as the key symbol. - MCSymbol *Sym = CreateSymbol(IDStr); - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - int64_t Size; - SMLoc SizeLoc = Lexer.getLoc(); - if (ParseAbsoluteExpression(Size)) - return true; - - int64_t Pow2Alignment = 0; - SMLoc Pow2AlignmentLoc; - if (Lexer.is(AsmToken::Comma)) { - Lex(); - Pow2AlignmentLoc = Lexer.getLoc(); - if (ParseAbsoluteExpression(Pow2Alignment)) - return true; - } - - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.zerofill' directive"); - - Lex(); - - if (Size < 0) - return Error(SizeLoc, "invalid '.zerofill' directive size, can't be less " - "than zero"); - - // NOTE: The alignment in the directive is a power of 2 value, the assembler - // may internally end up wanting an alignment in bytes. - // FIXME: Diagnose overflow. 
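Every allocation directive in this region (.comm/.lcomm above, and the .zerofill/.tbss bodies being moved out to DarwinAsmParser.cpp) takes its alignment as a power-of-two exponent and converts it to bytes with 1 << Pow2Alignment; the recurring "FIXME: Diagnose overflow" notes that a large exponent silently shifts out of range. A small sketch of the conversion with that check added; alignFromPow2 is a hypothetical helper, not anything the patch defines:

    #include <cstdio>

    // Convert a power-of-2 alignment operand into a byte count, rejecting
    // negatives and exponents that would overflow a 32-bit value.
    static bool alignFromPow2(long long Pow2, unsigned &Bytes) {
      if (Pow2 < 0 || Pow2 > 30)
        return false;
      Bytes = 1u << (unsigned) Pow2;
      return true;
    }

    int main() {
      unsigned Bytes;
      if (alignFromPow2(3, Bytes))               // e.g. '.comm sym,16,3'
        std::printf("align %u bytes\n", Bytes);  // prints "align 8 bytes"
      return 0;
    }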
- if (Pow2Alignment < 0) - return Error(Pow2AlignmentLoc, "invalid '.zerofill' directive alignment, " - "can't be less than zero"); - - if (!Sym->isUndefined()) - return Error(IDLoc, "invalid symbol redefinition"); - - // Create the zerofill Symbol with Size and Pow2Alignment - // - // FIXME: Arch specific. - Out.EmitZerofill(Ctx.getMachOSection(Segment, Section, - MCSectionMachO::S_ZEROFILL, 0, - SectionKind::getBSS()), - Sym, Size, 1 << Pow2Alignment); - - return false; -} - -/// ParseDirectiveDarwinTBSS -/// ::= .tbss identifier, size, align -bool AsmParser::ParseDirectiveDarwinTBSS() { - SMLoc IDLoc = Lexer.getLoc(); - StringRef Name; - if (ParseIdentifier(Name)) - return TokError("expected identifier in directive"); - - // Handle the identifier as the key symbol. - MCSymbol *Sym = CreateSymbol(Name); - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - int64_t Size; - SMLoc SizeLoc = Lexer.getLoc(); - if (ParseAbsoluteExpression(Size)) - return true; - - int64_t Pow2Alignment = 0; - SMLoc Pow2AlignmentLoc; - if (Lexer.is(AsmToken::Comma)) { - Lex(); - Pow2AlignmentLoc = Lexer.getLoc(); - if (ParseAbsoluteExpression(Pow2Alignment)) - return true; - } - - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.tbss' directive"); - - Lex(); - - if (Size < 0) - return Error(SizeLoc, "invalid '.tbss' directive size, can't be less than" - "zero"); - - // FIXME: Diagnose overflow. - if (Pow2Alignment < 0) - return Error(Pow2AlignmentLoc, "invalid '.tbss' alignment, can't be less" - "than zero"); - - if (!Sym->isUndefined()) - return Error(IDLoc, "invalid symbol redefinition"); - - Out.EmitTBSSSymbol(Ctx.getMachOSection("__DATA", "__thread_bss", - MCSectionMachO::S_THREAD_LOCAL_ZEROFILL, - 0, SectionKind::getThreadBSS()), - Sym, Size, 1 << Pow2Alignment); - - return false; -} - -/// ParseDirectiveDarwinSubsectionsViaSymbols -/// ::= .subsections_via_symbols -bool AsmParser::ParseDirectiveDarwinSubsectionsViaSymbols() { - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.subsections_via_symbols' directive"); - - Lex(); - - Out.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - + getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment); return false; } @@ -1627,11 +1241,11 @@ bool AsmParser::ParseDirectiveDarwinSubsectionsViaSymbols() { /// ::= .abort [ "abort_string" ] bool AsmParser::ParseDirectiveAbort() { // FIXME: Use loc from directive. - SMLoc Loc = Lexer.getLoc(); + SMLoc Loc = getLexer().getLoc(); StringRef Str = ""; - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::String)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '.abort' directive"); Str = getTok().getString(); @@ -1639,7 +1253,7 @@ bool AsmParser::ParseDirectiveAbort() { Lex(); } - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.abort' directive"); Lex(); @@ -1653,48 +1267,17 @@ bool AsmParser::ParseDirectiveAbort() { return false; } -/// ParseDirectiveLsym -/// ::= .lsym identifier , expression -bool AsmParser::ParseDirectiveDarwinLsym() { - StringRef Name; - if (ParseIdentifier(Name)) - return TokError("expected identifier in directive"); - - // Handle the identifier as the key symbol. 
- MCSymbol *Sym = CreateSymbol(Name); - - if (Lexer.isNot(AsmToken::Comma)) - return TokError("unexpected token in '.lsym' directive"); - Lex(); - - const MCExpr *Value; - SMLoc StartLoc = Lexer.getLoc(); - if (ParseExpression(Value)) - return true; - - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.lsym' directive"); - - Lex(); - - // We don't currently support this directive. - // - // FIXME: Diagnostic location! - (void) Sym; - return TokError("directive '.lsym' is unsupported"); -} - /// ParseDirectiveInclude /// ::= .include "filename" bool AsmParser::ParseDirectiveInclude() { - if (Lexer.isNot(AsmToken::String)) + if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '.include' directive"); std::string Filename = getTok().getString(); - SMLoc IncludeLoc = Lexer.getLoc(); + SMLoc IncludeLoc = getLexer().getLoc(); Lex(); - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.include' directive"); // Strip the quotes. @@ -1712,29 +1295,6 @@ bool AsmParser::ParseDirectiveInclude() { return false; } -/// ParseDirectiveDarwinDumpOrLoad -/// ::= ( .dump | .load ) "filename" -bool AsmParser::ParseDirectiveDarwinDumpOrLoad(SMLoc IDLoc, bool IsDump) { - if (Lexer.isNot(AsmToken::String)) - return TokError("expected string in '.dump' or '.load' directive"); - - Lex(); - - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.dump' or '.load' directive"); - - Lex(); - - // FIXME: If/when .dump and .load are implemented they will be done in the - // the assembly parser and not have any need for an MCStreamer API. - if (IsDump) - Warning(IDLoc, "ignoring directive .dump for now"); - else - Warning(IDLoc, "ignoring directive .load for now"); - - return false; -} - /// ParseDirectiveIf /// ::= .if expression bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { @@ -1748,7 +1308,7 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { if (ParseAbsoluteExpression(ExprValue)) return true; - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.if' directive"); Lex(); @@ -1781,7 +1341,7 @@ bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) { if (ParseAbsoluteExpression(ExprValue)) return true; - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.elseif' directive"); Lex(); @@ -1795,7 +1355,7 @@ bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) { /// ParseDirectiveElse /// ::= .else bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) { - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.else' directive"); Lex(); @@ -1819,7 +1379,7 @@ bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) { /// ParseDirectiveEndIf /// ::= .endif bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) { - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.endif' directive"); Lex(); @@ -1838,40 +1398,40 @@ bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) { /// ParseDirectiveFile /// ::= .file [number] string -bool AsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) { +bool GenericAsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) { // FIXME: I'm not sure what this is. 
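The .if/.elseif/.else/.endif handlers in this hunk all pivot on TheCondState: its Ignore flag suppresses statement emission (see the TheCondState.Ignore test back in ParseStatement), and nesting implies a stack of saved states. The diff does not show the struct's full definition, so the following is only a guessed model with illustrative field names, not the actual layout:

    #include <cstdio>
    #include <vector>

    struct CondState {
      bool Active;    // is this branch emitting statements?
      bool EverTrue;  // has any branch of this .if chain been taken yet?
      bool ElseSeen;  // .else already consumed (a second one is an error)
    };

    int main() {
      std::vector<CondState> Stack;

      CondState S = { false, false, false };  // .if 0  -> not emitting
      Stack.push_back(S);

      if (!Stack.back().EverTrue) {           // .elseif 1 -> first true branch
        Stack.back().Active = true;
        Stack.back().EverTrue = true;
      }

      std::printf("emitting: %d\n", Stack.back().Active);  // prints 1

      Stack.pop_back();                       // .endif restores the outer state
      return 0;
    }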
int64_t FileNumber = -1; - if (Lexer.is(AsmToken::Integer)) { + if (getLexer().is(AsmToken::Integer)) { FileNumber = getTok().getIntVal(); Lex(); - + if (FileNumber < 1) return TokError("file number less than one"); } - if (Lexer.isNot(AsmToken::String)) + if (getLexer().isNot(AsmToken::String)) return TokError("unexpected token in '.file' directive"); - + StringRef Filename = getTok().getString(); Filename = Filename.substr(1, Filename.size()-2); Lex(); - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.file' directive"); if (FileNumber == -1) - Out.EmitFileDirective(Filename); + getStreamer().EmitFileDirective(Filename); else - Out.EmitDwarfFileDirective(FileNumber, Filename); - + getStreamer().EmitDwarfFileDirective(FileNumber, Filename); + return false; } /// ParseDirectiveLine /// ::= .line [number] -bool AsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Integer)) +bool GenericAsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.line' directive"); int64_t LineNumber = getTok().getIntVal(); @@ -1881,8 +1441,8 @@ bool AsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { // FIXME: Do something with the .line. } - if (Lexer.isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.file' directive"); + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.line' directive"); return false; } @@ -1890,8 +1450,8 @@ bool AsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { /// ParseDirectiveLoc /// ::= .loc number [number [number]] -bool AsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { - if (Lexer.isNot(AsmToken::Integer)) +bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { + if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.loc' directive"); // FIXME: What are these fields? @@ -1900,16 +1460,16 @@ bool AsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { // FIXME: Validate file. 
Lex(); - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Integer)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.loc' directive"); int64_t Param2 = getTok().getIntVal(); (void) Param2; Lex(); - if (Lexer.isNot(AsmToken::EndOfStatement)) { - if (Lexer.isNot(AsmToken::Integer)) + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.loc' directive"); int64_t Param3 = getTok().getIntVal(); @@ -1920,7 +1480,7 @@ bool AsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { } } - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.file' directive"); return false; diff --git a/lib/MC/MCParser/CMakeLists.txt b/lib/MC/MCParser/CMakeLists.txt index a5c0818..25a7bf4 100644 --- a/lib/MC/MCParser/CMakeLists.txt +++ b/lib/MC/MCParser/CMakeLists.txt @@ -1,7 +1,10 @@ add_llvm_library(LLVMMCParser AsmLexer.cpp AsmParser.cpp + DarwinAsmParser.cpp + ELFAsmParser.cpp MCAsmLexer.cpp MCAsmParser.cpp + MCAsmParserExtension.cpp TargetAsmParser.cpp ) diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp new file mode 100644 index 0000000..7d8639e --- /dev/null +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -0,0 +1,758 @@ +//===- DarwinAsmParser.cpp - Darwin (Mach-O) Assembly Parser --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +using namespace llvm; + +namespace { + +/// \brief Implementation of directive handling which is shared across all +/// Darwin targets. +class DarwinAsmParser : public MCAsmParserExtension { + bool ParseSectionSwitch(const char *Segment, const char *Section, + unsigned TAA = 0, unsigned ImplicitAlign = 0, + unsigned StubSize = 0); + +public: + DarwinAsmParser() {} + + virtual void Initialize(MCAsmParser &Parser) { + // Call the base implementation. 
+ this->MCAsmParserExtension::Initialize(Parser); + + Parser.AddDirectiveHandler(this, ".desc", MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveDesc)); + Parser.AddDirectiveHandler(this, ".lsym", MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveLsym)); + Parser.AddDirectiveHandler(this, ".subsections_via_symbols", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols)); + Parser.AddDirectiveHandler(this, ".dump", MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveDumpOrLoad)); + Parser.AddDirectiveHandler(this, ".load", MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveDumpOrLoad)); + Parser.AddDirectiveHandler(this, ".section", MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveSection)); + Parser.AddDirectiveHandler(this, ".secure_log_unique", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveSecureLogUnique)); + Parser.AddDirectiveHandler(this, ".secure_log_reset", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveSecureLogReset)); + Parser.AddDirectiveHandler(this, ".tbss", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveTBSS)); + Parser.AddDirectiveHandler(this, ".zerofill", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseDirectiveZerofill)); + + // Special section directives. + Parser.AddDirectiveHandler(this, ".const", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveConst)); + Parser.AddDirectiveHandler(this, ".const_data", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveConstData)); + Parser.AddDirectiveHandler(this, ".constructor", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveConstructor)); + Parser.AddDirectiveHandler(this, ".cstring", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveCString)); + Parser.AddDirectiveHandler(this, ".data", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveData)); + Parser.AddDirectiveHandler(this, ".destructor", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveDestructor)); + Parser.AddDirectiveHandler(this, ".dyld", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveDyld)); + Parser.AddDirectiveHandler(this, ".fvmlib_init0", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveFVMLibInit0)); + Parser.AddDirectiveHandler(this, ".fvmlib_init1", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveFVMLibInit1)); + Parser.AddDirectiveHandler(this, ".lazy_symbol_pointer", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers)); + Parser.AddDirectiveHandler(this, ".literal16", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveLiteral16)); + Parser.AddDirectiveHandler(this, ".literal4", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveLiteral4)); + Parser.AddDirectiveHandler(this, ".literal8", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveLiteral8)); + Parser.AddDirectiveHandler(this, ".mod_init_func", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveModInitFunc)); + Parser.AddDirectiveHandler(this, ".mod_term_func", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveModTermFunc)); + Parser.AddDirectiveHandler(this, ".non_lazy_symbol_pointer", + MCAsmParser::DirectiveHandler( + 
&DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers)); + Parser.AddDirectiveHandler(this, ".objc_cat_cls_meth", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth)); + Parser.AddDirectiveHandler(this, ".objc_cat_inst_meth", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth)); + Parser.AddDirectiveHandler(this, ".objc_category", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCCategory)); + Parser.AddDirectiveHandler(this, ".objc_class", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCClass)); + Parser.AddDirectiveHandler(this, ".objc_class_names", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCClassNames)); + Parser.AddDirectiveHandler(this, ".objc_class_vars", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCClassVars)); + Parser.AddDirectiveHandler(this, ".objc_cls_meth", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCClsMeth)); + Parser.AddDirectiveHandler(this, ".objc_cls_refs", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCClsRefs)); + Parser.AddDirectiveHandler(this, ".objc_inst_meth", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCInstMeth)); + Parser.AddDirectiveHandler(this, ".objc_instance_vars", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars)); + Parser.AddDirectiveHandler(this, ".objc_message_refs", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs)); + Parser.AddDirectiveHandler(this, ".objc_meta_class", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCMetaClass)); + Parser.AddDirectiveHandler(this, ".objc_meth_var_names", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames)); + Parser.AddDirectiveHandler(this, ".objc_meth_var_types", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes)); + Parser.AddDirectiveHandler(this, ".objc_module_info", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo)); + Parser.AddDirectiveHandler(this, ".objc_protocol", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCProtocol)); + Parser.AddDirectiveHandler(this, ".objc_selector_strs", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs)); + Parser.AddDirectiveHandler(this, ".objc_string_object", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCStringObject)); + Parser.AddDirectiveHandler(this, ".objc_symbols", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveObjCSymbols)); + Parser.AddDirectiveHandler(this, ".picsymbol_stub", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectivePICSymbolStub)); + Parser.AddDirectiveHandler(this, ".static_const", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveStaticConst)); + Parser.AddDirectiveHandler(this, ".static_data", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveStaticData)); + Parser.AddDirectiveHandler(this, ".symbol_stub", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveSymbolStub)); + Parser.AddDirectiveHandler(this, ".tdata", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveTData)); + 
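Each registration in this long run repeats the same AddDirectiveHandler-plus-DirectiveHandler-cast shape around a (directive, segment, section, attributes) tuple. The patch keeps every call explicit, but for comparison, here is a sketch of how a static table could drive the same wiring; SectionEntry and the three sample rows are invented for illustration:

    #include <cstdio>
    #include <map>
    #include <string>

    struct SectionEntry {
      const char *Directive, *Segment, *Section;
    };

    static const SectionEntry Entries[] = {
      { ".data",  "__DATA", "__data" },
      { ".const", "__TEXT", "__const" },
      { ".tdata", "__DATA", "__thread_data" },
    };

    int main() {
      // One loop stands in for N hand-written registrations.
      std::map<std::string, const SectionEntry *> Handlers;
      for (unsigned i = 0; i != sizeof(Entries) / sizeof(Entries[0]); ++i)
        Handlers[Entries[i].Directive] = &Entries[i];

      const SectionEntry *E = Handlers[".tdata"];
      std::printf("%s -> %s,%s\n", E->Directive, E->Segment, E->Section);
      return 0;
    }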
Parser.AddDirectiveHandler(this, ".text", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveText)); + Parser.AddDirectiveHandler(this, ".thread_init_func", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveThreadInitFunc)); + Parser.AddDirectiveHandler(this, ".tlv", + MCAsmParser::DirectiveHandler( + &DarwinAsmParser::ParseSectionDirectiveTLV)); + } + + bool ParseDirectiveDesc(StringRef, SMLoc); + bool ParseDirectiveDumpOrLoad(StringRef, SMLoc); + bool ParseDirectiveLsym(StringRef, SMLoc); + bool ParseDirectiveSection(); + bool ParseDirectiveSecureLogReset(StringRef, SMLoc); + bool ParseDirectiveSecureLogUnique(StringRef, SMLoc); + bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc); + bool ParseDirectiveTBSS(StringRef, SMLoc); + bool ParseDirectiveZerofill(StringRef, SMLoc); + + // Named Section Directive + bool ParseSectionDirectiveConst(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__const"); + } + bool ParseSectionDirectiveStaticConst(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__static_const"); + } + bool ParseSectionDirectiveCString(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__cstring", + MCSectionMachO::S_CSTRING_LITERALS); + } + bool ParseSectionDirectiveLiteral4(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__literal4", + MCSectionMachO::S_4BYTE_LITERALS, 4); + } + bool ParseSectionDirectiveLiteral8(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__literal8", + MCSectionMachO::S_8BYTE_LITERALS, 8); + } + bool ParseSectionDirectiveLiteral16(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__literal16", + MCSectionMachO::S_16BYTE_LITERALS, 16); + } + bool ParseSectionDirectiveConstructor(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__constructor"); + } + bool ParseSectionDirectiveDestructor(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__destructor"); + } + bool ParseSectionDirectiveFVMLibInit0(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__fvmlib_init0"); + } + bool ParseSectionDirectiveFVMLibInit1(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__fvmlib_init1"); + } + bool ParseSectionDirectiveSymbolStub(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__symbol_stub", + MCSectionMachO::S_SYMBOL_STUBS | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + // FIXME: Different on PPC and ARM. 
+ 0, 16); + } + bool ParseSectionDirectivePICSymbolStub(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT","__picsymbol_stub", + MCSectionMachO::S_SYMBOL_STUBS | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26); + } + bool ParseSectionDirectiveData(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__data"); + } + bool ParseSectionDirectiveStaticData(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__static_data"); + } + bool ParseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__nl_symbol_ptr", + MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, 4); + } + bool ParseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__la_symbol_ptr", + MCSectionMachO::S_LAZY_SYMBOL_POINTERS, 4); + } + bool ParseSectionDirectiveDyld(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__dyld"); + } + bool ParseSectionDirectiveModInitFunc(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__mod_init_func", + MCSectionMachO::S_MOD_INIT_FUNC_POINTERS, 4); + } + bool ParseSectionDirectiveModTermFunc(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__mod_term_func", + MCSectionMachO::S_MOD_TERM_FUNC_POINTERS, 4); + } + bool ParseSectionDirectiveConstData(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__const"); + } + bool ParseSectionDirectiveObjCClass(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__class", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCMetaClass(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__meta_class", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__cat_cls_meth", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__cat_inst_meth", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCProtocol(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__protocol", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCStringObject(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__string_object", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCClsMeth(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__cls_meth", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCInstMeth(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__inst_meth", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCClsRefs(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__cls_refs", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP | + MCSectionMachO::S_LITERAL_POINTERS, 4); + } + bool ParseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__message_refs", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP | + MCSectionMachO::S_LITERAL_POINTERS, 4); + } + bool ParseSectionDirectiveObjCSymbols(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__symbols", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCCategory(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__category", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCClassVars(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__class_vars", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool 
ParseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__instance_vars", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__module_info", + MCSectionMachO::S_ATTR_NO_DEAD_STRIP); + } + bool ParseSectionDirectiveObjCClassNames(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__cstring", + MCSectionMachO::S_CSTRING_LITERALS); + } + bool ParseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__cstring", + MCSectionMachO::S_CSTRING_LITERALS); + } + bool ParseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__cstring", + MCSectionMachO::S_CSTRING_LITERALS); + } + bool ParseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) { + return ParseSectionSwitch("__OBJC", "__selector_strs", + MCSectionMachO::S_CSTRING_LITERALS); + } + bool ParseSectionDirectiveTData(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__thread_data", + MCSectionMachO::S_THREAD_LOCAL_REGULAR); + } + bool ParseSectionDirectiveText(StringRef, SMLoc) { + return ParseSectionSwitch("__TEXT", "__text", + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS); + } + bool ParseSectionDirectiveTLV(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__thread_vars", + MCSectionMachO::S_THREAD_LOCAL_VARIABLES); + } + bool ParseSectionDirectiveThreadInitFunc(StringRef, SMLoc) { + return ParseSectionSwitch("__DATA", "__thread_init", + MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS); + } + +}; + +} + +bool DarwinAsmParser::ParseSectionSwitch(const char *Segment, + const char *Section, + unsigned TAA, unsigned Align, + unsigned StubSize) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in section switching directive"); + Lex(); + + // FIXME: Arch specific. + bool isText = StringRef(Segment) == "__TEXT"; // FIXME: Hack. + getStreamer().SwitchSection(getContext().getMachOSection( + Segment, Section, TAA, StubSize, + isText ? SectionKind::getText() + : SectionKind::getDataRel())); + + // Set the implicit alignment, if any. + // + // FIXME: This isn't really what 'as' does; I think it just uses the implicit + // alignment on the section (e.g., if one manually inserts bytes into the + // section, then just issuing the section switch directive will not realign + // the section). However, this is arguably more reasonable behavior, and there + // is no good reason for someone to intentionally emit incorrectly sized + // values into the implicitly aligned sections. + if (Align) + getStreamer().EmitValueToAlignment(Align, 0, 1, 0); + + return false; +} + +/// ParseDirectiveDesc +/// ::= .desc identifier , expression +bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) { + StringRef Name; + if (getParser().ParseIdentifier(Name)) + return TokError("expected identifier in directive"); + + // Handle the identifier as the key symbol.
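Aside: every wrapper above has the same shape, the directive name selects a fixed (segment, section, type/attribute, alignment) tuple and ParseSectionSwitch does the rest, with dispatch through the AddDirectiveHandler table registered in Initialize. A minimal standalone sketch of that table-driven pattern; MiniParser and its members are hypothetical stand-ins, not the MC API:

    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical stand-in for the directive-handler table above: a
    // directive name maps to a member function, mirroring AddDirectiveHandler.
    class MiniParser {
      typedef bool (MiniParser::*Handler)();
      std::map<std::string, Handler> Handlers;

    public:
      void AddDirectiveHandler(const std::string &Directive, Handler H) {
        Handlers[Directive] = H;
      }

      bool ParseSectionSwitch(const char *Segment, const char *Section,
                              unsigned Align) {
        std::printf("switch to %s,%s (align %u)\n", Segment, Section, Align);
        return false; // false == success, as in the wrappers above
      }

      // Mirrors ParseSectionDirectiveLiteral8: one fixed tuple per directive.
      bool ParseSectionDirectiveLiteral8() {
        return ParseSectionSwitch("__TEXT", "__literal8", 8);
      }

      bool Dispatch(const std::string &Directive) {
        return (this->*Handlers[Directive])();
      }
    };

    int main() {
      MiniParser P;
      P.AddDirectiveHandler(".literal8",
                            &MiniParser::ParseSectionDirectiveLiteral8);
      return P.Dispatch(".literal8") ? 1 : 0;
    }

Below, the .desc parse resumes by materializing the identifier it just read as the key symbol.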
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '.desc' directive"); + Lex(); + + int64_t DescValue; + if (getParser().ParseAbsoluteExpression(DescValue)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.desc' directive"); + + Lex(); + + // Set the n_desc field of this Symbol to this DescValue + getStreamer().EmitSymbolDesc(Sym, DescValue); + + return false; +} + +/// ParseDirectiveDumpOrLoad +/// ::= ( .dump | .load ) "filename" +bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive, + SMLoc IDLoc) { + bool IsDump = Directive == ".dump"; + if (getLexer().isNot(AsmToken::String)) + return TokError("expected string in '.dump' or '.load' directive"); + + Lex(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.dump' or '.load' directive"); + + Lex(); + + // FIXME: If/when .dump and .load are implemented they will be done in + // the assembly parser and not have any need for an MCStreamer API. + if (IsDump) + Warning(IDLoc, "ignoring directive .dump for now"); + else + Warning(IDLoc, "ignoring directive .load for now"); + + return false; +} + +/// ParseDirectiveLsym +/// ::= .lsym identifier , expression +bool DarwinAsmParser::ParseDirectiveLsym(StringRef, SMLoc) { + StringRef Name; + if (getParser().ParseIdentifier(Name)) + return TokError("expected identifier in directive"); + + // Handle the identifier as the key symbol. + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '.lsym' directive"); + Lex(); + + const MCExpr *Value; + if (getParser().ParseExpression(Value)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.lsym' directive"); + + Lex(); + + // We don't currently support this directive. + // + // FIXME: Diagnostic location! + (void) Sym; + return TokError("directive '.lsym' is unsupported"); +} + +/// ParseDirectiveSection: +/// ::= .section identifier (',' identifier)* +bool DarwinAsmParser::ParseDirectiveSection() { + SMLoc Loc = getLexer().getLoc(); + + StringRef SectionName; + if (getParser().ParseIdentifier(SectionName)) + return Error(Loc, "expected identifier after '.section' directive"); + + // Verify there is a following comma. + if (!getLexer().is(AsmToken::Comma)) + return TokError("unexpected token in '.section' directive"); + + std::string SectionSpec = SectionName; + SectionSpec += ","; + + // Add all the tokens until the end of the line; ParseSectionSpecifier will + // handle this. + StringRef EOL = getLexer().LexUntilEndOfStatement(); + SectionSpec.append(EOL.begin(), EOL.end()); + + Lex(); + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.section' directive"); + Lex(); + + + StringRef Segment, Section; + unsigned TAA, StubSize; + std::string ErrorStr = + MCSectionMachO::ParseSectionSpecifier(SectionSpec, Segment, Section, + TAA, StubSize); + + if (!ErrorStr.empty()) + return Error(Loc, ErrorStr.c_str()); + + // FIXME: Arch specific. + bool isText = Segment == "__TEXT"; // FIXME: Hack. + getStreamer().SwitchSection(getContext().getMachOSection( + Segment, Section, TAA, StubSize, + isText ? 
SectionKind::getText() + : SectionKind::getDataRel())); + return false; +} + +/// ParseDirectiveSecureLogUnique +/// ::= .secure_log_unique "log message" +bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { + std::string LogMessage; + + if (getLexer().isNot(AsmToken::String)) + LogMessage = ""; + else { + LogMessage = getTok().getString(); + Lex(); + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.secure_log_unique' directive"); + + if (getContext().getSecureLogUsed()) + return Error(IDLoc, ".secure_log_unique specified multiple times"); + + char *SecureLogFile = getContext().getSecureLogFile(); + if (SecureLogFile == NULL) + return Error(IDLoc, ".secure_log_unique used but AS_SECURE_LOG_FILE " + "environment variable unset."); + + raw_ostream *OS = getContext().getSecureLog(); + if (OS == NULL) { + std::string Err; + OS = new raw_fd_ostream(SecureLogFile, Err, raw_fd_ostream::F_Append); + if (!Err.empty()) { + delete OS; + return Error(IDLoc, Twine("can't open secure log file: ") + + SecureLogFile + " (" + Err + ")"); + } + getContext().setSecureLog(OS); + } + + int CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc); + *OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier() + << ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":" + << LogMessage + "\n"; + + getContext().setSecureLogUsed(true); + + return false; +} + +/// ParseDirectiveSecureLogReset +/// ::= .secure_log_reset +bool DarwinAsmParser::ParseDirectiveSecureLogReset(StringRef, SMLoc IDLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.secure_log_reset' directive"); + + Lex(); + + getContext().setSecureLogUsed(false); + + return false; +} + +/// ParseDirectiveSubsectionsViaSymbols +/// ::= .subsections_via_symbols +bool DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.subsections_via_symbols' directive"); + + Lex(); + + getStreamer().EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + + return false; +} + +/// ParseDirectiveTBSS +/// ::= .tbss identifier, size, align +bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) { + SMLoc IDLoc = getLexer().getLoc(); + StringRef Name; + if (getParser().ParseIdentifier(Name)) + return TokError("expected identifier in directive"); + + // Handle the identifier as the key symbol. + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Size)) + return true; + + int64_t Pow2Alignment = 0; + SMLoc Pow2AlignmentLoc; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + Pow2AlignmentLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Pow2Alignment)) + return true; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.tbss' directive"); + + Lex(); + + if (Size < 0) + return Error(SizeLoc, "invalid '.tbss' directive size, can't be less than " + "zero"); + + // FIXME: Diagnose overflow. 
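Aside: the final operand of .tbss (and of .zerofill below) is a power-of-2 exponent, while EmitTBSSSymbol and EmitZerofill take a byte alignment, hence the 1 << Pow2Alignment at the call sites. A small illustration of that conversion; the overflow that the FIXME above alludes to would be an exponent of 64 or more in this 64-bit sketch:

    #include <cassert>
    #include <cstdint>

    // Directive alignments arrive as a power-of-2 exponent; the streamer
    // wants bytes. Mirrors the 1 << Pow2Alignment conversion in the parser.
    static uint64_t byteAlignment(int64_t Pow2Alignment) {
      assert(Pow2Alignment >= 0 && "parser rejects negative exponents");
      assert(Pow2Alignment < 64 && "would overflow a 64-bit byte alignment");
      return uint64_t(1) << Pow2Alignment;
    }

    int main() {
      // '.tbss _x, 4, 2' => Size == 4, Pow2Alignment == 2 => 4-byte alignment.
      assert(byteAlignment(2) == 4);
      return 0;
    }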
+ if (Pow2Alignment < 0) + return Error(Pow2AlignmentLoc, "invalid '.tbss' alignment, can't be less " + "than zero"); + + if (!Sym->isUndefined()) + return Error(IDLoc, "invalid symbol redefinition"); + + getStreamer().EmitTBSSSymbol(getContext().getMachOSection( + "__DATA", "__thread_bss", + MCSectionMachO::S_THREAD_LOCAL_ZEROFILL, + 0, SectionKind::getThreadBSS()), + Sym, Size, 1 << Pow2Alignment); + + return false; +} + +/// ParseDirectiveZerofill +/// ::= .zerofill segname , sectname [, identifier , size_expression [ +/// , align_expression ]] +bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { + StringRef Segment; + if (getParser().ParseIdentifier(Segment)) + return TokError("expected segment name after '.zerofill' directive"); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + StringRef Section; + if (getParser().ParseIdentifier(Section)) + return TokError("expected section name after comma in '.zerofill' " + "directive"); + + // If this is the end of the line all that was wanted was to create + // the section but with no symbol. + if (getLexer().is(AsmToken::EndOfStatement)) { + // Create the zerofill section but no symbol + getStreamer().EmitZerofill(getContext().getMachOSection( + Segment, Section, MCSectionMachO::S_ZEROFILL, + 0, SectionKind::getBSS())); + return false; + } + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + SMLoc IDLoc = getLexer().getLoc(); + StringRef IDStr; + if (getParser().ParseIdentifier(IDStr)) + return TokError("expected identifier in directive"); + + // Handle the identifier as the key symbol. + MCSymbol *Sym = getContext().GetOrCreateSymbol(IDStr); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Size)) + return true; + + int64_t Pow2Alignment = 0; + SMLoc Pow2AlignmentLoc; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + Pow2AlignmentLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Pow2Alignment)) + return true; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.zerofill' directive"); + + Lex(); + + if (Size < 0) + return Error(SizeLoc, "invalid '.zerofill' directive size, can't be less " + "than zero"); + + // NOTE: The alignment in the directive is a power of 2 value; the assembler + // may internally end up wanting an alignment in bytes. + // FIXME: Diagnose overflow. + if (Pow2Alignment < 0) + return Error(Pow2AlignmentLoc, "invalid '.zerofill' directive alignment, " + "can't be less than zero"); + + if (!Sym->isUndefined()) + return Error(IDLoc, "invalid symbol redefinition"); + + // Create the zerofill Symbol with Size and Pow2Alignment + // + // FIXME: Arch specific. 
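Aside: both .tbss and .zerofill funnel the parsed identifier through GetOrCreateSymbol, which uniques symbols by name; that is what lets the isUndefined() check above reject a second definition before the EmitZerofill call below runs. A toy model of that uniquing-plus-redefinition check, with stand-in types rather than MCContext and MCSymbol:

    #include <cassert>
    #include <map>
    #include <string>

    // Toy model: the context uniques symbols by name, so a second .zerofill
    // naming the same identifier sees the already-defined symbol object.
    struct Symbol {
      bool Defined;
      Symbol() : Defined(false) {}
    };

    struct Context {
      std::map<std::string, Symbol> Symbols;
      Symbol *GetOrCreateSymbol(const std::string &Name) {
        return &Symbols[Name]; // same name always yields the same object
      }
    };

    int main() {
      Context Ctx;
      Symbol *First = Ctx.GetOrCreateSymbol("_buf");
      First->Defined = true; // the first .zerofill defines it
      Symbol *Again = Ctx.GetOrCreateSymbol("_buf");
      assert(Again == First && Again->Defined); // parser diagnoses here
      return 0;
    }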
+ getStreamer().EmitZerofill(getContext().getMachOSection( + Segment, Section, MCSectionMachO::S_ZEROFILL, + 0, SectionKind::getBSS()), + Sym, Size, 1 << Pow2Alignment); + + return false; +} + +namespace llvm { + +MCAsmParserExtension *createDarwinAsmParser() { + return new DarwinAsmParser; +} + +} diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp new file mode 100644 index 0000000..7a54dd3 --- /dev/null +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -0,0 +1,68 @@ +//===- ELFAsmParser.cpp - ELF Assembly Parser -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +using namespace llvm; + +namespace { + +class ELFAsmParser : public MCAsmParserExtension { + bool ParseSectionSwitch(StringRef Section, unsigned Type, + unsigned Flags, SectionKind Kind); + +public: + ELFAsmParser() {} + + virtual void Initialize(MCAsmParser &Parser) { + // Call the base implementation. + this->MCAsmParserExtension::Initialize(Parser); + + Parser.AddDirectiveHandler(this, ".data", MCAsmParser::DirectiveHandler( + &ELFAsmParser::ParseSectionDirectiveData)); + Parser.AddDirectiveHandler(this, ".text", MCAsmParser::DirectiveHandler( + &ELFAsmParser::ParseSectionDirectiveText)); + } + + bool ParseSectionDirectiveData(StringRef, SMLoc) { + return ParseSectionSwitch(".data", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_WRITE |MCSectionELF::SHF_ALLOC, + SectionKind::getDataRel()); + } + bool ParseSectionDirectiveText(StringRef, SMLoc) { + return ParseSectionSwitch(".text", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_EXECINSTR | + MCSectionELF::SHF_ALLOC, SectionKind::getText()); + } +}; + +} + +bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type, + unsigned Flags, SectionKind Kind) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in section switching directive"); + Lex(); + + getStreamer().SwitchSection(getContext().getELFSection( + Section, Type, Flags, Kind)); + + return false; +} + +namespace llvm { + +MCAsmParserExtension *createELFAsmParser() { + return new ELFAsmParser; +} + +} diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp index e5b2955..dceece7 100644 --- a/lib/MC/MCParser/MCAsmLexer.cpp +++ b/lib/MC/MCParser/MCAsmLexer.cpp @@ -12,12 +12,16 @@ using namespace llvm; -MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()) { +MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), TokStart(0) { } MCAsmLexer::~MCAsmLexer() { } +SMLoc MCAsmLexer::getLoc() const { + return SMLoc::getFromPointer(TokStart); +} + SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); } diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp index b8c2054..bee3064 100644 --- a/lib/MC/MCParser/MCAsmParser.cpp +++ b/lib/MC/MCParser/MCAsmParser.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/Support/SourceMgr.h" @@ -23,6 
+24,11 @@ const AsmToken &MCAsmParser::getTok() { return getLexer().getTok(); } +bool MCAsmParser::TokError(const char *Msg) { + Error(getLexer().getLoc(), Msg); + return true; +} + bool MCAsmParser::ParseExpression(const MCExpr *&Res) { SMLoc L; return ParseExpression(Res, L); diff --git a/lib/MC/MCParser/MCAsmParserExtension.cpp b/lib/MC/MCParser/MCAsmParserExtension.cpp new file mode 100644 index 0000000..c30d306 --- /dev/null +++ b/lib/MC/MCParser/MCAsmParserExtension.cpp @@ -0,0 +1,21 @@ +//===-- MCAsmParserExtension.cpp - Asm Parser Hooks -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCParser/MCAsmParserExtension.h" +using namespace llvm; + +MCAsmParserExtension::MCAsmParserExtension() { +} + +MCAsmParserExtension::~MCAsmParserExtension() { +} + +void MCAsmParserExtension::Initialize(MCAsmParser &Parser) { + this->Parser = &Parser; +} diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp index d57bb0c..eb53160 100644 --- a/lib/MC/MCSectionCOFF.cpp +++ b/lib/MC/MCSectionCOFF.cpp @@ -44,28 +44,28 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << 'w'; else OS << 'r'; - if (getCharacteristics() & MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE) + if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) OS << 'n'; OS << "\"\n"; - if (getCharacteristics() & MCSectionCOFF::IMAGE_SCN_LNK_COMDAT) { + if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { switch (Selection) { - case IMAGE_COMDAT_SELECT_NODUPLICATES: + case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: OS << "\t.linkonce one_only\n"; break; - case IMAGE_COMDAT_SELECT_ANY: + case COFF::IMAGE_COMDAT_SELECT_ANY: OS << "\t.linkonce discard\n"; break; - case IMAGE_COMDAT_SELECT_SAME_SIZE: + case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: OS << "\t.linkonce same_size\n"; break; - case IMAGE_COMDAT_SELECT_EXACT_MATCH: + case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: OS << "\t.linkonce same_contents\n"; break; // NOTE: as of binutils 2.20, there is no way to specify select largest // with the .linkonce directive. For now, we treat it as an invalid // comdat selection value. 
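Aside, stepping back to the MCAsmParser hunk above: the new TokError helper captures the error-handling idiom used throughout both parser extensions, namely report a diagnostic at the current token's location and return true so the caller can bail out in a single statement. A compact sketch of that shape with stand-in types (the real Error routes through SourceMgr diagnostics):

    #include <cstdio>

    // Stand-in sketch of the 'diagnose and return true' convention.
    struct MiniAsmParser {
      unsigned CurLoc; // pretend location of the current token
      MiniAsmParser() : CurLoc(0) {}

      bool Error(unsigned Loc, const char *Msg) {
        std::fprintf(stderr, "error at %u: %s\n", Loc, Msg);
        return true; // true == failure throughout these parsers
      }

      bool TokError(const char *Msg) { return Error(CurLoc, Msg); }

      bool ParseDirectiveFoo(bool SawComma) {
        if (!SawComma)
          return TokError("unexpected token in '.foo' directive");
        return false; // parsed OK
      }
    };

    int main() { return MiniAsmParser().ParseDirectiveFoo(true) ? 1 : 0; }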
- case IMAGE_COMDAT_SELECT_LARGEST: + case COFF::IMAGE_COMDAT_SELECT_LARGEST: // OS << "\t.linkonce largest\n"; // break; default: diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 3207e99..7ca0951 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -33,6 +33,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { default: llvm_unreachable("invalid fixup kind!"); case X86::reloc_pcrel_1byte: case FK_Data_1: return 0; + case X86::reloc_pcrel_2byte: case FK_Data_2: return 1; case X86::reloc_pcrel_4byte: case X86::reloc_riprel_4byte: @@ -47,6 +48,7 @@ static bool isFixupKindPCRel(unsigned Kind) { default: return false; case X86::reloc_pcrel_1byte: + case X86::reloc_pcrel_2byte: case X86::reloc_pcrel_4byte: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: @@ -738,6 +740,51 @@ public: Relocations[Fragment->getParent()].push_back(MRE); } + void RecordTLVPRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && + !Is64Bit && + "Should only be called with a 32-bit TLVP relocation!"); + + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = 0; + + // Get the symbol data. + MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + unsigned Index = SD_A->getIndex(); + + // We're only going to have a second symbol in pic mode and it'll be a + // subtraction from the picbase. For 32-bit pic the addend is the difference + // between the picbase and the next address. For 32-bit static the addend + // is zero. + if (Target.getSymB()) { + // If this is a subtraction then we're pcrel. + uint32_t FixupAddress = + Layout.getFragmentAddress(Fragment) + Fixup.getOffset(); + MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol()); + IsPCRel = 1; + FixedValue = (FixupAddress - Layout.getSymbolAddress(SD_B) + + Target.getConstant()); + FixedValue += 1 << Log2Size; + } else { + FixedValue = 0; + } + + // struct relocation_info (8 bytes) + MachRelocationEntry MRE; + MRE.Word0 = Value; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (1 << 27) | // Extern + (RIT_TLV << 28)); // Type + Relocations[Fragment->getParent()].push_back(MRE); + } + void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { @@ -749,6 +796,12 @@ public: unsigned IsPCRel = isFixupKindPCRel(Fixup.getKind()); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + // If this is a 32-bit TLVP reloc it's handled a bit differently. + if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) { + RecordTLVPRelocation(Asm, Layout, Fragment, Fixup, Target, FixedValue); + return; + } + // If this is a difference or a defined symbol plus an offset, then we need // a scattered relocation entry. // Differences always require scattered relocations. @@ -772,7 +825,6 @@ public: // See <reloc.h>. uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); - uint32_t Value = 0; unsigned Index = 0; unsigned IsExtern = 0; unsigned Type = 0; @@ -783,7 +835,6 @@ public: // FIXME: Currently, these are never generated (see code below). I cannot // find a case where they are actually emitted. 
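Aside: in the picbase branch of RecordTLVPRelocation above, the stored FixedValue is the distance from the subtracted symbol to the end of the fixup field, which is why 1 << Log2Size is added on top of the plain address difference. A worked example of that arithmetic with invented addresses:

    #include <cassert>
    #include <cstdint>

    // Worked instance of the TLVP addend computation; numbers invented.
    int main() {
      uint64_t FragmentAddr = 0x1000, FixupOffset = 0x10;
      uint64_t PicbaseAddr = 0x0f00; // layout address of SymB
      int64_t Constant = 0;          // Target.getConstant()
      unsigned Log2Size = 2;         // 4-byte fixup

      uint64_t FixupAddress = FragmentAddr + FixupOffset;          // 0x1010
      uint64_t FixedValue = FixupAddress - PicbaseAddr + Constant; // 0x110
      FixedValue += uint64_t(1) << Log2Size;                       // + 4
      assert(FixedValue == 0x114);
      return 0;
    }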
Type = RIT_Vanilla; - Value = 0; } else { // Check whether we need an external or internal relocation. if (doesSymbolRequireExternRelocation(SD)) { @@ -794,11 +845,9 @@ public: // undefined. This occurs with weak definitions, for example. if (!SD->Symbol->isUndefined()) FixedValue -= Layout.getSymbolAddress(SD); - Value = 0; } else { // The index is the section ordinal (1-based). Index = SD->getFragment()->getParent()->getOrdinal() + 1; - Value = Layout.getSymbolAddress(SD); } Type = RIT_Vanilla; @@ -898,7 +947,7 @@ public: const MCSymbol &Symbol = it->getSymbol(); // Ignore non-linker visible symbols. - if (!Asm.isSymbolLinkerVisible(it)) + if (!Asm.isSymbolLinkerVisible(it->getSymbol())) continue; if (!it->isExternal() && !Symbol.isUndefined()) @@ -934,7 +983,7 @@ public: const MCSymbol &Symbol = it->getSymbol(); // Ignore non-linker visible symbols. - if (!Asm.isSymbolLinkerVisible(it)) + if (!Asm.isSymbolLinkerVisible(it->getSymbol())) continue; if (it->isExternal() || Symbol.isUndefined()) diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..6804766 --- /dev/null +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -0,0 +1,71 @@ +//===-- llvm/MC/WinCOFFObjectWriter.cpp -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an implementation of a Win32 COFF object file writer. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "WinCOFFObjectWriter" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +using namespace llvm; + +namespace { + + class WinCOFFObjectWriter : public MCObjectWriter { + public: + WinCOFFObjectWriter(raw_ostream &OS); + + // MCObjectWriter interface implementation. 
+ + void ExecutePostLayoutBinding(MCAssembler &Asm); + + void RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout); + }; +} + +WinCOFFObjectWriter::WinCOFFObjectWriter(raw_ostream &OS) + : MCObjectWriter(OS, true) { +} + +//////////////////////////////////////////////////////////////////////////////// +// MCObjectWriter interface implementations + +void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm) { +} + +void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { +} + +void WinCOFFObjectWriter::WriteObject(const MCAssembler &Asm, + const MCAsmLayout &Layout) { +} + +//------------------------------------------------------------------------------ +// WinCOFFObjectWriter factory function + +namespace llvm { + MCObjectWriter *createWinCOFFObjectWriter(raw_ostream &OS) { + return new WinCOFFObjectWriter(OS); + } +} diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp new file mode 100644 index 0000000..1030cdb --- /dev/null +++ b/lib/MC/WinCOFFStreamer.cpp @@ -0,0 +1,198 @@ +//===-- llvm/MC/WinCOFFStreamer.cpp -----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an implementation of a Win32 COFF object file streamer. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "WinCOFFStreamer" + +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/Target/TargetAsmBackend.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define dbg_notimpl(x) \ + do { dbgs() << "not implemented, " << __FUNCTION__ << " (" << x << ")"; \ + abort(); } while (false) + +namespace { +class WinCOFFStreamer : public MCObjectStreamer { +public: + WinCOFFStreamer(MCContext &Context, + TargetAsmBackend &TAB, + MCCodeEmitter &CE, + raw_ostream &OS); + + // MCStreamer interface + + virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); + virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); + virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue); + virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol); + virtual void EmitCOFFSymbolStorageClass(int StorageClass); + virtual void EmitCOFFSymbolType(int Type); + virtual void EndCOFFSymbolDef(); + virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); + virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment); + virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size); + virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, + unsigned Size, unsigned ByteAlignment); + virtual void 
EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, + uint64_t Size, unsigned ByteAlignment); + virtual void EmitBytes(StringRef Data, unsigned AddrSpace); + virtual void EmitValue(const MCExpr *Value, unsigned Size, + unsigned AddrSpace); + virtual void EmitGPRel32Value(const MCExpr *Value); + virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, + unsigned ValueSize, unsigned MaxBytesToEmit); + virtual void EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit); + virtual void EmitValueToOffset(const MCExpr *Offset, unsigned char Value); + virtual void EmitFileDirective(StringRef Filename); + virtual void EmitDwarfFileDirective(unsigned FileNo, StringRef Filename); + virtual void EmitInstruction(const MCInst &Instruction); + virtual void Finish(); +}; +} // end anonymous namespace. + +WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, + TargetAsmBackend &TAB, + MCCodeEmitter &CE, + raw_ostream &OS) + : MCObjectStreamer(Context, TAB, OS, &CE) { +} + +// MCStreamer interface + +void WinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { +} + +void WinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { + dbg_notimpl("Flag = " << Flag); +} + +void WinCOFFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { +} + +void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, + MCSymbolAttr Attribute) { +} + +void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { + dbg_notimpl("Symbol = " << Symbol->getName() << ", DescValue = " << DescValue); +} + +void WinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { +} + +void WinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { +} + +void WinCOFFStreamer::EmitCOFFSymbolType(int Type) { +} + +void WinCOFFStreamer::EndCOFFSymbolDef() { +} + +void WinCOFFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { + dbg_notimpl("Symbol = " << Symbol->getName() << ", Value = " << *Value); +} + +void WinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { +} + +void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) { +} + +void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, + unsigned Size, unsigned ByteAlignment) { + MCSectionCOFF const *SectionCOFF = + static_cast<MCSectionCOFF const *>(Section); + + dbg_notimpl("Section = " << SectionCOFF->getSectionName() << ", Symbol = " << + Symbol->getName() << ", Size = " << Size << ", ByteAlignment = " + << ByteAlignment); +} + +void WinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, + uint64_t Size, unsigned ByteAlignment) { + MCSectionCOFF const *SectionCOFF = + static_cast<MCSectionCOFF const *>(Section); + + dbg_notimpl("Section = " << SectionCOFF->getSectionName() << ", Symbol = " << + Symbol->getName() << ", Size = " << Size << ", ByteAlignment = " + << ByteAlignment); +} + +void WinCOFFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { +} + +void WinCOFFStreamer::EmitValue(const MCExpr *Value, unsigned Size, + unsigned AddrSpace) { +} + +void WinCOFFStreamer::EmitGPRel32Value(const MCExpr *Value) { + dbg_notimpl("Value = '" << *Value); +} + +void WinCOFFStreamer::EmitValueToAlignment(unsigned ByteAlignment, + int64_t Value, + unsigned ValueSize, + unsigned MaxBytesToEmit) { +} + +void WinCOFFStreamer::EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit) { +} + +void WinCOFFStreamer::EmitValueToOffset(const MCExpr *Offset, + unsigned char Value) { + dbg_notimpl("Offset = '" << 
*Offset << "', Value = " << Value); +} + +void WinCOFFStreamer::EmitFileDirective(StringRef Filename) { + // Ignore for now, linkers don't care, and proper debug + // info will be a much larger effort. +} + +void WinCOFFStreamer::EmitDwarfFileDirective(unsigned FileNo, + StringRef Filename) { + dbg_notimpl("FileNo = " << FileNo << ", Filename = '" << Filename << "'"); +} + +void WinCOFFStreamer::EmitInstruction(const MCInst &Instruction) { +} + +void WinCOFFStreamer::Finish() { + MCObjectStreamer::Finish(); +} + +namespace llvm +{ + MCStreamer *createWinCOFFStreamer(MCContext &Context, + TargetAsmBackend &TAB, + MCCodeEmitter &CE, + raw_ostream &OS) { + return new WinCOFFStreamer(Context, TAB, CE, OS); + } +} diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index f1347f9..366d2f7 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(LLVMSupport ConstantRange.cpp Debug.cpp DeltaAlgorithm.cpp + DAGDeltaAlgorithm.cpp Dwarf.cpp ErrorHandling.cpp FileUtilities.cpp diff --git a/lib/Support/DAGDeltaAlgorithm.cpp b/lib/Support/DAGDeltaAlgorithm.cpp new file mode 100644 index 0000000..8145664 --- /dev/null +++ b/lib/Support/DAGDeltaAlgorithm.cpp @@ -0,0 +1,357 @@ +//===--- DAGDeltaAlgorithm.cpp - A DAG Minimization Algorithm --*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +//===----------------------------------------------------------------------===// +// +// The algorithm we use attempts to exploit the dependency information by +// minimizing top-down. We start by constructing an initial root set R, and +// then iteratively: +// +// 1. Minimize the set R using the test predicate: +// P'(S) = P(S union pred*(S)) +// +// 2. Extend R to R' = R union pred(R). +// +// until a fixed point is reached. +// +// The idea is that we want to quickly prune entire portions of the graph, so we +// try to find high-level nodes that can be eliminated with all of their +// dependents. +// +// FIXME: The current algorithm doesn't actually provide a strong guarantee +// about the minimality of the result. The problem is that after adding nodes to +// the required set, we no longer consider them for elimination. For strictly +// well-formed predicates, this doesn't happen, but it commonly occurs in +// practice when there are unmodelled dependencies. I believe we can resolve +// this by allowing the required set to be minimized as well, but need more test +// cases first. 
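Aside: the minimize/extend loop in this header comment is easiest to see on a tiny graph. The sketch below exercises the public interface as this patch defines it, assuming the header exposes change_ty/changeset_ty/edge_ty as the implementation uses them, that an edge (A, B) is read as "B depends on A", and that ExecuteOneTest is the only pure virtual hook. With changes {1, 2, 3}, edges (1, 2) and (2, 3), and a predicate that passes exactly when change 3 is present, the root set is {3}, the extended test P({3} union pred*(3)) = P({1, 2, 3}) passes, and hand-tracing Run() (with the underlying DeltaAlgorithm checking the empty set first) converges to {3}:

    #include "llvm/ADT/DAGDeltaAlgorithm.h"
    #include <utility>
    #include <vector>

    using namespace llvm;

    namespace {
    // Predicate: the pred-closure-extended change set must contain change 3.
    struct ContainsThree : DAGDeltaAlgorithm {
      virtual bool ExecuteOneTest(const changeset_ty &S) {
        return S.count(3) != 0;
      }
    };
    }

    int main() {
      ContainsThree DDA;
      DAGDeltaAlgorithm::changeset_ty Changes;
      for (unsigned i = 1; i <= 3; ++i)
        Changes.insert(i);

      std::vector<DAGDeltaAlgorithm::edge_ty> Deps;
      Deps.push_back(std::make_pair(1u, 2u)); // change 2 depends on change 1
      Deps.push_back(std::make_pair(2u, 3u)); // change 3 depends on change 2

      DAGDeltaAlgorithm::changeset_ty Min = DDA.Run(Changes, Deps);
      // The client reconstructs the failing configuration as
      // Min union pred*(Min) == {1, 2, 3}.
      return (Min.size() == 1 && Min.count(3)) ? 0 : 1;
    }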
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DAGDeltaAlgorithm.h" +#include "llvm/ADT/DeltaAlgorithm.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <map> +using namespace llvm; + +namespace { + +class DAGDeltaAlgorithmImpl { + friend class DeltaActiveSetHelper; + +public: + typedef DAGDeltaAlgorithm::change_ty change_ty; + typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; + typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; + typedef DAGDeltaAlgorithm::edge_ty edge_ty; + +private: + typedef std::vector<change_ty>::iterator pred_iterator_ty; + typedef std::vector<change_ty>::iterator succ_iterator_ty; + typedef std::set<change_ty>::iterator pred_closure_iterator_ty; + typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + + DAGDeltaAlgorithm &DDA; + + const changeset_ty &Changes; + const std::vector<edge_ty> &Dependencies; + + std::vector<change_ty> Roots; + + /// Cache of failed test results. Successful test results are never cached + /// since we always reduce following a success. We maintain an independent + /// cache from that used by the individual delta passes because we may get + /// hits across multiple individual delta invocations. + mutable std::set<changeset_ty> FailedTestsCache; + + // FIXME: Gross. + std::map<change_ty, std::vector<change_ty> > Predecessors; + std::map<change_ty, std::vector<change_ty> > Successors; + + std::map<change_ty, std::set<change_ty> > PredClosure; + std::map<change_ty, std::set<change_ty> > SuccClosure; + +private: + pred_iterator_ty pred_begin(change_ty Node) { + assert(Predecessors.count(Node) && "Invalid node!"); + return Predecessors[Node].begin(); + } + pred_iterator_ty pred_end(change_ty Node) { + assert(Predecessors.count(Node) && "Invalid node!"); + return Predecessors[Node].end(); + } + + pred_closure_iterator_ty pred_closure_begin(change_ty Node) { + assert(PredClosure.count(Node) && "Invalid node!"); + return PredClosure[Node].begin(); + } + pred_closure_iterator_ty pred_closure_end(change_ty Node) { + assert(PredClosure.count(Node) && "Invalid node!"); + return PredClosure[Node].end(); + } + + succ_iterator_ty succ_begin(change_ty Node) { + assert(Successors.count(Node) && "Invalid node!"); + return Successors[Node].begin(); + } + succ_iterator_ty succ_end(change_ty Node) { + assert(Successors.count(Node) && "Invalid node!"); + return Successors[Node].end(); + } + + succ_closure_iterator_ty succ_closure_begin(change_ty Node) { + assert(SuccClosure.count(Node) && "Invalid node!"); + return SuccClosure[Node].begin(); + } + succ_closure_iterator_ty succ_closure_end(change_ty Node) { + assert(SuccClosure.count(Node) && "Invalid node!"); + return SuccClosure[Node].end(); + } + + void UpdatedSearchState(const changeset_ty &Changes, + const changesetlist_ty &Sets, + const changeset_ty &Required) { + DDA.UpdatedSearchState(Changes, Sets, Required); + } + + /// ExecuteOneTest - Execute a single test predicate on the change set \arg S. + bool ExecuteOneTest(const changeset_ty &S) { + // Check dependencies invariant. 
+ DEBUG({ + for (changeset_ty::const_iterator it = S.begin(), + ie = S.end(); it != ie; ++it) + for (succ_iterator_ty it2 = succ_begin(*it), + ie2 = succ_end(*it); it2 != ie2; ++it2) + assert(S.count(*it2) && "Attempt to run invalid changeset!"); + }); + + return DDA.ExecuteOneTest(S); + } + +public: + DAGDeltaAlgorithmImpl(DAGDeltaAlgorithm &_DDA, + const changeset_ty &_Changes, + const std::vector<edge_ty> &_Dependencies); + + changeset_ty Run(); + + /// GetTestResult - Get the test result for the active set \arg Changes with + /// \arg Required changes from the cache, executing the test if necessary. + /// + /// \param Changes - The set of active changes being minimized, which should + /// have their pred closure included in the test. + /// \param Required - The set of changes which have previously been + /// established to be required. + /// \return - The test result. + bool GetTestResult(const changeset_ty &Changes, const changeset_ty &Required); +}; + +/// Helper object for minimizing an active set of changes. +class DeltaActiveSetHelper : public DeltaAlgorithm { + DAGDeltaAlgorithmImpl &DDAI; + + const changeset_ty &Required; + +protected: + /// UpdatedSearchState - Callback used when the search state changes. + virtual void UpdatedSearchState(const changeset_ty &Changes, + const changesetlist_ty &Sets) { + DDAI.UpdatedSearchState(Changes, Sets, Required); + } + + virtual bool ExecuteOneTest(const changeset_ty &S) { + return DDAI.GetTestResult(S, Required); + } + +public: + DeltaActiveSetHelper(DAGDeltaAlgorithmImpl &_DDAI, + const changeset_ty &_Required) + : DDAI(_DDAI), Required(_Required) {} +}; + +} + +DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl(DAGDeltaAlgorithm &_DDA, + const changeset_ty &_Changes, + const std::vector<edge_ty> + &_Dependencies) + : DDA(_DDA), + Changes(_Changes), + Dependencies(_Dependencies) +{ + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) { + Predecessors.insert(std::make_pair(*it, std::vector<change_ty>())); + Successors.insert(std::make_pair(*it, std::vector<change_ty>())); + } + for (std::vector<edge_ty>::const_iterator it = Dependencies.begin(), + ie = Dependencies.end(); it != ie; ++it) { + Predecessors[it->second].push_back(it->first); + Successors[it->first].push_back(it->second); + } + + // Compute the roots. + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) + if (succ_begin(*it) == succ_end(*it)) + Roots.push_back(*it); + + // Pre-compute the closure of the successor relation. + std::vector<change_ty> Worklist(Roots.begin(), Roots.end()); + while (!Worklist.empty()) { + change_ty Change = Worklist.back(); + Worklist.pop_back(); + + std::set<change_ty> &ChangeSuccs = SuccClosure[Change]; + for (pred_iterator_ty it = pred_begin(Change), + ie = pred_end(Change); it != ie; ++it) { + SuccClosure[*it].insert(Change); + SuccClosure[*it].insert(ChangeSuccs.begin(), ChangeSuccs.end()); + Worklist.push_back(*it); + } + } + + // Invert to form the predecessor closure map. + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) + PredClosure.insert(std::make_pair(*it, std::set<change_ty>())); + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) + for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), + ie2 = succ_closure_end(*it); it2 != ie2; ++it2) + PredClosure[*it2].insert(*it); + + // Dump useful debug info. 
+ DEBUG({ + llvm::errs() << "-- DAGDeltaAlgorithmImpl --\n"; + llvm::errs() << "Changes: ["; + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) { + if (it != Changes.begin()) llvm::errs() << ", "; + llvm::errs() << *it; + + if (succ_begin(*it) != succ_end(*it)) { + llvm::errs() << "("; + for (succ_iterator_ty it2 = succ_begin(*it), + ie2 = succ_end(*it); it2 != ie2; ++it2) { + if (it2 != succ_begin(*it)) llvm::errs() << ", "; + llvm::errs() << "->" << *it2; + } + llvm::errs() << ")"; + } + } + llvm::errs() << "]\n"; + + llvm::errs() << "Roots: ["; + for (std::vector<change_ty>::const_iterator it = Roots.begin(), + ie = Roots.end(); it != ie; ++it) { + if (it != Roots.begin()) llvm::errs() << ", "; + llvm::errs() << *it; + } + llvm::errs() << "]\n"; + + llvm::errs() << "Predecessor Closure:\n"; + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) { + llvm::errs() << format(" %-4d: [", *it); + for (pred_closure_iterator_ty it2 = pred_closure_begin(*it), + ie2 = pred_closure_end(*it); it2 != ie2; ++it2) { + if (it2 != pred_closure_begin(*it)) llvm::errs() << ", "; + llvm::errs() << *it2; + } + llvm::errs() << "]\n"; + } + + llvm::errs() << "Successor Closure:\n"; + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) { + llvm::errs() << format(" %-4d: [", *it); + for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), + ie2 = succ_closure_end(*it); it2 != ie2; ++it2) { + if (it2 != succ_closure_begin(*it)) llvm::errs() << ", "; + llvm::errs() << *it2; + } + llvm::errs() << "]\n"; + } + + llvm::errs() << "\n\n"; + }); +} + +bool DAGDeltaAlgorithmImpl::GetTestResult(const changeset_ty &Changes, + const changeset_ty &Required) { + changeset_ty Extended(Required); + Extended.insert(Changes.begin(), Changes.end()); + for (changeset_ty::const_iterator it = Changes.begin(), + ie = Changes.end(); it != ie; ++it) + Extended.insert(pred_closure_begin(*it), pred_closure_end(*it)); + + if (FailedTestsCache.count(Extended)) + return false; + + bool Result = ExecuteOneTest(Extended); + if (!Result) + FailedTestsCache.insert(Extended); + + return Result; +} + +DAGDeltaAlgorithm::changeset_ty +DAGDeltaAlgorithmImpl::Run() { + // The current set of changes we are minimizing, starting at the roots. + changeset_ty CurrentSet(Roots.begin(), Roots.end()); + + // The set of required changes. + changeset_ty Required; + + // Iterate until the active set of changes is empty. Convergence is guaranteed + // assuming input was a DAG. + // + // Invariant: CurrentSet intersect Required == {} + // Invariant: Required == (Required union succ*(Required)) + while (!CurrentSet.empty()) { + DEBUG({ + llvm::errs() << "DAG_DD - " << CurrentSet.size() << " active changes, " + << Required.size() << " required changes\n"; + }); + + // Minimize the current set of changes. + DeltaActiveSetHelper Helper(*this, Required); + changeset_ty CurrentMinSet = Helper.Run(CurrentSet); + + // Update the set of required changes. Since + // CurrentMinSet subset CurrentSet + // and after the last iteration, + // succ(CurrentSet) subset Required + // then + // succ(CurrentMinSet) subset Required + // and our invariant on Required is maintained. + Required.insert(CurrentMinSet.begin(), CurrentMinSet.end()); + + // Replace the current set with the predecessors of the minimized set of + // active changes. 
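Aside, before Run() wraps up below: a couple of hunks further down, DeltaAlgorithm::Split is also revised, from parity interleaving to contiguous halves. A standalone sketch of the new behavior on the set {1..6}, mirroring the typedefs the real class uses:

    #include <cassert>
    #include <set>

    typedef std::set<unsigned> changeset_ty;

    // Mirrors the revised Split: the first half of the iteration order goes
    // to LHS, the remainder to RHS (the old code alternated by index parity,
    // yielding {2,4,6} / {1,3,5} for this input).
    static void split(const changeset_ty &S, changeset_ty &LHS,
                      changeset_ty &RHS) {
      unsigned idx = 0, N = S.size() / 2;
      for (changeset_ty::const_iterator it = S.begin(), ie = S.end();
           it != ie; ++it, ++idx)
        ((idx < N) ? LHS : RHS).insert(*it);
    }

    int main() {
      changeset_ty S, L, R;
      for (unsigned i = 1; i <= 6; ++i)
        S.insert(i);
      split(S, L, R);
      assert(L.count(1) && L.count(3) && !L.count(4)); // L == {1,2,3}
      return 0;
    }

Presumably the contiguous split keeps adjacent changes together, which tends to converge faster when failures are caused by runs of related changes.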
+ CurrentSet.clear(); + for (changeset_ty::const_iterator it = CurrentMinSet.begin(), + ie = CurrentMinSet.end(); it != ie; ++it) + CurrentSet.insert(pred_begin(*it), pred_end(*it)); + + // FIXME: We could enforce CurrentSet intersect Required == {} here if we + // wanted to protect against cyclic graphs. + } + + return Required; +} + +DAGDeltaAlgorithm::changeset_ty +DAGDeltaAlgorithm::Run(const changeset_ty &Changes, + const std::vector<edge_ty> &Dependencies) { + return DAGDeltaAlgorithmImpl(*this, Changes, Dependencies).Run(); +} diff --git a/lib/Support/DeltaAlgorithm.cpp b/lib/Support/DeltaAlgorithm.cpp index d176548..9e52874 100644 --- a/lib/Support/DeltaAlgorithm.cpp +++ b/lib/Support/DeltaAlgorithm.cpp @@ -30,10 +30,10 @@ void DeltaAlgorithm::Split(const changeset_ty &S, changesetlist_ty &Res) { // FIXME: This is really slow. changeset_ty LHS, RHS; - unsigned idx = 0; + unsigned idx = 0, N = S.size() / 2; for (changeset_ty::const_iterator it = S.begin(), ie = S.end(); it != ie; ++it, ++idx) - ((idx & 1) ? LHS : RHS).insert(*it); + ((idx < N) ? LHS : RHS).insert(*it); if (!LHS.empty()) Res.push_back(LHS); if (!RHS.empty()) diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index c19c2d6..96ce9d3 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -86,8 +86,8 @@ const char *llvm::dwarf::TagString(unsigned Tag) { /// const char *llvm::dwarf::ChildrenString(unsigned Children) { switch (Children) { - case DW_CHILDREN_no: return "CHILDREN_no"; - case DW_CHILDREN_yes: return "CHILDREN_yes"; + case DW_CHILDREN_no: return "DW_CHILDREN_no"; + case DW_CHILDREN_yes: return "DW_CHILDREN_yes"; } return 0; } @@ -207,27 +207,27 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { /// const char *llvm::dwarf::FormEncodingString(unsigned Encoding) { switch (Encoding) { - case DW_FORM_addr: return "FORM_addr"; - case DW_FORM_block2: return "FORM_block2"; - case DW_FORM_block4: return "FORM_block4"; - case DW_FORM_data2: return "FORM_data2"; - case DW_FORM_data4: return "FORM_data4"; - case DW_FORM_data8: return "FORM_data8"; - case DW_FORM_string: return "FORM_string"; - case DW_FORM_block: return "FORM_block"; - case DW_FORM_block1: return "FORM_block1"; - case DW_FORM_data1: return "FORM_data1"; - case DW_FORM_flag: return "FORM_flag"; - case DW_FORM_sdata: return "FORM_sdata"; - case DW_FORM_strp: return "FORM_strp"; - case DW_FORM_udata: return "FORM_udata"; - case DW_FORM_ref_addr: return "FORM_ref_addr"; - case DW_FORM_ref1: return "FORM_ref1"; - case DW_FORM_ref2: return "FORM_ref2"; - case DW_FORM_ref4: return "FORM_ref4"; - case DW_FORM_ref8: return "FORM_ref8"; - case DW_FORM_ref_udata: return "FORM_ref_udata"; - case DW_FORM_indirect: return "FORM_indirect"; + case DW_FORM_addr: return "DW_FORM_addr"; + case DW_FORM_block2: return "DW_FORM_block2"; + case DW_FORM_block4: return "DW_FORM_block4"; + case DW_FORM_data2: return "DW_FORM_data2"; + case DW_FORM_data4: return "DW_FORM_data4"; + case DW_FORM_data8: return "DW_FORM_data8"; + case DW_FORM_string: return "DW_FORM_string"; + case DW_FORM_block: return "DW_FORM_block"; + case DW_FORM_block1: return "DW_FORM_block1"; + case DW_FORM_data1: return "DW_FORM_data1"; + case DW_FORM_flag: return "DW_FORM_flag"; + case DW_FORM_sdata: return "DW_FORM_sdata"; + case DW_FORM_strp: return "DW_FORM_strp"; + case DW_FORM_udata: return "DW_FORM_udata"; + case DW_FORM_ref_addr: return "DW_FORM_ref_addr"; + case DW_FORM_ref1: return "DW_FORM_ref1"; + case DW_FORM_ref2: return "DW_FORM_ref2"; + case DW_FORM_ref4: 
return "DW_FORM_ref4"; + case DW_FORM_ref8: return "DW_FORM_ref8"; + case DW_FORM_ref_udata: return "DW_FORM_ref_udata"; + case DW_FORM_indirect: return "DW_FORM_indirect"; } return 0; } @@ -236,72 +236,159 @@ const char *llvm::dwarf::FormEncodingString(unsigned Encoding) { /// encoding. const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) { switch (Encoding) { - case DW_OP_addr: return "OP_addr"; - case DW_OP_deref: return "OP_deref"; - case DW_OP_const1u: return "OP_const1u"; - case DW_OP_const1s: return "OP_const1s"; - case DW_OP_const2u: return "OP_const2u"; - case DW_OP_const2s: return "OP_const2s"; - case DW_OP_const4u: return "OP_const4u"; - case DW_OP_const4s: return "OP_const4s"; - case DW_OP_const8u: return "OP_const8u"; - case DW_OP_const8s: return "OP_const8s"; - case DW_OP_constu: return "OP_constu"; - case DW_OP_consts: return "OP_consts"; - case DW_OP_dup: return "OP_dup"; - case DW_OP_drop: return "OP_drop"; - case DW_OP_over: return "OP_over"; - case DW_OP_pick: return "OP_pick"; - case DW_OP_swap: return "OP_swap"; - case DW_OP_rot: return "OP_rot"; - case DW_OP_xderef: return "OP_xderef"; - case DW_OP_abs: return "OP_abs"; - case DW_OP_and: return "OP_and"; - case DW_OP_div: return "OP_div"; - case DW_OP_minus: return "OP_minus"; - case DW_OP_mod: return "OP_mod"; - case DW_OP_mul: return "OP_mul"; - case DW_OP_neg: return "OP_neg"; - case DW_OP_not: return "OP_not"; - case DW_OP_or: return "OP_or"; - case DW_OP_plus: return "OP_plus"; - case DW_OP_plus_uconst: return "OP_plus_uconst"; - case DW_OP_shl: return "OP_shl"; - case DW_OP_shr: return "OP_shr"; - case DW_OP_shra: return "OP_shra"; - case DW_OP_xor: return "OP_xor"; - case DW_OP_skip: return "OP_skip"; - case DW_OP_bra: return "OP_bra"; - case DW_OP_eq: return "OP_eq"; - case DW_OP_ge: return "OP_ge"; - case DW_OP_gt: return "OP_gt"; - case DW_OP_le: return "OP_le"; - case DW_OP_lt: return "OP_lt"; - case DW_OP_ne: return "OP_ne"; - case DW_OP_lit0: return "OP_lit0"; - case DW_OP_lit1: return "OP_lit1"; - case DW_OP_lit31: return "OP_lit31"; - case DW_OP_reg0: return "OP_reg0"; - case DW_OP_reg1: return "OP_reg1"; - case DW_OP_reg31: return "OP_reg31"; - case DW_OP_breg0: return "OP_breg0"; - case DW_OP_breg1: return "OP_breg1"; - case DW_OP_breg31: return "OP_breg31"; - case DW_OP_regx: return "OP_regx"; - case DW_OP_fbreg: return "OP_fbreg"; - case DW_OP_bregx: return "OP_bregx"; - case DW_OP_piece: return "OP_piece"; - case DW_OP_deref_size: return "OP_deref_size"; - case DW_OP_xderef_size: return "OP_xderef_size"; - case DW_OP_nop: return "OP_nop"; - case DW_OP_push_object_address: return "OP_push_object_address"; - case DW_OP_call2: return "OP_call2"; - case DW_OP_call4: return "OP_call4"; - case DW_OP_call_ref: return "OP_call_ref"; - case DW_OP_form_tls_address: return "OP_form_tls_address"; - case DW_OP_call_frame_cfa: return "OP_call_frame_cfa"; - case DW_OP_lo_user: return "OP_lo_user"; - case DW_OP_hi_user: return "OP_hi_user"; + case DW_OP_addr: return "DW_OP_addr"; + case DW_OP_deref: return "DW_OP_deref"; + case DW_OP_const1u: return "DW_OP_const1u"; + case DW_OP_const1s: return "DW_OP_const1s"; + case DW_OP_const2u: return "DW_OP_const2u"; + case DW_OP_const2s: return "DW_OP_const2s"; + case DW_OP_const4u: return "DW_OP_const4u"; + case DW_OP_const4s: return "DW_OP_const4s"; + case DW_OP_const8u: return "DW_OP_const8u"; + case DW_OP_const8s: return "DW_OP_const8s"; + case DW_OP_constu: return "DW_OP_constu"; + case DW_OP_consts: return "DW_OP_consts"; + case DW_OP_dup: return 
"DW_OP_dup"; + case DW_OP_drop: return "DW_OP_drop"; + case DW_OP_over: return "DW_OP_over"; + case DW_OP_pick: return "DW_OP_pick"; + case DW_OP_swap: return "DW_OP_swap"; + case DW_OP_rot: return "DW_OP_rot"; + case DW_OP_xderef: return "DW_OP_xderef"; + case DW_OP_abs: return "DW_OP_abs"; + case DW_OP_and: return "DW_OP_and"; + case DW_OP_div: return "DW_OP_div"; + case DW_OP_minus: return "DW_OP_minus"; + case DW_OP_mod: return "DW_OP_mod"; + case DW_OP_mul: return "DW_OP_mul"; + case DW_OP_neg: return "DW_OP_neg"; + case DW_OP_not: return "DW_OP_not"; + case DW_OP_or: return "DW_OP_or"; + case DW_OP_plus: return "DW_OP_plus"; + case DW_OP_plus_uconst: return "DW_OP_plus_uconst"; + case DW_OP_shl: return "DW_OP_shl"; + case DW_OP_shr: return "DW_OP_shr"; + case DW_OP_shra: return "DW_OP_shra"; + case DW_OP_xor: return "DW_OP_xor"; + case DW_OP_skip: return "DW_OP_skip"; + case DW_OP_bra: return "DW_OP_bra"; + case DW_OP_eq: return "DW_OP_eq"; + case DW_OP_ge: return "DW_OP_ge"; + case DW_OP_gt: return "DW_OP_gt"; + case DW_OP_le: return "DW_OP_le"; + case DW_OP_lt: return "DW_OP_lt"; + case DW_OP_ne: return "DW_OP_ne"; + case DW_OP_lit0: return "DW_OP_lit0"; + case DW_OP_lit1: return "DW_OP_lit1"; + case DW_OP_lit2: return "DW_OP_lit2"; + case DW_OP_lit3: return "DW_OP_lit3"; + case DW_OP_lit4: return "DW_OP_lit4"; + case DW_OP_lit5: return "DW_OP_lit5"; + case DW_OP_lit6: return "DW_OP_lit6"; + case DW_OP_lit7: return "DW_OP_lit7"; + case DW_OP_lit8: return "DW_OP_lit8"; + case DW_OP_lit9: return "DW_OP_lit9"; + case DW_OP_lit10: return "DW_OP_lit10"; + case DW_OP_lit11: return "DW_OP_lit11"; + case DW_OP_lit12: return "DW_OP_lit12"; + case DW_OP_lit13: return "DW_OP_lit13"; + case DW_OP_lit14: return "DW_OP_lit14"; + case DW_OP_lit15: return "DW_OP_lit15"; + case DW_OP_lit16: return "DW_OP_lit16"; + case DW_OP_lit17: return "DW_OP_lit17"; + case DW_OP_lit18: return "DW_OP_lit18"; + case DW_OP_lit19: return "DW_OP_lit19"; + case DW_OP_lit20: return "DW_OP_lit20"; + case DW_OP_lit21: return "DW_OP_lit21"; + case DW_OP_lit22: return "DW_OP_lit22"; + case DW_OP_lit23: return "DW_OP_lit23"; + case DW_OP_lit24: return "DW_OP_lit24"; + case DW_OP_lit25: return "DW_OP_lit25"; + case DW_OP_lit26: return "DW_OP_lit26"; + case DW_OP_lit27: return "DW_OP_lit27"; + case DW_OP_lit28: return "DW_OP_lit28"; + case DW_OP_lit29: return "DW_OP_lit29"; + case DW_OP_lit30: return "DW_OP_lit30"; + case DW_OP_lit31: return "DW_OP_lit31"; + case DW_OP_reg0: return "DW_OP_reg0"; + case DW_OP_reg1: return "DW_OP_reg1"; + case DW_OP_reg2: return "DW_OP_reg2"; + case DW_OP_reg3: return "DW_OP_reg3"; + case DW_OP_reg4: return "DW_OP_reg4"; + case DW_OP_reg5: return "DW_OP_reg5"; + case DW_OP_reg6: return "DW_OP_reg6"; + case DW_OP_reg7: return "DW_OP_reg7"; + case DW_OP_reg8: return "DW_OP_reg8"; + case DW_OP_reg9: return "DW_OP_reg9"; + case DW_OP_reg10: return "DW_OP_reg10"; + case DW_OP_reg11: return "DW_OP_reg11"; + case DW_OP_reg12: return "DW_OP_reg12"; + case DW_OP_reg13: return "DW_OP_reg13"; + case DW_OP_reg14: return "DW_OP_reg14"; + case DW_OP_reg15: return "DW_OP_reg15"; + case DW_OP_reg16: return "DW_OP_reg16"; + case DW_OP_reg17: return "DW_OP_reg17"; + case DW_OP_reg18: return "DW_OP_reg18"; + case DW_OP_reg19: return "DW_OP_reg19"; + case DW_OP_reg20: return "DW_OP_reg20"; + case DW_OP_reg21: return "DW_OP_reg21"; + case DW_OP_reg22: return "DW_OP_reg22"; + case DW_OP_reg23: return "DW_OP_reg23"; + case DW_OP_reg24: return "DW_OP_reg24"; + case DW_OP_reg25: return "DW_OP_reg25"; + case 
DW_OP_reg26: return "DW_OP_reg26"; + case DW_OP_reg27: return "DW_OP_reg27"; + case DW_OP_reg28: return "DW_OP_reg28"; + case DW_OP_reg29: return "DW_OP_reg29"; + case DW_OP_reg30: return "DW_OP_reg30"; + case DW_OP_reg31: return "DW_OP_reg31"; + case DW_OP_breg0: return "DW_OP_breg0"; + case DW_OP_breg1: return "DW_OP_breg1"; + case DW_OP_breg2: return "DW_OP_breg2"; + case DW_OP_breg3: return "DW_OP_breg3"; + case DW_OP_breg4: return "DW_OP_breg4"; + case DW_OP_breg5: return "DW_OP_breg5"; + case DW_OP_breg6: return "DW_OP_breg6"; + case DW_OP_breg7: return "DW_OP_breg7"; + case DW_OP_breg8: return "DW_OP_breg8"; + case DW_OP_breg9: return "DW_OP_breg9"; + case DW_OP_breg10: return "DW_OP_breg10"; + case DW_OP_breg11: return "DW_OP_breg11"; + case DW_OP_breg12: return "DW_OP_breg12"; + case DW_OP_breg13: return "DW_OP_breg13"; + case DW_OP_breg14: return "DW_OP_breg14"; + case DW_OP_breg15: return "DW_OP_breg15"; + case DW_OP_breg16: return "DW_OP_breg16"; + case DW_OP_breg17: return "DW_OP_breg17"; + case DW_OP_breg18: return "DW_OP_breg18"; + case DW_OP_breg19: return "DW_OP_breg19"; + case DW_OP_breg20: return "DW_OP_breg20"; + case DW_OP_breg21: return "DW_OP_breg21"; + case DW_OP_breg22: return "DW_OP_breg22"; + case DW_OP_breg23: return "DW_OP_breg23"; + case DW_OP_breg24: return "DW_OP_breg24"; + case DW_OP_breg25: return "DW_OP_breg25"; + case DW_OP_breg26: return "DW_OP_breg26"; + case DW_OP_breg27: return "DW_OP_breg27"; + case DW_OP_breg28: return "DW_OP_breg28"; + case DW_OP_breg29: return "DW_OP_breg29"; + case DW_OP_breg30: return "DW_OP_breg30"; + case DW_OP_breg31: return "DW_OP_breg31"; + case DW_OP_regx: return "DW_OP_regx"; + case DW_OP_fbreg: return "DW_OP_fbreg"; + case DW_OP_bregx: return "DW_OP_bregx"; + case DW_OP_piece: return "DW_OP_piece"; + case DW_OP_deref_size: return "DW_OP_deref_size"; + case DW_OP_xderef_size: return "DW_OP_xderef_size"; + case DW_OP_nop: return "DW_OP_nop"; + case DW_OP_push_object_address: return "DW_OP_push_object_address"; + case DW_OP_call2: return "DW_OP_call2"; + case DW_OP_call4: return "DW_OP_call4"; + case DW_OP_call_ref: return "DW_OP_call_ref"; + case DW_OP_form_tls_address: return "DW_OP_form_tls_address"; + case DW_OP_call_frame_cfa: return "DW_OP_call_frame_cfa"; + case DW_OP_lo_user: return "DW_OP_lo_user"; + case DW_OP_hi_user: return "DW_OP_hi_user"; } return 0; } @@ -310,23 +397,23 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) { /// encoding. 
const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) { switch (Encoding) { - case DW_ATE_address: return "ATE_address"; - case DW_ATE_boolean: return "ATE_boolean"; - case DW_ATE_complex_float: return "ATE_complex_float"; - case DW_ATE_float: return "ATE_float"; - case DW_ATE_signed: return "ATE_signed"; - case DW_ATE_signed_char: return "ATE_signed_char"; - case DW_ATE_unsigned: return "ATE_unsigned"; - case DW_ATE_unsigned_char: return "ATE_unsigned_char"; - case DW_ATE_imaginary_float: return "ATE_imaginary_float"; - case DW_ATE_packed_decimal: return "ATE_packed_decimal"; - case DW_ATE_numeric_string: return "ATE_numeric_string"; - case DW_ATE_edited: return "ATE_edited"; - case DW_ATE_signed_fixed: return "ATE_signed_fixed"; - case DW_ATE_unsigned_fixed: return "ATE_unsigned_fixed"; - case DW_ATE_decimal_float: return "ATE_decimal_float"; - case DW_ATE_lo_user: return "ATE_lo_user"; - case DW_ATE_hi_user: return "ATE_hi_user"; + case DW_ATE_address: return "DW_ATE_address"; + case DW_ATE_boolean: return "DW_ATE_boolean"; + case DW_ATE_complex_float: return "DW_ATE_complex_float"; + case DW_ATE_float: return "DW_ATE_float"; + case DW_ATE_signed: return "DW_ATE_signed"; + case DW_ATE_signed_char: return "DW_ATE_signed_char"; + case DW_ATE_unsigned: return "DW_ATE_unsigned"; + case DW_ATE_unsigned_char: return "DW_ATE_unsigned_char"; + case DW_ATE_imaginary_float: return "DW_ATE_imaginary_float"; + case DW_ATE_packed_decimal: return "DW_ATE_packed_decimal"; + case DW_ATE_numeric_string: return "DW_ATE_numeric_string"; + case DW_ATE_edited: return "DW_ATE_edited"; + case DW_ATE_signed_fixed: return "DW_ATE_signed_fixed"; + case DW_ATE_unsigned_fixed: return "DW_ATE_unsigned_fixed"; + case DW_ATE_decimal_float: return "DW_ATE_decimal_float"; + case DW_ATE_lo_user: return "DW_ATE_lo_user"; + case DW_ATE_hi_user: return "DW_ATE_hi_user"; } return 0; } @@ -335,11 +422,11 @@ const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) { /// attribute. 
const char *llvm::dwarf::DecimalSignString(unsigned Sign) { switch (Sign) { - case DW_DS_unsigned: return "DS_unsigned"; - case DW_DS_leading_overpunch: return "DS_leading_overpunch"; - case DW_DS_trailing_overpunch: return "DS_trailing_overpunch"; - case DW_DS_leading_separate: return "DS_leading_separate"; - case DW_DS_trailing_separate: return "DS_trailing_separate"; + case DW_DS_unsigned: return "DW_DS_unsigned"; + case DW_DS_leading_overpunch: return "DW_DS_leading_overpunch"; + case DW_DS_trailing_overpunch: return "DW_DS_trailing_overpunch"; + case DW_DS_leading_separate: return "DW_DS_leading_separate"; + case DW_DS_trailing_separate: return "DW_DS_trailing_separate"; } return 0; } @@ -348,11 +435,11 @@ const char *llvm::dwarf::DecimalSignString(unsigned Sign) { /// const char *llvm::dwarf::EndianityString(unsigned Endian) { switch (Endian) { - case DW_END_default: return "END_default"; - case DW_END_big: return "END_big"; - case DW_END_little: return "END_little"; - case DW_END_lo_user: return "END_lo_user"; - case DW_END_hi_user: return "END_hi_user"; + case DW_END_default: return "DW_END_default"; + case DW_END_big: return "DW_END_big"; + case DW_END_little: return "DW_END_little"; + case DW_END_lo_user: return "DW_END_lo_user"; + case DW_END_hi_user: return "DW_END_hi_user"; } return 0; } @@ -362,9 +449,9 @@ const char *llvm::dwarf::EndianityString(unsigned Endian) { const char *llvm::dwarf::AccessibilityString(unsigned Access) { switch (Access) { // Accessibility codes - case DW_ACCESS_public: return "ACCESS_public"; - case DW_ACCESS_protected: return "ACCESS_protected"; - case DW_ACCESS_private: return "ACCESS_private"; + case DW_ACCESS_public: return "DW_ACCESS_public"; + case DW_ACCESS_protected: return "DW_ACCESS_protected"; + case DW_ACCESS_private: return "DW_ACCESS_private"; } return 0; } @@ -373,9 +460,9 @@ const char *llvm::dwarf::AccessibilityString(unsigned Access) { /// const char *llvm::dwarf::VisibilityString(unsigned Visibility) { switch (Visibility) { - case DW_VIS_local: return "VIS_local"; - case DW_VIS_exported: return "VIS_exported"; - case DW_VIS_qualified: return "VIS_qualified"; + case DW_VIS_local: return "DW_VIS_local"; + case DW_VIS_exported: return "DW_VIS_exported"; + case DW_VIS_qualified: return "DW_VIS_qualified"; } return 0; } @@ -384,9 +471,9 @@ const char *llvm::dwarf::VisibilityString(unsigned Visibility) { /// const char *llvm::dwarf::VirtualityString(unsigned Virtuality) { switch (Virtuality) { - case DW_VIRTUALITY_none: return "VIRTUALITY_none"; - case DW_VIRTUALITY_virtual: return "VIRTUALITY_virtual"; - case DW_VIRTUALITY_pure_virtual: return "VIRTUALITY_pure_virtual"; + case DW_VIRTUALITY_none: return "DW_VIRTUALITY_none"; + case DW_VIRTUALITY_virtual: return "DW_VIRTUALITY_virtual"; + case DW_VIRTUALITY_pure_virtual: return "DW_VIRTUALITY_pure_virtual"; } return 0; } @@ -395,27 +482,27 @@ const char *llvm::dwarf::VirtualityString(unsigned Virtuality) { /// const char *llvm::dwarf::LanguageString(unsigned Language) { switch (Language) { - case DW_LANG_C89: return "LANG_C89"; - case DW_LANG_C: return "LANG_C"; - case DW_LANG_Ada83: return "LANG_Ada83"; - case DW_LANG_C_plus_plus: return "LANG_C_plus_plus"; - case DW_LANG_Cobol74: return "LANG_Cobol74"; - case DW_LANG_Cobol85: return "LANG_Cobol85"; - case DW_LANG_Fortran77: return "LANG_Fortran77"; - case DW_LANG_Fortran90: return "LANG_Fortran90"; - case DW_LANG_Pascal83: return "LANG_Pascal83"; - case DW_LANG_Modula2: return "LANG_Modula2"; - case DW_LANG_Java: return "LANG_Java"; - 
case DW_LANG_C99: return "LANG_C99"; - case DW_LANG_Ada95: return "LANG_Ada95"; - case DW_LANG_Fortran95: return "LANG_Fortran95"; - case DW_LANG_PLI: return "LANG_PLI"; - case DW_LANG_ObjC: return "LANG_ObjC"; - case DW_LANG_ObjC_plus_plus: return "LANG_ObjC_plus_plus"; - case DW_LANG_UPC: return "LANG_UPC"; - case DW_LANG_D: return "LANG_D"; - case DW_LANG_lo_user: return "LANG_lo_user"; - case DW_LANG_hi_user: return "LANG_hi_user"; + case DW_LANG_C89: return "DW_LANG_C89"; + case DW_LANG_C: return "DW_LANG_C"; + case DW_LANG_Ada83: return "DW_LANG_Ada83"; + case DW_LANG_C_plus_plus: return "DW_LANG_C_plus_plus"; + case DW_LANG_Cobol74: return "DW_LANG_Cobol74"; + case DW_LANG_Cobol85: return "DW_LANG_Cobol85"; + case DW_LANG_Fortran77: return "DW_LANG_Fortran77"; + case DW_LANG_Fortran90: return "DW_LANG_Fortran90"; + case DW_LANG_Pascal83: return "DW_LANG_Pascal83"; + case DW_LANG_Modula2: return "DW_LANG_Modula2"; + case DW_LANG_Java: return "DW_LANG_Java"; + case DW_LANG_C99: return "DW_LANG_C99"; + case DW_LANG_Ada95: return "DW_LANG_Ada95"; + case DW_LANG_Fortran95: return "DW_LANG_Fortran95"; + case DW_LANG_PLI: return "DW_LANG_PLI"; + case DW_LANG_ObjC: return "DW_LANG_ObjC"; + case DW_LANG_ObjC_plus_plus: return "DW_LANG_ObjC_plus_plus"; + case DW_LANG_UPC: return "DW_LANG_UPC"; + case DW_LANG_D: return "DW_LANG_D"; + case DW_LANG_lo_user: return "DW_LANG_lo_user"; + case DW_LANG_hi_user: return "DW_LANG_hi_user"; } return 0; } @@ -424,10 +511,10 @@ const char *llvm::dwarf::LanguageString(unsigned Language) { /// const char *llvm::dwarf::CaseString(unsigned Case) { switch (Case) { - case DW_ID_case_sensitive: return "ID_case_sensitive"; - case DW_ID_up_case: return "ID_up_case"; - case DW_ID_down_case: return "ID_down_case"; - case DW_ID_case_insensitive: return "ID_case_insensitive"; + case DW_ID_case_sensitive: return "DW_ID_case_sensitive"; + case DW_ID_up_case: return "DW_ID_up_case"; + case DW_ID_down_case: return "DW_ID_down_case"; + case DW_ID_case_insensitive: return "DW_ID_case_insensitive"; } return 0; } @@ -436,11 +523,11 @@ const char *llvm::dwarf::CaseString(unsigned Case) { /// const char *llvm::dwarf::ConventionString(unsigned Convention) { switch (Convention) { - case DW_CC_normal: return "CC_normal"; - case DW_CC_program: return "CC_program"; - case DW_CC_nocall: return "CC_nocall"; - case DW_CC_lo_user: return "CC_lo_user"; - case DW_CC_hi_user: return "CC_hi_user"; + case DW_CC_normal: return "DW_CC_normal"; + case DW_CC_program: return "DW_CC_program"; + case DW_CC_nocall: return "DW_CC_nocall"; + case DW_CC_lo_user: return "DW_CC_lo_user"; + case DW_CC_hi_user: return "DW_CC_hi_user"; } return 0; } @@ -449,10 +536,10 @@ const char *llvm::dwarf::ConventionString(unsigned Convention) { /// const char *llvm::dwarf::InlineCodeString(unsigned Code) { switch (Code) { - case DW_INL_not_inlined: return "INL_not_inlined"; - case DW_INL_inlined: return "INL_inlined"; - case DW_INL_declared_not_inlined: return "INL_declared_not_inlined"; - case DW_INL_declared_inlined: return "INL_declared_inlined"; + case DW_INL_not_inlined: return "DW_INL_not_inlined"; + case DW_INL_inlined: return "DW_INL_inlined"; + case DW_INL_declared_not_inlined: return "DW_INL_declared_not_inlined"; + case DW_INL_declared_inlined: return "DW_INL_declared_inlined"; } return 0; } @@ -461,8 +548,8 @@ const char *llvm::dwarf::InlineCodeString(unsigned Code) { /// const char *llvm::dwarf::ArrayOrderString(unsigned Order) { switch (Order) { - case DW_ORD_row_major: return "ORD_row_major"; - case 
DW_ORD_col_major: return "ORD_col_major"; + case DW_ORD_row_major: return "DW_ORD_row_major"; + case DW_ORD_col_major: return "DW_ORD_col_major"; } return 0; } @@ -471,8 +558,8 @@ const char *llvm::dwarf::ArrayOrderString(unsigned Order) { /// descriptor. const char *llvm::dwarf::DiscriminantString(unsigned Discriminant) { switch (Discriminant) { - case DW_DSC_label: return "DSC_label"; - case DW_DSC_range: return "DSC_range"; + case DW_DSC_label: return "DW_DSC_label"; + case DW_DSC_range: return "DW_DSC_range"; } return 0; } @@ -481,18 +568,18 @@ const char *llvm::dwarf::DiscriminantString(unsigned Discriminant) { /// const char *llvm::dwarf::LNStandardString(unsigned Standard) { switch (Standard) { - case DW_LNS_copy: return "LNS_copy"; - case DW_LNS_advance_pc: return "LNS_advance_pc"; - case DW_LNS_advance_line: return "LNS_advance_line"; - case DW_LNS_set_file: return "LNS_set_file"; - case DW_LNS_set_column: return "LNS_set_column"; - case DW_LNS_negate_stmt: return "LNS_negate_stmt"; - case DW_LNS_set_basic_block: return "LNS_set_basic_block"; - case DW_LNS_const_add_pc: return "LNS_const_add_pc"; - case DW_LNS_fixed_advance_pc: return "LNS_fixed_advance_pc"; - case DW_LNS_set_prologue_end: return "LNS_set_prologue_end"; - case DW_LNS_set_epilogue_begin: return "LNS_set_epilogue_begin"; - case DW_LNS_set_isa: return "LNS_set_isa"; + case DW_LNS_copy: return "DW_LNS_copy"; + case DW_LNS_advance_pc: return "DW_LNS_advance_pc"; + case DW_LNS_advance_line: return "DW_LNS_advance_line"; + case DW_LNS_set_file: return "DW_LNS_set_file"; + case DW_LNS_set_column: return "DW_LNS_set_column"; + case DW_LNS_negate_stmt: return "DW_LNS_negate_stmt"; + case DW_LNS_set_basic_block: return "DW_LNS_set_basic_block"; + case DW_LNS_const_add_pc: return "DW_LNS_const_add_pc"; + case DW_LNS_fixed_advance_pc: return "DW_LNS_fixed_advance_pc"; + case DW_LNS_set_prologue_end: return "DW_LNS_set_prologue_end"; + case DW_LNS_set_epilogue_begin: return "DW_LNS_set_epilogue_begin"; + case DW_LNS_set_isa: return "DW_LNS_set_isa"; } return 0; } @@ -502,11 +589,11 @@ const char *llvm::dwarf::LNStandardString(unsigned Standard) { const char *llvm::dwarf::LNExtendedString(unsigned Encoding) { switch (Encoding) { // Line Number Extended Opcode Encodings - case DW_LNE_end_sequence: return "LNE_end_sequence"; - case DW_LNE_set_address: return "LNE_set_address"; - case DW_LNE_define_file: return "LNE_define_file"; - case DW_LNE_lo_user: return "LNE_lo_user"; - case DW_LNE_hi_user: return "LNE_hi_user"; + case DW_LNE_end_sequence: return "DW_LNE_end_sequence"; + case DW_LNE_set_address: return "DW_LNE_set_address"; + case DW_LNE_define_file: return "DW_LNE_define_file"; + case DW_LNE_lo_user: return "DW_LNE_lo_user"; + case DW_LNE_hi_user: return "DW_LNE_hi_user"; } return 0; } @@ -516,11 +603,11 @@ const char *llvm::dwarf::LNExtendedString(unsigned Encoding) { const char *llvm::dwarf::MacinfoString(unsigned Encoding) { switch (Encoding) { // Macinfo Type Encodings - case DW_MACINFO_define: return "MACINFO_define"; - case DW_MACINFO_undef: return "MACINFO_undef"; - case DW_MACINFO_start_file: return "MACINFO_start_file"; - case DW_MACINFO_end_file: return "MACINFO_end_file"; - case DW_MACINFO_vendor_ext: return "MACINFO_vendor_ext"; + case DW_MACINFO_define: return "DW_MACINFO_define"; + case DW_MACINFO_undef: return "DW_MACINFO_undef"; + case DW_MACINFO_start_file: return "DW_MACINFO_start_file"; + case DW_MACINFO_end_file: return "DW_MACINFO_end_file"; + case DW_MACINFO_vendor_ext: return "DW_MACINFO_vendor_ext"; } 
return 0; } @@ -529,33 +616,33 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { /// encodings. const char *llvm::dwarf::CallFrameString(unsigned Encoding) { switch (Encoding) { - case DW_CFA_advance_loc: return "CFA_advance_loc"; - case DW_CFA_offset: return "CFA_offset"; - case DW_CFA_restore: return "CFA_restore"; - case DW_CFA_set_loc: return "CFA_set_loc"; - case DW_CFA_advance_loc1: return "CFA_advance_loc1"; - case DW_CFA_advance_loc2: return "CFA_advance_loc2"; - case DW_CFA_advance_loc4: return "CFA_advance_loc4"; - case DW_CFA_offset_extended: return "CFA_offset_extended"; - case DW_CFA_restore_extended: return "CFA_restore_extended"; - case DW_CFA_undefined: return "CFA_undefined"; - case DW_CFA_same_value: return "CFA_same_value"; - case DW_CFA_register: return "CFA_register"; - case DW_CFA_remember_state: return "CFA_remember_state"; - case DW_CFA_restore_state: return "CFA_restore_state"; - case DW_CFA_def_cfa: return "CFA_def_cfa"; - case DW_CFA_def_cfa_register: return "CFA_def_cfa_register"; - case DW_CFA_def_cfa_offset: return "CFA_def_cfa_offset"; - case DW_CFA_def_cfa_expression: return "CFA_def_cfa_expression"; - case DW_CFA_expression: return "CFA_expression"; - case DW_CFA_offset_extended_sf: return "CFA_offset_extended_sf"; - case DW_CFA_def_cfa_sf: return "CFA_def_cfa_sf"; - case DW_CFA_def_cfa_offset_sf: return "CFA_def_cfa_offset_sf"; - case DW_CFA_val_offset: return "CFA_val_offset"; - case DW_CFA_val_offset_sf: return "CFA_val_offset_sf"; - case DW_CFA_val_expression: return "CFA_val_expression"; - case DW_CFA_lo_user: return "CFA_lo_user"; - case DW_CFA_hi_user: return "CFA_hi_user"; + case DW_CFA_advance_loc: return "DW_CFA_advance_loc"; + case DW_CFA_offset: return "DW_CFA_offset"; + case DW_CFA_restore: return "DW_CFA_restore"; + case DW_CFA_set_loc: return "DW_CFA_set_loc"; + case DW_CFA_advance_loc1: return "DW_CFA_advance_loc1"; + case DW_CFA_advance_loc2: return "DW_CFA_advance_loc2"; + case DW_CFA_advance_loc4: return "DW_CFA_advance_loc4"; + case DW_CFA_offset_extended: return "DW_CFA_offset_extended"; + case DW_CFA_restore_extended: return "DW_CFA_restore_extended"; + case DW_CFA_undefined: return "DW_CFA_undefined"; + case DW_CFA_same_value: return "DW_CFA_same_value"; + case DW_CFA_register: return "DW_CFA_register"; + case DW_CFA_remember_state: return "DW_CFA_remember_state"; + case DW_CFA_restore_state: return "DW_CFA_restore_state"; + case DW_CFA_def_cfa: return "DW_CFA_def_cfa"; + case DW_CFA_def_cfa_register: return "DW_CFA_def_cfa_register"; + case DW_CFA_def_cfa_offset: return "DW_CFA_def_cfa_offset"; + case DW_CFA_def_cfa_expression: return "DW_CFA_def_cfa_expression"; + case DW_CFA_expression: return "DW_CFA_expression"; + case DW_CFA_offset_extended_sf: return "DW_CFA_offset_extended_sf"; + case DW_CFA_def_cfa_sf: return "DW_CFA_def_cfa_sf"; + case DW_CFA_def_cfa_offset_sf: return "DW_CFA_def_cfa_offset_sf"; + case DW_CFA_val_offset: return "DW_CFA_val_offset"; + case DW_CFA_val_offset_sf: return "DW_CFA_val_offset_sf"; + case DW_CFA_val_expression: return "DW_CFA_val_expression"; + case DW_CFA_lo_user: return "DW_CFA_lo_user"; + case DW_CFA_hi_user: return "DW_CFA_hi_user"; } return 0; } diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index 095395f..1bde2fe 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -51,7 +51,15 @@ static const char *BackupNumber(const char *Pos, const char *FirstChar) { if (!isNumberChar(*Pos)) return Pos; // Otherwise, return to the start 
of the number. + bool HasPeriod = false; while (Pos > FirstChar && isNumberChar(Pos[-1])) { + // Back up over at most one period. + if (Pos[-1] == '.') { + if (HasPeriod) + break; + HasPeriod = true; + } + --Pos; if (Pos > FirstChar && isSignedChar(Pos[0]) && !isExponentChar(Pos[-1])) break; @@ -204,16 +212,16 @@ int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA, const char *F1P = File1Start; const char *F2P = File2Start; - if (A_size == B_size) { - // Are the buffers identical? Common case: Handle this efficiently. - if (std::memcmp(File1Start, File2Start, A_size) == 0) - return 0; + // Are the buffers identical? Common case: Handle this efficiently. + if (A_size == B_size && + std::memcmp(File1Start, File2Start, A_size) == 0) + return 0; - if (AbsTol == 0 && RelTol == 0) { - if (Error) - *Error = "Files differ without tolerance allowance"; - return 1; // Files different! - } + // Otherwise, we are done if no tolerances are set. + if (AbsTol == 0 && RelTol == 0) { + if (Error) + *Error = "Files differ without tolerance allowance"; + return 1; // Files different! } bool CompareFailed = false; diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp index 3f467fe..b8dca33 100644 --- a/lib/Support/FoldingSet.cpp +++ b/lib/Support/FoldingSet.cpp @@ -175,6 +175,14 @@ static void **GetBucketFor(const FoldingSetNodeID &ID, return Buckets + BucketNum; } +/// AllocateBuckets - Allocate initialized bucket memory. +static void **AllocateBuckets(unsigned NumBuckets) { + void **Buckets = static_cast<void**>(calloc(NumBuckets+1, sizeof(void*))); + // Set the very last bucket to be a non-null "pointer". + Buckets[NumBuckets] = reinterpret_cast<void*>(-1); + return Buckets; +} + //===----------------------------------------------------------------------===// // FoldingSetImpl Implementation @@ -182,11 +190,11 @@ FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) { assert(5 < Log2InitSize && Log2InitSize < 32 && "Initial hash table size out of range"); NumBuckets = 1 << Log2InitSize; - Buckets = new void*[NumBuckets+1]; - clear(); + Buckets = AllocateBuckets(NumBuckets); + NumNodes = 0; } FoldingSetImpl::~FoldingSetImpl() { - delete [] Buckets; + free(Buckets); } void FoldingSetImpl::clear() { // Set all but the last bucket to null pointers. @@ -207,8 +215,8 @@ void FoldingSetImpl::GrowHashTable() { NumBuckets <<= 1; // Clear out new buckets. - Buckets = new void*[NumBuckets+1]; - clear(); + Buckets = AllocateBuckets(NumBuckets); + NumNodes = 0; // Walk the old buckets, rehashing nodes into their new place. FoldingSetNodeID ID; @@ -227,7 +235,7 @@ void FoldingSetImpl::GrowHashTable() { } } - delete[] OldBuckets; + free(OldBuckets); } /// FindNodeOrInsertPos - Look up the node specified by ID. If it exists, diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 2b95089..542162d 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/MathExtras.h" #include "llvm/System/Errno.h" #include "llvm/System/Path.h" #include "llvm/System/Process.h" @@ -37,22 +38,7 @@ using namespace llvm; // MemoryBuffer implementation itself. //===----------------------------------------------------------------------===// -MemoryBuffer::~MemoryBuffer() { - if (MustDeleteBuffer) - free((void*)BufferStart); -} - -/// initCopyOf - Initialize this source buffer with a copy of the specified -/// memory range.
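The AllocateBuckets helper above relies on two properties: calloc zero-fills, so every bucket starts out empty (null), and the extra slot at index NumBuckets holds a non-null sentinel, so a scan can find the end of the table without carrying a length. A standalone sketch of the scan that the sentinel enables (not the real iterator code):

// Advance to the next non-empty bucket. Empty buckets are null thanks to
// calloc; the reinterpret_cast<void*>(-1) sentinel at the end guarantees
// termination, so no bounds check or element count is needed.
static void **NextNonEmptyBucket(void **Bucket) {
  while (*Bucket == 0)
    ++Bucket;
  return Bucket;  // a real node chain, or the end-of-table sentinel
}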
We make the copy so that we can null terminate it -/// successfully. -void MemoryBuffer::initCopyOf(const char *BufStart, const char *BufEnd) { - size_t Size = BufEnd-BufStart; - BufferStart = (char *)malloc(Size+1); - BufferEnd = BufferStart+Size; - memcpy(const_cast<char*>(BufferStart), BufStart, Size); - *const_cast<char*>(BufferEnd) = 0; // Null terminate buffer. - MustDeleteBuffer = true; -} +MemoryBuffer::~MemoryBuffer() { } /// init - Initialize this MemoryBuffer as a reference to externally allocated /// memory, memory that we know is already null terminated. @@ -60,27 +46,38 @@ void MemoryBuffer::init(const char *BufStart, const char *BufEnd) { assert(BufEnd[0] == 0 && "Buffer is not null terminated!"); BufferStart = BufStart; BufferEnd = BufEnd; - MustDeleteBuffer = false; } //===----------------------------------------------------------------------===// // MemoryBufferMem implementation. //===----------------------------------------------------------------------===// +/// CopyStringRef - Copies contents of a StringRef into a block of memory and +/// null-terminates it. +static void CopyStringRef(char *Memory, StringRef Data) { + memcpy(Memory, Data.data(), Data.size()); + Memory[Data.size()] = 0; // Null terminate string. +} + +/// GetNamedBuffer - Allocates a new MemoryBuffer with Name copied after it. +template <typename T> +static T* GetNamedBuffer(StringRef Buffer, StringRef Name) { + char *Mem = static_cast<char*>(operator new(sizeof(T) + Name.size() + 1)); + CopyStringRef(Mem + sizeof(T), Name); + return new (Mem) T(Buffer); +} + namespace { +/// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory. class MemoryBufferMem : public MemoryBuffer { - std::string FileID; public: - MemoryBufferMem(StringRef InputData, StringRef FID, bool Copy = false) - : FileID(FID) { - if (!Copy) - init(InputData.data(), InputData.data()+InputData.size()); - else - initCopyOf(InputData.data(), InputData.data()+InputData.size()); + MemoryBufferMem(StringRef InputData) { + init(InputData.begin(), InputData.end()); } - + virtual const char *getBufferIdentifier() const { - return FileID.c_str(); + // The name is stored after the class itself. + return reinterpret_cast<const char*>(this + 1); } }; } @@ -88,42 +85,55 @@ public: /// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note /// that EndPtr[0] must be a null byte and be accessible! MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData, - const char *BufferName) { - return new MemoryBufferMem(InputData, BufferName); + StringRef BufferName) { + return GetNamedBuffer<MemoryBufferMem>(InputData, BufferName); } /// getMemBufferCopy - Open the specified memory range as a MemoryBuffer, /// copying the contents and taking ownership of it. This has no requirements /// on EndPtr[0]. MemoryBuffer *MemoryBuffer::getMemBufferCopy(StringRef InputData, - const char *BufferName) { - return new MemoryBufferMem(InputData, BufferName, true); + StringRef BufferName) { + MemoryBuffer *Buf = getNewUninitMemBuffer(InputData.size(), BufferName); + if (!Buf) return 0; + memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(), + InputData.size()); + return Buf; } /// getNewUninitMemBuffer - Allocate a new MemoryBuffer of the specified size -/// that is completely initialized to zeros. Note that the caller should -/// initialize the memory allocated by this method. The memory is owned by -/// the MemoryBuffer object. +/// that is not initialized. 
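GetNamedBuffer above replaces the old std::string FileID member with a single allocation holding the object and its NUL-terminated identifier back to back, which is why getBufferIdentifier can simply return reinterpret_cast<const char*>(this + 1). The same trick in isolation, with a hypothetical NamedBlob type:

#include <cstring>
#include <new>

struct NamedBlob {
  // The identifier lives in the same allocation, directly after the object.
  const char *name() const { return reinterpret_cast<const char *>(this + 1); }
};

static NamedBlob *CreateNamedBlob(const char *Name) {
  std::size_t Len = std::strlen(Name);
  char *Mem = static_cast<char *>(operator new(sizeof(NamedBlob) + Len + 1));
  std::memcpy(Mem + sizeof(NamedBlob), Name, Len + 1);  // copy incl. the NUL
  return new (Mem) NamedBlob();  // placement-new the object at the front
}
// Destroy with: Blob->~NamedBlob(); operator delete(Blob);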
Note that the caller should initialize the +/// memory allocated by this method. The memory is owned by the MemoryBuffer +/// object. MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size, StringRef BufferName) { - char *Buf = (char *)malloc(Size+1); - if (!Buf) return 0; - Buf[Size] = 0; - MemoryBufferMem *SB = new MemoryBufferMem(StringRef(Buf, Size), BufferName); - // The memory for this buffer is owned by the MemoryBuffer. - SB->MustDeleteBuffer = true; - return SB; + // Allocate space for the MemoryBuffer, the data and the name. It is important + // that MemoryBuffer and data are aligned so PointerIntPair works with them. + size_t AlignedStringLen = + RoundUpToAlignment(sizeof(MemoryBufferMem) + BufferName.size() + 1, + sizeof(void*)); // TODO: Is sizeof(void*) enough? + size_t RealLen = AlignedStringLen + Size + 1; + char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow)); + if (!Mem) return 0; + + // The name is stored after the class itself. + CopyStringRef(Mem + sizeof(MemoryBufferMem), BufferName); + + // The buffer begins after the name and must be aligned. + char *Buf = Mem + AlignedStringLen; + Buf[Size] = 0; // Null terminate buffer. + + return new (Mem) MemoryBufferMem(StringRef(Buf, Size)); } /// getNewMemBuffer - Allocate a new MemoryBuffer of the specified size that /// is completely initialized to zeros. Note that the caller should /// initialize the memory allocated by this method. The memory is owned by /// the MemoryBuffer object. -MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, - const char *BufferName) { +MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { MemoryBuffer *SB = getNewUninitMemBuffer(Size, BufferName); if (!SB) return 0; - memset(const_cast<char*>(SB->getBufferStart()), 0, Size+1); + memset(const_cast<char*>(SB->getBufferStart()), 0, Size); return SB; } @@ -137,7 +147,16 @@ MemoryBuffer *MemoryBuffer::getFileOrSTDIN(StringRef Filename, int64_t FileSize, struct stat *FileInfo) { if (Filename == "-") - return getSTDIN(); + return getSTDIN(ErrStr); + return getFile(Filename, ErrStr, FileSize, FileInfo); +} + +MemoryBuffer *MemoryBuffer::getFileOrSTDIN(const char *Filename, + std::string *ErrStr, + int64_t FileSize, + struct stat *FileInfo) { + if (strcmp(Filename, "-") == 0) + return getSTDIN(ErrStr); return getFile(Filename, ErrStr, FileSize, FileInfo); } @@ -149,18 +168,11 @@ namespace { /// MemoryBufferMMapFile - This represents a file that was mapped in with the /// sys::Path::MapInFilePages method. When destroyed, it calls the /// sys::Path::UnMapFilePages method. 
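getNewUninitMemBuffer above extends that layout to three regions in one allocation: object, name, then data, with the data offset rounded up to pointer alignment via RoundUpToAlignment (the in-code TODO notes the alignment choice may need revisiting). The offset arithmetic, reduced to a standalone sketch:

#include <cstddef>

// Layout: [ MemoryBufferMem | name '\0' | padding | data ... '\0' ]
// Returns the offset at which the data begins, rounded up to pointer
// alignment; Align must be a power of two for the mask trick to work.
static std::size_t DataOffset(std::size_t HeaderSize, std::size_t NameLen) {
  const std::size_t Align = sizeof(void *);
  std::size_t Raw = HeaderSize + NameLen + 1;   // header + name + NUL
  return (Raw + Align - 1) & ~(Align - 1);      // round up to Align
}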
-class MemoryBufferMMapFile : public MemoryBuffer { - std::string Filename; +class MemoryBufferMMapFile : public MemoryBufferMem { public: - MemoryBufferMMapFile(StringRef filename, const char *Pages, uint64_t Size) - : Filename(filename) { - init(Pages, Pages+Size); - } - - virtual const char *getBufferIdentifier() const { - return Filename.c_str(); - } - + MemoryBufferMMapFile(StringRef Buffer) + : MemoryBufferMem(Buffer) { } + ~MemoryBufferMMapFile() { sys::Path::UnMapFilePages(getBufferStart(), getBufferSize()); } @@ -170,19 +182,24 @@ public: class FileCloser { int FD; public: - FileCloser(int FD) : FD(FD) {} + explicit FileCloser(int FD) : FD(FD) {} ~FileCloser() { ::close(FD); } }; } MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, int64_t FileSize, struct stat *FileInfo) { - int OpenFlags = 0; + SmallString<256> PathBuf(Filename.begin(), Filename.end()); + return MemoryBuffer::getFile(PathBuf.c_str(), ErrStr, FileSize, FileInfo); +} + +MemoryBuffer *MemoryBuffer::getFile(const char *Filename, std::string *ErrStr, + int64_t FileSize, struct stat *FileInfo) { + int OpenFlags = O_RDONLY; #ifdef O_BINARY OpenFlags |= O_BINARY; // Open input file in binary mode on win32. #endif - SmallString<256> PathBuf(Filename.begin(), Filename.end()); - int FD = ::open(PathBuf.c_str(), O_RDONLY|OpenFlags); + int FD = ::open(Filename, OpenFlags); if (FD == -1) { if (ErrStr) *ErrStr = sys::StrError(); return 0; @@ -213,8 +230,8 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, if (FileSize >= 4096*4 && (FileSize & (sys::Process::GetPageSize()-1)) != 0) { if (const char *Pages = sys::Path::MapInFilePages(FD, FileSize)) { - // Close the file descriptor, now that the whole file is in memory. - return new MemoryBufferMMapFile(Filename, Pages, FileSize); + return GetNamedBuffer<MemoryBufferMMapFile>(StringRef(Pages, FileSize), + Filename); } } @@ -254,34 +271,27 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr, // MemoryBuffer::getSTDIN implementation. //===----------------------------------------------------------------------===// -namespace { -class STDINBufferFile : public MemoryBuffer { -public: - virtual const char *getBufferIdentifier() const { - return "<stdin>"; - } -}; -} - -MemoryBuffer *MemoryBuffer::getSTDIN() { - char Buffer[4096*4]; - - std::vector<char> FileData; - +MemoryBuffer *MemoryBuffer::getSTDIN(std::string *ErrStr) { // Read in all of the data from stdin, we cannot mmap stdin. // // FIXME: That isn't necessarily true, we should try to mmap stdin and // fallback if it fails. sys::Program::ChangeStdinToBinary(); - size_t ReadBytes; + + const ssize_t ChunkSize = 4096*4; + SmallString<ChunkSize> Buffer; + ssize_t ReadBytes; + // Read into Buffer until we hit EOF. do { - ReadBytes = fread(Buffer, sizeof(char), sizeof(Buffer), stdin); - FileData.insert(FileData.end(), Buffer, Buffer+ReadBytes); - } while (ReadBytes == sizeof(Buffer)); - - FileData.push_back(0); // &FileData[Size] is invalid. So is &*FileData.end(). 
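The getFile path above maps a file only when it is at least 16 KiB and its size is not an exact multiple of the page size; the second condition appears to be what keeps the mapped buffer null-terminated for free, since the zero fill past EOF on the final page supplies the terminator. The decision as a standalone predicate:

#include <stdint.h>

// Mirrors the heuristic above: large enough for mmap to pay off, and not
// an exact number of pages, so the tail of the last mapped page is zero
// and acts as the required trailing NUL. PageSize must be a power of two.
static bool ShouldMapFile(uint64_t FileSize, uint64_t PageSize) {
  return FileSize >= 4096 * 4 && (FileSize & (PageSize - 1)) != 0;
}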
- size_t Size = FileData.size(); - MemoryBuffer *B = new STDINBufferFile(); - B->initCopyOf(&FileData[0], &FileData[Size-1]); - return B; + Buffer.reserve(Buffer.size() + ChunkSize); + ReadBytes = read(0, Buffer.end(), ChunkSize); + if (ReadBytes == -1) { + if (errno == EINTR) continue; + if (ErrStr) *ErrStr = sys::StrError(); + return 0; + } + Buffer.set_size(Buffer.size() + ReadBytes); + } while (ReadBytes != 0); + + return getMemBufferCopy(Buffer, "<stdin>"); } diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp index 7a04a53..a99ab2f 100644 --- a/lib/Support/PrettyStackTrace.cpp +++ b/lib/Support/PrettyStackTrace.cpp @@ -12,11 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Config/config.h" // Get autoconf configuration settings #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/Signals.h" #include "llvm/System/ThreadLocal.h" #include "llvm/ADT/SmallString.h" + +#ifdef HAVE_CRASHREPORTERCLIENT_H +#include <CrashReporterClient.h> +#endif + using namespace llvm; namespace llvm { @@ -48,8 +54,17 @@ static void PrintCurStackTrace(raw_ostream &OS) { OS.flush(); } -// Integrate with crash reporter. -#ifdef __APPLE__ +// Integrate with crash reporter libraries. +#if defined (__APPLE__) && defined (HAVE_CRASHREPORTERCLIENT_H) +// If any clients of llvm try to link to libCrashReporterClient.a themselves, +// only one crash info struct will be used. +extern "C" { +CRASH_REPORTER_CLIENT_HIDDEN +struct crashreporter_annotations_t gCRAnnotations + __attribute__((section("__DATA," CRASHREPORTER_ANNOTATIONS_SECTION))) + = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0 }; +} +#elif defined (__APPLE__) static const char *__crashreporter_info__ = 0; asm(".desc ___crashreporter_info__, 0x10"); #endif @@ -71,7 +86,11 @@ static void CrashHandler(void *Cookie) { } if (!TmpStr.empty()) { +#ifndef HAVE_CRASHREPORTERCLIENT_H __crashreporter_info__ = strdup(std::string(TmpStr.str()).c_str()); +#else + CRSetCrashLogMessage(std::string(TmpStr.str()).c_str()); +#endif errs() << TmpStr.str(); } diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp index 68938fa..504e649 100644 --- a/lib/Support/SmallPtrSet.cpp +++ b/lib/Support/SmallPtrSet.cpp @@ -166,10 +166,13 @@ void SmallPtrSetImpl::Grow() { } } -SmallPtrSetImpl::SmallPtrSetImpl(const SmallPtrSetImpl& that) { +SmallPtrSetImpl::SmallPtrSetImpl(const void **SmallStorage, + const SmallPtrSetImpl& that) { + SmallArray = SmallStorage; + // If we're becoming small, prepare to insert into our stack space if (that.isSmall()) { - CurArray = &SmallArray[0]; + CurArray = SmallArray; // Otherwise, allocate new heap space (unless we were the same size) } else { CurArray = (const void**)malloc(sizeof(void*) * (that.CurArraySize+1)); @@ -197,7 +200,7 @@ void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) { if (RHS.isSmall()) { if (!isSmall()) free(CurArray); - CurArray = &SmallArray[0]; + CurArray = SmallArray; // Otherwise, allocate new heap space (unless we were the same size) } else if (CurArraySize != RHS.CurArraySize) { if (isSmall()) diff --git a/lib/Support/SmallVector.cpp b/lib/Support/SmallVector.cpp index 6821382..2e17af8 100644 --- a/lib/Support/SmallVector.cpp +++ b/lib/Support/SmallVector.cpp @@ -21,15 +21,18 @@ void SmallVectorBase::grow_pod(size_t MinSizeInBytes, size_t TSize) { size_t NewCapacityInBytes = 2 * capacity_in_bytes(); if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = 
MinSizeInBytes; - void *NewElts = operator new(NewCapacityInBytes); - - // Copy the elements over. No need to run dtors on PODs. - memcpy(NewElts, this->BeginX, CurSizeBytes); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - operator delete(this->BeginX); - + + void *NewElts; + if (this->isSmall()) { + NewElts = malloc(NewCapacityInBytes); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + } + this->EndX = (char*)NewElts+CurSizeBytes; this->BeginX = NewElts; this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp index 784b77c..44ee177 100644 --- a/lib/Support/Timer.cpp +++ b/lib/Support/Timer.cpp @@ -236,11 +236,13 @@ static Timer &getNamedRegionTimer(StringRef Name) { return T; } -NamedRegionTimer::NamedRegionTimer(StringRef Name) - : TimeRegion(getNamedRegionTimer(Name)) {} +NamedRegionTimer::NamedRegionTimer(StringRef Name, + bool Enabled) + : TimeRegion(!Enabled ? 0 : &getNamedRegionTimer(Name)) {} -NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName) - : TimeRegion(NamedGroupedTimers->get(Name, GroupName)) {} +NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName, + bool Enabled) + : TimeRegion(!Enabled ? 0 : &NamedGroupedTimers->get(Name, GroupName)) {} //===----------------------------------------------------------------------===// // TimerGroup Implementation diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 9796ca5..6a70449 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -104,6 +104,7 @@ const char *Triple::getOSTypeName(OSType Kind) { case Solaris: return "solaris"; case Win32: return "win32"; case Haiku: return "haiku"; + case Minix: return "minix"; } return "<invalid>"; @@ -326,7 +327,9 @@ void Triple::Parse() const { else if (OSName.startswith("win32")) OS = Win32; else if (OSName.startswith("haiku")) - OS = Haiku; + OS = Haiku; + else if (OSName.startswith("minix")) + OS = Minix; else OS = UnknownOS; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 11cf0ec..8054ae6 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -427,10 +427,9 @@ raw_fd_ostream::~raw_fd_ostream() { void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { assert(FD >= 0 && "File already closed."); pos += Size; - ssize_t ret; do { - ret = ::write(FD, Ptr, Size); + ssize_t ret = ::write(FD, Ptr, Size); if (ret < 0) { // If it's a recoverable error, swallow it and retry the write. @@ -482,7 +481,7 @@ uint64_t raw_fd_ostream::seek(uint64_t off) { } size_t raw_fd_ostream::preferred_buffer_size() const { -#if !defined(_MSC_VER) && !defined(__MINGW32__) && !defined(_MINIX) +#if !defined(_MSC_VER) && !defined(__MINGW32__) && !defined(__minix) // Windows and Minix have no st_blksize. assert(FD >= 0 && "File not yet open!"); struct stat statbuf; @@ -496,8 +495,9 @@ size_t raw_fd_ostream::preferred_buffer_size() const { return 0; // Return the preferred block size. 
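The grow_pod rewrite above splits growth into two cases: the first escape from the inline buffer has to malloc and copy, since stack storage cannot be realloc'd, while later growths hand the block to realloc, which can often extend it in place and skip the copy. The strategy in isolation (error handling elided):

#include <cstdlib>
#include <cstring>

// Returns the (possibly moved) start of a buffer grown to NewCap bytes.
// Only valid for POD contents: bytes are moved with no constructors run.
static void *GrowPOD(void *Begin, std::size_t UsedBytes, std::size_t NewCap,
                     bool IsInline) {
  if (IsInline) {
    void *Heap = std::malloc(NewCap);    // first heap allocation
    std::memcpy(Heap, Begin, UsedBytes);
    return Heap;
  }
  return std::realloc(Begin, NewCap);    // may grow in place, no copy
}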
return statbuf.st_blksize; -#endif +#else return raw_ostream::preferred_buffer_size(); +#endif } raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold, diff --git a/lib/System/Disassembler.cpp b/lib/System/Disassembler.cpp index bad427a..139e3be 100644 --- a/lib/System/Disassembler.cpp +++ b/lib/System/Disassembler.cpp @@ -44,33 +44,29 @@ std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length, uint64_t pc) { std::stringstream res; -#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__) +#if (defined (__i386__) || defined (__amd64__) || defined (__x86_64__)) \ + && USE_UDIS86 unsigned bits; # if defined(__i386__) bits = 32; # else bits = 64; # endif - -# if USE_UDIS86 + ud_t ud_obj; - + ud_init(&ud_obj); ud_set_input_buffer(&ud_obj, start, length); ud_set_mode(&ud_obj, bits); ud_set_pc(&ud_obj, pc); ud_set_syntax(&ud_obj, UD_SYN_ATT); - + res << std::setbase(16) << std::setw(bits/4); - + while (ud_disassemble(&ud_obj)) { res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n"; } -# else - res << "No disassembler available. See configure help for options.\n"; -# endif - #else res << "No disassembler available. See configure help for options.\n"; #endif diff --git a/lib/System/Path.cpp b/lib/System/Path.cpp index 6844530..1235257 100644 --- a/lib/System/Path.cpp +++ b/lib/System/Path.cpp @@ -136,26 +136,23 @@ sys::IdentifyFileType(const char *magic, unsigned length) { bool Path::isArchive() const { - if (canRead()) - return hasMagicNumber("!<arch>\012"); - return false; + return hasMagicNumber("!<arch>\012"); } bool Path::isDynamicLibrary() const { - if (canRead()) { - std::string Magic; - if (getMagicNumber(Magic, 64)) - switch (IdentifyFileType(Magic.c_str(), - static_cast<unsigned>(Magic.length()))) { - default: return false; - case Mach_O_FixedVirtualMemorySharedLib_FileType: - case Mach_O_DynamicallyLinkedSharedLib_FileType: - case Mach_O_DynamicallyLinkedSharedLibStub_FileType: - case ELF_SharedObject_FileType: - case COFF_FileType: return true; - } - } + std::string Magic; + if (getMagicNumber(Magic, 64)) + switch (IdentifyFileType(Magic.c_str(), + static_cast<unsigned>(Magic.length()))) { + default: return false; + case Mach_O_FixedVirtualMemorySharedLib_FileType: + case Mach_O_DynamicallyLinkedSharedLib_FileType: + case Mach_O_DynamicallyLinkedSharedLibStub_FileType: + case ELF_SharedObject_FileType: + case COFF_FileType: return true; + } + return false; } diff --git a/lib/System/Unix/Path.inc b/lib/System/Unix/Path.inc index 74596dc..bc104a3 100644 --- a/lib/System/Unix/Path.inc +++ b/lib/System/Unix/Path.inc @@ -421,10 +421,8 @@ bool Path::getMagicNumber(std::string &Magic, unsigned len) const { return false; ssize_t bytes_read = ::read(fd, Buf, len); ::close(fd); - if (ssize_t(len) != bytes_read) { - Magic.clear(); + if (ssize_t(len) != bytes_read) return false; - } Magic.assign(Buf, len); return true; } @@ -890,14 +888,19 @@ Path::makeUnique(bool reuse_current, std::string* ErrMsg) { #else // Okay, looks like we have to do it all by our lonesome. static unsigned FCounter = 0; - unsigned offset = path.size() + 1; - while ( FCounter < 999999 && exists()) { - sprintf(FNBuffer+offset,"%06u",++FCounter); + // Try to initialize with unique value. 
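The raw_fd_ostream::preferred_buffer_size change above turns a stray #endif into a proper #if/#else/#endif, so the base-class fallback is compiled only on platforms without st_blksize instead of sitting as unreachable code after the POSIX return. The corrected shape, with a hypothetical HAVE_ST_BLKSIZE macro standing in for the real !_MSC_VER && !__MINGW32__ && !__minix test:

#include <sys/stat.h>
#include <cstddef>

// Sketch of the fixed structure: exactly one of the two branches is
// compiled, and each branch returns on its own.
static std::size_t PreferredBufferSize(int FD, std::size_t Fallback) {
#if HAVE_ST_BLKSIZE
  struct stat statbuf;
  if (fstat(FD, &statbuf) != 0)
    return 0;                   // size unknown; caller treats 0 as "none"
  return statbuf.st_blksize;    // the filesystem's preferred block size
#else
  return Fallback;              // platforms without st_blksize
#endif
}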
+ if (FCounter == 0) FCounter = ((unsigned)getpid() & 0xFFFF) << 8; + char* pos = strstr(FNBuffer, "XXXXXX"); + do { + if (++FCounter > 0xFFFFFF) { + return MakeErrMsg(ErrMsg, + path + ": can't make unique filename: too many files"); + } + sprintf(pos, "%06X", FCounter); path = FNBuffer; - } - if (FCounter > 999999) - return MakeErrMsg(ErrMsg, - path + ": can't make unique filename: too many files"); + } while (exists()); + // POSSIBLE SECURITY BUG: An attacker can easily guess the name and exploit + // LLVM. #endif return false; } diff --git a/lib/System/Unix/Program.inc b/lib/System/Unix/Program.inc index 358415f..67018de 100644 --- a/lib/System/Unix/Program.inc +++ b/lib/System/Unix/Program.inc @@ -310,7 +310,7 @@ Program::Wait(unsigned secondsToWait, // fact of having a handler at all causes the wait below to return with EINTR, // unlike if we used SIG_IGN. if (secondsToWait) { -#ifndef __HAIKU__ +#if !defined(__HAIKU__) && !defined(__minix) Act.sa_sigaction = 0; #endif Act.sa_handler = TimeOutHandler; diff --git a/lib/System/Unix/Signals.inc b/lib/System/Unix/Signals.inc index 9548816..1e74647 100644 --- a/lib/System/Unix/Signals.inc +++ b/lib/System/Unix/Signals.inc @@ -111,6 +111,14 @@ static void UnregisterHandlers() { } +/// RemoveFilesToRemove - Process the FilesToRemove list. This function +/// should be called with the SignalsMutex lock held. +static void RemoveFilesToRemove() { + while (!FilesToRemove.empty()) { + FilesToRemove.back().eraseFromDisk(true); + FilesToRemove.pop_back(); + } +} // SignalHandler - The signal handler that runs. static RETSIGTYPE SignalHandler(int Sig) { @@ -126,10 +134,7 @@ static RETSIGTYPE SignalHandler(int Sig) { sigprocmask(SIG_UNBLOCK, &SigMask, 0); SignalsMutex.acquire(); - while (!FilesToRemove.empty()) { - FilesToRemove.back().eraseFromDisk(true); - FilesToRemove.pop_back(); - } + RemoveFilesToRemove(); if (std::find(IntSigs, IntSigsEnd, Sig) != IntSigsEnd) { if (InterruptFunction) { @@ -153,7 +158,9 @@ static RETSIGTYPE SignalHandler(int Sig) { } void llvm::sys::RunInterruptHandlers() { - SignalHandler(SIGINT); + SignalsMutex.acquire(); + RemoveFilesToRemove(); + SignalsMutex.release(); } void llvm::sys::SetInterruptFunction(void (*IF)()) { diff --git a/lib/System/Win32/Path.inc b/lib/System/Win32/Path.inc index 5a0052f..379527d 100644 --- a/lib/System/Win32/Path.inc +++ b/lib/System/Win32/Path.inc @@ -281,12 +281,6 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) { // FIXME: the above set of functions don't map to Windows very well. -bool -Path::isRootDirectory() const { - size_t len = path.size(); - return len > 0 && path[len-1] == '/'; -} - StringRef Path::getDirname() const { return getDirnameCharSep(path, "/"); } diff --git a/lib/System/Win32/Signals.inc b/lib/System/Win32/Signals.inc index a3a393c..d6db71b 100644 --- a/lib/System/Win32/Signals.inc +++ b/lib/System/Win32/Signals.inc @@ -283,7 +283,7 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) { #ifdef _MSC_VER if (ExitOnUnhandledExceptions) - _exit(-3); + _exit(-3); #endif // Allow dialog box to pop up allowing choice to start debugger. diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index ae7ae59..14825a7 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -90,10 +90,6 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { } } -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. 
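The makeUnique fallback above now seeds its static counter from the process id and stamps six uppercase hex digits over the "XXXXXX" template until an unused name turns up; as the added comment concedes, the names stay guessable, so this is not safe against a hostile temp directory. A standalone sketch under those assumptions (Exists is supplied by the caller):

#include <cstdio>
#include <cstring>
#include <unistd.h>

// FNBuffer must contain the literal template "XXXXXX"; since sprintf
// NUL-terminates, the template should sit at the end of the name.
// Returns false if the template is missing or the 24-bit space runs out.
static bool MakeUniqueName(char *FNBuffer, bool (*Exists)(const char *)) {
  static unsigned FCounter = 0;
  if (FCounter == 0)
    FCounter = ((unsigned)getpid() & 0xFFFF) << 8;  // per-process start
  char *Pos = std::strstr(FNBuffer, "XXXXXX");
  if (!Pos)
    return false;
  do {
    if (++FCounter > 0xFFFFFF)
      return false;                    // too many files
    std::sprintf(Pos, "%06X", FCounter);
  } while (Exists(FNBuffer));
  return true;
}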
-bool ModelWithRegSequence(); - FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index e68354a..d316b13 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -520,6 +520,70 @@ namespace ARM_AM { // This is stored in two operands [regaddr, align]. The first is the // address register. The second operand is the value of the alignment // specifier to use or zero if no explicit alignment. + // Valid alignments are: 0, 8, 16, and 32 bytes, depending on the specific + // instruction. + + //===--------------------------------------------------------------------===// + // NEON Modified Immediates + //===--------------------------------------------------------------------===// + // + // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // vector operand, where a small immediate encoded in the instruction + // specifies a full NEON vector value. These modified immediates are + // represented here as encoded integers. The low 8 bits hold the immediate + // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold + // the "Cmode" field of the instruction. The interfaces below treat the + // Op and Cmode values as a single 5-bit value. + + static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + return (OpCmode << 8) | Val; + } + static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + return (ModImm >> 8) & 0x1f; + } + static inline unsigned getNEONModImmVal(unsigned ModImm) { + return ModImm & 0xff; + } + + /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// element value and the element size in bits. (If the element size is + /// smaller than the vector, it is splatted into all the elements.) 
+ static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getNEONModImmOpCmode(ModImm); + unsigned Imm8 = getNEONModImmVal(ModImm); + uint64_t Val = 0; + + if (OpCmode == 0xe) { + // 8-bit vector elements + Val = Imm8; + EltBits = 8; + } else if ((OpCmode & 0xc) == 0x8) { + // 16-bit vector elements + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // 32-bit vector elements, zero with one byte set + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // 32-bit vector elements, one byte with low bits set + unsigned ByteNum = 1 + (OpCmode & 0x1); + Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); + EltBits = 32; + } else if (OpCmode == 0x1e) { + // 64-bit vector elements + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((ModImm >> ByteNum) & 1) + Val |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else { + assert(false && "Unsupported NEON immediate"); + } + return Val; + } } // end namespace ARM_AM } // end namespace llvm diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2528854..49c16f3 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -56,7 +56,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TSFlags = MI->getDesc().TSFlags; + uint64_t TSFlags = MI->getDesc().TSFlags; bool isPre = false; switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { default: return NULL; @@ -199,9 +199,9 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, bool ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -227,8 +227,9 @@ ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Insert the spill to the stack frame. The register is killed at the spill // + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); storeRegToStackSlot(MBB, MI, Reg, isKill, - CSI[i].getFrameIdx(), CSI[i].getRegClass(), TRI); + CSI[i].getFrameIdx(), RC, TRI); } return true; } @@ -347,10 +348,8 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; - + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); int BOpc = !AFI->isThumbFunction() ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); @@ -364,17 +363,17 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (FBB == 0) { if (Cond.empty()) // Unconditional branch? 
- BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); else - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); return 1; } // Two-way conditional branch. - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); - BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); return 2; } @@ -487,7 +486,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { // Basic size info comes from the TSFlags field. const TargetInstrDesc &TID = MI->getDesc(); - unsigned TSFlags = TID.TSFlags; + uint64_t TSFlags = TID.TSFlags; unsigned Opc = MI->getOpcode(); switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { @@ -524,11 +523,11 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 10; case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: - return 24; + return 20; case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: - return 14; + return 12; case ARM::BR_JTr: case ARM::BR_JTm: case ARM::BR_JTadd: @@ -595,6 +594,7 @@ ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, return true; } case ARM::MOVr: + case ARM::MOVr_TC: case ARM::tMOVr: case ARM::tMOVgpr2tgpr: case ARM::tMOVtgpr2gpr: @@ -693,75 +693,44 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -bool -ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - // tGPR is used sometimes in ARM instructions that need to avoid using - // certain registers. Just treat it as GPR here. - if (DestRC == ARM::tGPRRegisterClass) - DestRC = ARM::GPRRegisterClass; - if (SrcRC == ARM::tGPRRegisterClass) - SrcRC = ARM::GPRRegisterClass; - - // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies. - if (DestRC == ARM::DPR_8RegisterClass) - DestRC = ARM::DPR_VFP2RegisterClass; - if (SrcRC == ARM::DPR_8RegisterClass) - SrcRC = ARM::DPR_VFP2RegisterClass; - - // Allow QPR / QPR_VFP2 / QPR_8 cross-class copies. - if (DestRC == ARM::QPR_VFP2RegisterClass || - DestRC == ARM::QPR_8RegisterClass) - DestRC = ARM::QPRRegisterClass; - if (SrcRC == ARM::QPR_VFP2RegisterClass || - SrcRC == ARM::QPR_8RegisterClass) - SrcRC = ARM::QPRRegisterClass; - - // Allow QQPR / QQPR_VFP2 cross-class copies. - if (DestRC == ARM::QQPR_VFP2RegisterClass) - DestRC = ARM::QQPRRegisterClass; - if (SrcRC == ARM::QQPR_VFP2RegisterClass) - SrcRC = ARM::QQPRRegisterClass; - - // Disallow copies of unequal sizes. - if (DestRC != SrcRC && DestRC->getSize() != SrcRC->getSize()) - return false; - - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::SPRRegisterClass) - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVRS), DestReg) - .addReg(SrcReg)); - else - AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), - DestReg).addReg(SrcReg))); - } else { - unsigned Opc; - - if (DestRC == ARM::SPRRegisterClass) - Opc = (SrcRC == ARM::GPRRegisterClass ? ARM::VMOVSR : ARM::VMOVS); - else if (DestRC == ARM::DPRRegisterClass) - Opc = ARM::VMOVD; - else if (DestRC == ARM::DPR_VFP2RegisterClass || - SrcRC == ARM::DPR_VFP2RegisterClass) - // Always use neon reg-reg move if source or dest is NEON-only regclass. 
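Returning to the decodeNEONModImm helper added in ARMAddressingModes.h above: it reverses createNEONModImm's packing, using bits 12-8 (Op and Cmode taken together) to pick the element size and expansion rule, and the low 8 bits as payload. A worked check, assuming the in-tree lib/Target/ARM include path:

#include <cassert>
#include <stdint.h>
#include "ARMAddressingModes.h"

// OpCmode 0x1e selects 64-bit elements in which every set bit of Imm8
// expands to a full 0xff byte, so Imm8 = 0x81 (bits 0 and 7) decodes to
// 0xff000000000000ff.
static void TestNEONModImm() {
  unsigned ModImm = llvm::ARM_AM::createNEONModImm(0x1e, 0x81);
  unsigned EltBits = 0;
  uint64_t Val = llvm::ARM_AM::decodeNEONModImm(ModImm, EltBits);
  assert(EltBits == 64 && Val == 0xff000000000000ffULL);
  (void)Val; (void)EltBits;  // keep NDEBUG builds warning-free
}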
- Opc = ARM::VMOVDneon; - else if (DestRC == ARM::QPRRegisterClass) - Opc = ARM::VMOVQ; - else if (DestRC == ARM::QQPRRegisterClass) - Opc = ARM::VMOVQQ; - else if (DestRC == ARM::QQQQPRRegisterClass) - Opc = ARM::VMOVQQQQ; - else - return false; - - AddDefaultPred(BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg)); +void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GPRDest = ARM::GPRRegClass.contains(DestReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + + if (GPRDest && GPRSrc) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)))); + return; } - return true; + bool SPRDest = ARM::SPRRegClass.contains(DestReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + + unsigned Opc; + if (SPRDest && SPRSrc) + Opc = ARM::VMOVS; + else if (GPRDest && SPRSrc) + Opc = ARM::VMOVRS; + else if (SPRDest && GPRSrc) + Opc = ARM::VMOVSR; + else if (ARM::DPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD; + else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQ; + else if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQ; + else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQQQ; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + if (Opc != ARM::VMOVQQ && Opc != ARM::VMOVQQQQ) + AddDefaultPred(MIB); } static const @@ -795,30 +764,34 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: // FIXME: Neon instructions should support predicates if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO)); } else { @@ -828,12 +801,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST2q32)) - .addFrameIndex(FI).addImm(128); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST1d64Q)) + .addFrameIndex(FI).addImm(16); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); @@ -850,8 +825,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); + break; + case ARM::QQQQPRRegClassID: { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) .addFrameIndex(FI) @@ -865,6 +840,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -886,26 +865,30 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addMemOperand(MMO)); } else { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) @@ -913,14 +896,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD2q32)); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD1d64Q)); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - AddDefaultPred(MIB.addFrameIndex(FI).addImm(128).addMemOperand(MMO)); + AddDefaultPred(MIB.addFrameIndex(FI).addImm(16).addMemOperand(MMO)); } else { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) @@ -932,21 +917,25 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); - MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) - .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); - AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + case ARM::QQQQPRRegClassID: { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + 
.addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); + AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -960,223 +949,6 @@ ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, return &*MIB; } -MachineInstr *ARMBaseInstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead()) - return NULL; - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - 
.addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } - } else if (Opc == ARM::VMOVS) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRS)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI) - .addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRS)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVDneon) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRD)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRD)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVQ) { - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTMQ)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLD1q)) - 
.addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(128).addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDMQ)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } - } - - return NewMI; -} - -MachineInstr* -ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - // FIXME - return 0; -} - -bool -ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - return MI->getOperand(4).getReg() != ARM::CPSR || - MI->getOperand(4).isDead(); - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - return true; - } else if (Opc == ARM::VMOVS || Opc == ARM::VMOVD || - Opc == ARM::VMOVDneon || Opc == ARM::VMOVQ) { - return true; - } - - // FIXME: VMOVQQ and VMOVQQQQ? - - return false; -} - /// Create a copy of a const pool value. Update CPI to the new index and return /// the label UID. static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { @@ -1211,17 +983,12 @@ reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { - if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) { - DestReg = TRI->getSubReg(DestReg, SubIdx); - SubIdx = 0; - } - + const TargetRegisterInfo &TRI) const { unsigned Opcode = Orig->getOpcode(); switch (Opcode) { default: { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->getOperand(0).setReg(DestReg); + MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); break; } @@ -1237,9 +1004,6 @@ reMaterialize(MachineBasicBlock &MBB, break; } } - - MachineInstr *NewMI = prior(I); - NewMI->getOperand(0).setSubReg(SubIdx); } MachineInstr * @@ -1291,6 +1055,165 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); } +/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to +/// determine if two loads are loading from the same base address. It should +/// only return true if the base pointers are the same and the only difference +/// between the two addresses is the offset. It also returns the offsets by +/// reference. +bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const { + // Don't worry about Thumb: just ARM and Thumb2.
+ if (Subtarget.isThumb1Only()) return false; + + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + + switch (Load1->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + switch (Load2->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + // Check if base addresses and chain operands match. + if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(4) != Load2->getOperand(4)) + return false; + + // Index should be Reg0. + if (Load1->getOperand(3) != Load2->getOperand(3)) + return false; + + // Determine the offsets. + if (isa<ConstantSDNode>(Load1->getOperand(1)) && + isa<ConstantSDNode>(Load2->getOperand(1))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue(); + return true; + } + + return false; +} + +/// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should +/// be scheduled together. On some targets if two loads are loading from +/// addresses in the same cache line, it's better if they are scheduled +/// together. This function takes two integers that represent the load offsets +/// from the common base address. It returns true if it decides it's desirable +/// to schedule the two loads together. "NumLoads" is the number of loads that +/// have already been scheduled after Load1. +bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + // Don't worry about Thumb: just ARM and Thumb2. + if (Subtarget.isThumb1Only()) return false; + + assert(Offset2 > Offset1); + + if ((Offset2 - Offset1) / 8 > 64) + return false; + + if (Load1->getMachineOpcode() != Load2->getMachineOpcode()) + return false; // FIXME: overly conservative? + + // Four loads in a row should be sufficient. + if (NumLoads >= 3) + return false; + + return true; +} + +bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Debug info is never a scheduling boundary. It's necessary to be explicit + // due to the special treatment of IT instructions below, otherwise a + // dbg_value followed by an IT will result in the IT instruction being + // considered a scheduling hazard, which is wrong. It should be the actual + // instruction preceding the dbg_value instruction(s), just like it is + // when debug info is not present. + if (MI->isDebugValue()) + return false; + + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Treat the start of the IT block as a scheduling boundary, but schedule + // t2IT along with all instructions following it. + // FIXME: This is a big hammer.
But the alternative is to add all potential + // true and anti dependencies to IT block instructions as implicit operands + // to the t2IT instruction. The added compile time and complexity does not + // seem worth it. + MachineBasicBlock::const_iterator I = MI; + // Make sure to skip any dbg_value instructions + while (++I != MBB->end() && I->isDebugValue()) + ; + if (I != MBB->end() && I->getOpcode() == ARM::t2IT) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + if (MI->definesRegister(ARM::SP)) + return true; + + return false; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const { + if (!NumInstrs) + return false; + if (Subtarget.getCPUString() == "generic") + // Generic (and overly aggressive) if-conversion limits for testing. + return NumInstrs <= 10; + else if (Subtarget.hasV7Ops()) + return NumInstrs <= 3; + return NumInstrs <= 2; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + return NumT && NumF && NumT <= 2 && NumF <= 2; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index b566271..89a2db7 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -116,11 +116,25 @@ namespace ARMII { // Thumb format ThumbFrm = 24 << FormShift, - // NEON format - NEONFrm = 25 << FormShift, - NEONGetLnFrm = 26 << FormShift, - NEONSetLnFrm = 27 << FormShift, - NEONDupFrm = 28 << FormShift, + // Miscellaneous format + MiscFrm = 25 << FormShift, + + // NEON formats + NGetLnFrm = 26 << FormShift, + NSetLnFrm = 27 << FormShift, + NDupFrm = 28 << FormShift, + NLdStFrm = 29 << FormShift, + N1RegModImmFrm= 30 << FormShift, + N2RegFrm = 31 << FormShift, + NVCVTFrm = 32 << FormShift, + NVDupLnFrm = 33 << FormShift, + N2RegVShLFrm = 34 << FormShift, + N2RegVShRFrm = 35 << FormShift, + N3RegFrm = 36 << FormShift, + N3RegVShFrm = 37 << FormShift, + NVExtFrm = 38 << FormShift, + NVMulSLFrm = 39 << FormShift, + NVTBLFrm = 40 << FormShift, //===------------------------------------------------------------------===// // Misc flags.
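The form tags above are packed into each instruction's TSFlags word next to the other encoding fields, and consumers recover them with a single mask; that is how the code emitter later in this patch dispatches on instruction form (the switch on MI.getDesc().TSFlags & ARMII::FormMask in ARMCodeEmitter.cpp). A minimal standalone sketch of that packing scheme follows; the FormShift and FormMask values are assumptions for illustration, not the real constants from this header.

#include <cstdint>
#include <cstdio>

// Illustrative packing of form tags into TSFlags. FormShift/FormMask are
// assumed values; the real constants live earlier in ARMBaseInstrInfo.h.
enum {
  FormShift = 7,
  FormMask  = 0x3f << FormShift,   // six bits: room for tags up to 63
  MiscFrm   = 25 << FormShift,
  NGetLnFrm = 26 << FormShift,
  NSetLnFrm = 27 << FormShift
};

// Recover the form tag from a packed TSFlags word, mirroring the
// emitInstruction dispatch.
static const char *formName(uint64_t TSFlags) {
  switch (TSFlags & FormMask) {
  case MiscFrm:   return "Misc";
  case NGetLnFrm: return "NEON get-lane";
  case NSetLnFrm: return "NEON set-lane";
  default:        return "other";
  }
}

int main() {
  uint64_t Flags = NGetLnFrm | 0x1f;  // low bits stand in for other fields
  std::printf("%s\n", formName(Flags));
  return 0;
}

Keeping every format in one field means adding the new NEON forms only widens the mask; the dispatch stays a single mask-and-switch rather than a chain of class checks.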
@@ -213,7 +227,8 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; @@ -258,12 +273,10 @@ public: virtual unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -283,29 +296,51 @@ public: const MDNode *MDPtr, DebugLoc DL) const; - virtual bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const; - virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1) const; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to + /// determine if two loads are loading from the same base address. It should + /// only return true if the base pointers are the same and the only + /// difference between the two addresses is the offset. It also returns the + /// offsets by reference. + virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2)const; + + /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should + /// be scheduled together. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1.
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + virtual bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT, + MachineBasicBlock &FMBB,unsigned NumF) const; + + virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs == 1; + } }; static inline diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 82458d2..182bd99 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -170,56 +170,6 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs; } -const TargetRegisterClass* const * -ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ - &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - if (STI.isThumb1Only()) { - return STI.isTargetDarwin() - ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; - } - return STI.isTargetDarwin() - ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; -} - BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { // FIXME: avoid re-calculating this every time.
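The getCalleeSavedRegClasses tables deleted above had to be kept in lockstep with the register lists by hand; the hunks below switch to asking the register class directly whether it contains each callee-saved register. A standalone sketch of that membership-test pattern, with stand-in types (RegClass, the register numbering, and the list contents are illustrative, not the real ARM definitions):

#include <cstdio>

// Stand-in for TargetRegisterClass::contains(): a contiguous number range.
struct RegClass {
  unsigned Lo, Hi;
  bool contains(unsigned Reg) const { return Reg >= Lo && Reg <= Hi; }
};

int main() {
  static const RegClass GPR = {1, 16};               // assumed GPR range
  static const unsigned CSRegs[] = {4, 5, 6, 20, 0}; // 0 terminates the list
  // Walk the null-terminated callee-saved list, as the new loop does, and
  // filter by class membership instead of indexing a parallel class array.
  for (unsigned i = 0; CSRegs[i]; ++i) {
    unsigned Reg = CSRegs[i];
    if (!GPR.contains(Reg))
      continue;  // e.g. a D register: no CS1/CS2 bookkeeping needed
    std::printf("callee-saved GPR: %u\n", Reg);
  }
  return 0;
}

Dropping the parallel array removes one way for the two lists to drift out of sync when a register is added or reordered.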
@@ -352,7 +302,7 @@ ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } bool -ARMBaseRegisterInfo::canCombinedSubRegIndex(const TargetRegisterClass *RC, +ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, SmallVectorImpl<unsigned> &SubIndices, unsigned &NewSubIdx) const { @@ -724,6 +674,15 @@ ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { I != E; ++I) { for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (!I->getOperand(i).isFI()) continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (I->getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: @@ -765,6 +724,7 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, SmallVector<unsigned, 4> UnspilledCS1GPRs; SmallVector<unsigned, 4> UnspilledCS2GPRs; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. @@ -780,7 +740,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. const unsigned *CSRegs = getCalleeSavedRegs(); - const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; @@ -798,50 +757,50 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } - if (CSRegClasses[i] == ARM::GPRRegisterClass || - CSRegClasses[i] == ARM::tGPRRegisterClass) { - if (Spilled) { - NumGPRSpills++; + if (!ARM::GPRRegisterClass->contains(Reg)) + continue; - if (!STI.isTargetDarwin()) { - if (Reg == ARM::LR) - LRSpilled = true; - CS1Spilled = true; - continue; - } + if (Spilled) { + NumGPRSpills++; - // Keep track if LR and any of R4, R5, R6, and R7 is spilled. - switch (Reg) { - case ARM::LR: + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) LRSpilled = true; - // Fallthrough - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - CS1Spilled = true; - break; - default: - break; - } - } else { - if (!STI.isTargetDarwin()) { - UnspilledCS1GPRs.push_back(Reg); - continue; - } + CS1Spilled = true; + continue; + } - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - UnspilledCS1GPRs.push_back(Reg); - break; - default: - UnspilledCS2GPRs.push_back(Reg); - break; - } + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. 
+ switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; } } } @@ -862,9 +821,16 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // offset, make sure a register (or a spill slot) is available for the // register scavenger. Note that if we're indexing off the frame pointer, the // effective stack size is 4 bytes larger since the FP points to the stack - // slot of the previous FP. - bool BigStack = RS && - estimateStackSize(MF) + (hasFP(MF) ? 4 : 0) >= estimateRSStackSizeLimit(MF); + // slot of the previous FP. Also, if we have variable sized objects in the + // function, stack slot references will often be negative, and some of + // our instructions are positive-offset only, so conservatively consider + // that case to want a spill slot (or register) as well. + // FIXME: We could add logic to be more precise about negative offsets + // and which instructions will need a scratch register for them. Is it + // worth the effort and added fragility? + bool BigStack = + (RS && (estimateStackSize(MF) + (hasFP(MF) ? 4:0) >= + estimateRSStackSizeLimit(MF))) || MFI->hasVarSizedObjects(); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { @@ -957,7 +923,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. const TargetRegisterClass *RC = ARM::GPRRegisterClass; - MachineFrameInfo *MFI = MF.getFrameInfo(); RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); @@ -1622,6 +1587,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = prior(MBB.end()); assert(MBBI->getDesc().isReturn() && "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1696,6 +1662,39 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); } + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = prior(MBB.end()); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNdiND) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). 
+ addReg(JumpTarget.getReg(), RegState::Kill); + } else if (RetOpcode == ARM::TCRETURNriND) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + } + if (VARegSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); } diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 2c9c82d..f7ee0d5 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -69,9 +69,6 @@ public: /// Code Generation virtual methods... const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; /// getMatchingSuperRegClass - Return a subclass of the specified register @@ -81,14 +78,15 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; - /// canCombinedSubRegIndex - Given a register class and a list of sub-register - /// indices, return true if it's possible to combine the sub-register indices - /// into one that corresponds to a larger sub-register. Return the new sub- - /// register index by reference. Note the new index by be zero if the given - /// sub-registers combined to form the whole register. - virtual bool canCombinedSubRegIndex(const TargetRegisterClass *RC, - SmallVectorImpl<unsigned> &SubIndices, - unsigned &NewSubIdx) const; + /// canCombineSubRegIndices - Given a register class and a list of + /// subregister indices, return true if it's possible to combine the + /// subregister indices into one that corresponds to a larger + /// subregister. Return the new subregister index by reference. Note the + /// new index may be zero if the given subregisters can be combined to + /// form the whole register. 
+ virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const; const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; @@ -150,8 +148,8 @@ public: virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, FrameIndexValue *Value = NULL, diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index f2730fc..7895cb0 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -55,6 +55,7 @@ namespace { const std::vector<MachineConstantPoolEntry> *MCPEs; const std::vector<MachineJumpTableEntry> *MJTEs; bool IsPIC; + bool IsThumb; void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineModuleInfo>(); @@ -67,8 +68,8 @@ namespace { : MachineFunctionPass(&ID), JTI(0), II((const ARMInstrInfo *)tm.getInstrInfo()), TD(tm.getTargetData()), TM(tm), - MCE(mce), MCPEs(0), MJTEs(0), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} /// getBinaryCodeForInstr - This function, generated by the /// CodeEmitterGenerator using TableGen, produces the binary encoding for @@ -139,6 +140,12 @@ namespace { void emitMiscInstruction(const MachineInstr &MI); + void emitNEONLaneInstruction(const MachineInstr &MI); + void emitNEONDupInstruction(const MachineInstr &MI); + void emitNEON1RegModImmInstruction(const MachineInstr &MI); + void emitNEON2RegInstruction(const MachineInstr &MI); + void emitNEON3RegInstruction(const MachineInstr &MI); + /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO); @@ -147,7 +154,8 @@ namespace { } /// getMovi32Value - Return binary encoding of operand for movw/movt. If the - /// machine operand requires relocation, record the relocation and return zero. + /// machine operand requires relocation, record the relocation and return + /// zero. unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, unsigned Reloc); unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx, @@ -193,6 +201,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { MJTEs = 0; if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; + IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction(); JTI->Initialize(MF, IsPIC); MMI = &getAnalysis<MachineModuleInfo>(); MCE.setModuleInfo(MMI); @@ -347,7 +356,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { MCE.processDebugLoc(MI.getDebugLoc(), true); - NumEmitted++; // Keep track of the # of mi's emitted + ++NumEmitted; // Keep track of the # of mi's emitted switch (MI.getDesc().TSFlags & ARMII::FormMask) { default: { llvm_unreachable("Unhandled instruction encoding format!"); @@ -407,6 +416,23 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { case ARMII::VFPMiscFrm: emitMiscInstruction(MI); break; + // NEON instructions. 
+ case ARMII::NGetLnFrm: + case ARMII::NSetLnFrm: + emitNEONLaneInstruction(MI); + break; + case ARMII::NDupFrm: + emitNEONDupInstruction(MI); + break; + case ARMII::N1RegModImmFrm: + emitNEON1RegModImmInstruction(MI); + break; + case ARMII::N2RegFrm: + emitNEON2RegInstruction(MI); + break; + case ARMII::N3RegFrm: + emitNEON3RegInstruction(MI); + break; } MCE.processDebugLoc(MI.getDebugLoc(), false); } @@ -1539,4 +1565,144 @@ void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) { emitWordLE(Binary); } +static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegD = ARMRegisterInfo::getRegisterNumbering(RegD); + Binary |= (RegD & 0xf) << ARMII::RegRdShift; + Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift; + return Binary; +} + +static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegN = ARMRegisterInfo::getRegisterNumbering(RegN); + Binary |= (RegN & 0xf) << ARMII::RegRnShift; + Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift; + return Binary; +} + +static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegM = ARMRegisterInfo::getRegisterNumbering(RegM); + Binary |= (RegM & 0xf); + Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift; + return Binary; +} + +/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON +/// data-processing instruction to the corresponding Thumb encoding. +static unsigned convertNEONDataProcToThumb(unsigned Binary) { + assert((Binary & 0xfe000000) == 0xf2000000 && + "not an ARM NEON data-processing instruction"); + unsigned UBit = (Binary >> 24) & 1; + return 0xef000000 | (UBit << 28) | (Binary & 0xffffff); +} + +void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + unsigned RegTOpIdx, RegNOpIdx, LnOpIdx; + const TargetInstrDesc &TID = MI.getDesc(); + if ((TID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) { + RegTOpIdx = 0; + RegNOpIdx = 1; + LnOpIdx = 2; + } else { // ARMII::NSetLnFrm + RegTOpIdx = 2; + RegNOpIdx = 0; + LnOpIdx = 3; + } + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(RegTOpIdx).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, RegNOpIdx); + + unsigned LaneShift; + if ((Binary & (1 << 22)) != 0) + LaneShift = 0; // 8-bit elements + else if ((Binary & (1 << 5)) != 0) + LaneShift = 1; // 16-bit elements + else + LaneShift = 2; // 32-bit elements + + unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift; + unsigned Opc1 = Lane >> 2; + unsigned Opc2 = Lane & 3; + assert((Opc1 & 3) == 0 && "out-of-range lane number operand"); + Binary |= (Opc1 << 21); + Binary |= (Opc2 << 5); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= (IsThumb ? 
ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(1).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, 0); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd. + Binary |= encodeNEONRd(MI, 0); + // Immediate fields: Op, Cmode, I, Imm3, Imm4 + unsigned Imm = MI.getOperand(1).getImm(); + unsigned Op = (Imm >> 12) & 1; + unsigned Cmode = (Imm >> 8) & 0xf; + unsigned I = (Imm >> 7) & 1; + unsigned Imm3 = (Imm >> 4) & 0x7; + unsigned Imm4 = Imm & 0xf; + Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4; + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source register in Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VDUPfdf or VDUPfqf. + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source registers in Dn and Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRn(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VMOVDneon or VMOVQ. + emitWordLE(Binary); +} + #include "ARMGenCodeEmitter.inc" diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 13d8b74..65a3da6 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -337,7 +337,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { if (CPChange && ++NoCPIters > 30) llvm_unreachable("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); - + // Clear NewWaterList now. If we split a block for branches, it should // appear as "new water" for the next iteration of constant pool placement. NewWaterList.clear(); @@ -361,8 +361,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // After a while, this might be made debug-only, but it is not expensive. verify(MF); - // If LR has been forced spilled and no far jumps (i.e. BL) has been issued. - // Undo the spill / restore of LR if possible. + // If LR has been forced spilled and no far jump (i.e. BL) has been issued, + // undo the spill / restore of LR if possible. 
if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); @@ -407,7 +407,7 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, std::vector<CPEntry> CPEs; CPEs.push_back(CPEntry(CPEMI, i)); CPEntries.push_back(CPEs); - NumCPEs++; + ++NumCPEs; DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i << "\n"); } @@ -418,7 +418,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. MachineFunction::iterator MBBI = MBB; - if (llvm::next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function. + // Can't fall off end of function. + if (llvm::next(MBBI) == MBB->getParent()->end()) return false; MachineBasicBlock *NextBB = llvm::next(MBBI); @@ -491,6 +492,8 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, unsigned MBBSize = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (I->isDebugValue()) + continue; // Add instruction size to MBBSize. MBBSize += TII->GetInstSizeInBytes(I); @@ -722,7 +725,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // correspond to anything in the source. unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); - NumSplit++; + ++NumSplit; // Update the CFG. All succs of OrigBB are now succs of NewBB. while (!OrigBB->succ_empty()) { @@ -945,7 +948,7 @@ bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { if (--CPE->RefCount == 0) { RemoveDeadCPEMI(CPEMI); CPE->CPEMI = NULL; - NumCPEs--; + --NumCPEs; return true; } return false; @@ -1246,7 +1249,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); - NumCPEs++; + ++NumCPEs; BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; // Compensate for .align 2 in thumb mode. @@ -1369,7 +1372,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { BBSizes[MBB->getNumber()] += 2; AdjustBBOffsetsAfter(MBB, 2); HasFarJump = true; - NumUBrFixed++; + ++NumUBrFixed; DEBUG(errs() << " Changed B to long jump " << *MI); @@ -1402,7 +1405,7 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *BMI = &MBB->back(); bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - NumCBrFixed++; + ++NumCBrFixed; if (BMI != MI) { if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && BMI->getOpcode() == Br.UncondBr) { @@ -1621,7 +1624,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { // constantpool tables? MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1658,15 +1661,25 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { continue; unsigned IdxReg = MI->getOperand(1).getReg(); bool IdxRegKill = MI->getOperand(1).isKill(); + + // Scan backwards to find the instruction that defines the base + // register. Due to post-RA scheduling, we can't count on it + // immediately preceding the branch instruction. 
MachineBasicBlock::iterator PrevI = MI; - if (PrevI == MBB->begin()) + MachineBasicBlock::iterator B = MBB->begin(); + while (PrevI != B && !PrevI->definesRegister(BaseReg)) + --PrevI; + + // If for some reason we didn't find it, we can't do anything, so + // just skip this one. + if (!PrevI->definesRegister(BaseReg)) continue; - MachineInstr *AddrMI = --PrevI; + MachineInstr *AddrMI = PrevI; bool OptOk = true; - // Examine the instruction that calculate the jumptable entry address. - // If it's not the one just before the t2BR_JT, we won't delete it, then - // it's not worth doing the optimization. + // Examine the instruction that calculates the jumptable entry address. + // Make sure it only defines the base register and kills any uses + // other than the index register. for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { const MachineOperand &MO = AddrMI->getOperand(k); if (!MO.isReg() || !MO.getReg()) @@ -1683,9 +1696,14 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { if (!OptOk) continue; - // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want + // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction + // that gave us the initial base register definition. + for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI) + ; + + // The instruction should be a tLEApcrel or t2LEApcrelJT; we want // to delete it as well. - MachineInstr *LeaMI = --PrevI; + MachineInstr *LeaMI = PrevI; if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && LeaMI->getOpcode() != ARM::t2LEApcrelJT) || LeaMI->getOperand(0).getReg() != BaseReg) @@ -1729,7 +1747,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1769,7 +1787,7 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction &MF = *BB->getParent(); - // If it's the destination block is terminated by an unconditional branch, + // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. This is a very simple // heuristic. FIXME: We can definitely improve it. 
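The jump-table change above stops assuming that the address computation sits immediately before the t2BR_JT; instead it scans backwards for the instruction defining the base register, then scans again for the tLEApcrel or t2LEApcrelJT that started the chain. A standalone sketch of the backward scan, with a stand-in Instr type (the index-based block and register numbers are illustrative):

#include <cstdio>
#include <vector>

struct Instr {
  unsigned Def;  // register this instruction defines (0 = none)
  bool definesRegister(unsigned R) const { return Def == R; }
};

// Walk from the branch toward the block start until an instruction defining
// BaseReg is found; report failure if none exists, as the patch does before
// giving up on the optimization.
static int findBaseRegDef(const std::vector<Instr> &MBB, int BranchIdx,
                          unsigned BaseReg) {
  int I = BranchIdx;
  while (I > 0 && !MBB[I].definesRegister(BaseReg))
    --I;
  return MBB[I].definesRegister(BaseReg) ? I : -1;
}

int main() {
  // r3 is defined at index 1; the branch sits at index 3.
  std::vector<Instr> MBB = { {7}, {3}, {9}, {0} };
  std::printf("def of r3 at index %d\n", findBaseRegDef(MBB, 3, 3));
  return 0;
}

The re-check after the loop matters: reaching the block start only means the scan ran out of instructions, not that a definition was found.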
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index 6f4eddf..3119b54 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" +#include <cstddef> namespace llvm { diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c87f5d7..9c62597 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -144,13 +144,15 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstrBuilder Even = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(EvenDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(EvenSrc, getKillRegState(SrcIsKill))); + .addReg(EvenDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(EvenSrc, getKillRegState(SrcIsKill))); MachineInstrBuilder Odd = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(OddDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(OddSrc, getKillRegState(SrcIsKill))); + .addReg(OddDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); Modified = true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9baef6b..c84d3ff 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMTargetMachine.h" @@ -35,11 +36,6 @@ using namespace llvm; -static cl::opt<bool> -UseRegSeq("neon-reg-sequence", cl::Hidden, - cl::desc("Use reg_sequence to model ld / st of multiple neon regs"), - cl::init(true)); - //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -147,6 +143,11 @@ private: unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); + /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, + /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be + /// generated to force the table registers to be consecutive. + SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); @@ -173,24 +174,17 @@ private: char ConstraintCode, std::vector<SDValue> &OutOps); - /// PairDRegs - Form a quad register from a pair of D registers. - /// + // Form pairs of consecutive S, D, or Q registers. + SDNode *PairSRegs(EVT VT, SDValue V0, SDValue V1); SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); - - /// PairDRegs - Form a quad register pair from a pair of Q registers. - /// SDNode *PairQRegs(EVT VT, SDValue V0, SDValue V1); - /// QuadDRegs - Form a quad register pair from a quad of D registers. - /// + // Form sequences of 4 consecutive S, D, or Q registers. + SDNode *QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - - /// QuadQRegs - Form 4 consecutive Q registers. 
- /// SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - /// OctoDRegs - Form 8 consecutive D registers. - /// + // Form sequences of 8 consecutive D registers. SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, SDValue V4, SDValue V5, SDValue V6, SDValue V7); }; @@ -544,10 +538,9 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N, bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset){ // FIXME dl should come from the parent load or store, not the address - DebugLoc dl = Op->getDebugLoc(); if (N.getOpcode() != ISD::ADD) { ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); - if (!NC || NC->getZExtValue() != 0) + if (!NC || !NC->isNullValue()) return false; Base = Offset = N; @@ -788,8 +781,9 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, if (N.getOpcode() == ISD::ADD) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); + // 8 bits. if (((RHSC & 0x3) == 0) && - ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits. + ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -798,7 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, } else if (N.getOpcode() == ISD::SUB) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); - if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { // 8 bits. + // 8 bits. + if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); return true; @@ -960,22 +955,24 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { return NULL; } +/// PairSRegs - Form a D register from a pair of S registers. +/// +SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); +} + /// PairDRegs - Form a quad register from a pair of D registers. /// SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - if (llvm::ModelWithRegSequence()) { - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); - } - SDValue Undef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0); - SDNode *Pair = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, Undef, V0, SubReg0); - return CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, SDValue(Pair, 0), V1, SubReg1); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } /// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. @@ -988,6 +985,19 @@ SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } +/// QuadSRegs - Form 4 consecutive S registers. 
+/// +SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); +} + /// QuadDRegs - Form 4 consecutive D registers. /// SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, @@ -1088,7 +1098,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, std::vector<EVT> ResTys(NumVecs, VT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - if (!llvm::ModelWithRegSequence() || NumVecs < 2) + if (NumVecs < 2) return VLd; SDValue RegSeq; @@ -1129,24 +1139,17 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. - if (llvm::ModelWithRegSequence()) { - if (NumVecs == 1) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); - ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); - } else { - SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, - SDValue(VLd, 0), SDValue(VLd, 1), - SDValue(VLd, 2), SDValue(VLd, 3)), 0); - SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); - SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); - ReplaceUses(SDValue(N, 0), Q0); - ReplaceUses(SDValue(N, 1), Q1); - } + if (NumVecs == 1) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); + ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, + SDValue(VLd, 0), SDValue(VLd, 1), + SDValue(VLd, 2), SDValue(VLd, 3)), 0); + SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); + SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); + ReplaceUses(SDValue(N, 0), Q0); + ReplaceUses(SDValue(N, 1), Q1); } } else { // Otherwise, quad registers are loaded with two separate instructions, @@ -1169,37 +1172,27 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 6); Chain = SDValue(VLdB, NumVecs+1); - if (llvm::ModelWithRegSequence()) { - SDValue V0 = SDValue(VLdA, 0); - SDValue V1 = SDValue(VLdB, 0); - SDValue V2 = SDValue(VLdA, 1); - SDValue V3 = SDValue(VLdB, 1); - SDValue V4 = SDValue(VLdA, 2); - SDValue V5 = SDValue(VLdB, 2); - SDValue V6 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdA, 3); - SDValue V7 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdB, 3); - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, - V4, V5, V6, V7), 0); - - // Extract out the 3 / 4 Q registers. 
- assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, RegSeq); - ReplaceUses(SDValue(N, Vec), Q); - } - } else { - // Combine the even and odd subregs to produce the result. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue V0 = SDValue(VLdA, 0); + SDValue V1 = SDValue(VLdB, 0); + SDValue V2 = SDValue(VLdA, 1); + SDValue V3 = SDValue(VLdB, 1); + SDValue V4 = SDValue(VLdA, 2); + SDValue V5 = SDValue(VLdB, 2); + SDValue V6 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdA, 3); + SDValue V7 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdB, 3); + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, + V4, V5, V6, V7), 0); + + // Extract out the 3 / 4 Q registers. + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, + dl, VT, RegSeq); + ReplaceUses(SDValue(N, Vec), Q); } } ReplaceUses(SDValue(N, NumVecs), Chain); @@ -1209,7 +1202,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1247,7 +1240,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Align); if (is64BitVector) { - if (llvm::ModelWithRegSequence() && NumVecs >= 2) { + if (NumVecs >= 2) { SDValue RegSeq; SDValue V0 = N->getOperand(0+3); SDValue V1 = N->getOperand(1+3); @@ -1292,7 +1285,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Quad registers are directly supported for VST1 and VST2, // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; - if (llvm::ModelWithRegSequence() && NumVecs == 2) { + if (NumVecs == 2) { // First extract the pair of Q registers. SDValue Q0 = N->getOperand(3); SDValue Q1 = N->getOperand(4); @@ -1330,76 +1323,48 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. - if (llvm::ModelWithRegSequence()) { - // Form the QQQQ REG_SEQUENCE. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3)); - V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); - - // Store the even D registers. 
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - Ops.push_back(Reg0); // post-access address offset - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, - RegVT, RegSeq)); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - // Store the odd D registers. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, - RegVT, RegSeq); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } else { - Ops.push_back(Reg0); // post-access address offset - - // Store the even subregs. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3))); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - - // Store the odd subregs. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } + // Form the QQQQ REG_SEQUENCE. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3)); + V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3)); + } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); + + // Store the even D registers. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + Ops.push_back(Reg0); // post-access address offset + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, + RegVT, RegSeq)); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = QOpcodes0[OpcodeIndex]; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. 
+ Ops[0] = SDValue(VStA, 0); // MemAddr + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, + RegVT, RegSeq); + Ops[NumVecs+5] = Chain; + Opc = QOpcodes1[OpcodeIndex]; + SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1421,13 +1386,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, // Quad registers are handled by load/store of subregs. Find the subreg info. unsigned NumElts = 0; - int SubregIdx = 0; bool Even = false; EVT RegVT = VT; if (!is64BitVector) { RegVT = GetNEONSubregVT(VT); NumElts = RegVT.getVectorNumElements(); - SubregIdx = (Lane < NumElts) ? ARM::dsub_0 : ARM::dsub_1; Even = Lane < NumElts; } @@ -1455,35 +1418,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned Opc = 0; if (is64BitVector) { Opc = DOpcodes[OpcodeIndex]; - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, - RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, - RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, - RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(N->getOperand(Vec+3)); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } + + // Now extract the D registers back out. + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq)); } else { // Check if this is loading the even or odd subreg of a Q register. if (Lane < NumElts) { @@ -1493,31 +1447,24 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, Opc = QOpcodes1[OpcodeIndex]; } - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - } - - // Extract the subregs of the input vector. - unsigned SubIdx = Even ? 
ARM::dsub_0 : ARM::dsub_1; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - // Extract the subregs of the input vector. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT, - N->getOperand(Vec+3))); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); } + + // Extract the subregs of the input vector. + unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, + RegSeq)); } Ops.push_back(getI32Imm(Lane)); Ops.push_back(Pred); @@ -1531,76 +1478,97 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, ResTys.push_back(MVT::Other); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); - if (llvm::ModelWithRegSequence()) { - // Form a REG_SEQUENCE to force register allocation. - SDValue RegSeq; - if (is64BitVector) { - SDValue V0 = SDValue(VLdLn, 0); - SDValue V1 = SDValue(VLdLn, 1); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = SDValue(VLdLn, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLdLn, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + if (is64BitVector) { + SDValue V0 = SDValue(VLdLn, 0); + SDValue V1 = SDValue(VLdLn, 1); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - if (Even) { - V[i] = SDValue(VLdLn, Vec); - V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - } else { - V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - V[i+1] = SDValue(VLdLn, Vec); - } + SDValue V2 = SDValue(VLdLn, 2); + // If it's a vld3, form a quad D-register but discard the last part. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : SDValue(VLdLn, 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // For 128-bit vectors, take the 64-bit results of the load and insert + // them as subregs into the result. 
+ SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + if (Even) { + V[i] = SDValue(VLdLn, Vec); + V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + } else { + V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + V[i+1] = SDValue(VLdLn, Vec); } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - if (NumVecs == 2) - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); - else - RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); - return NULL; - } - - // For a 64-bit vector load to D registers, nothing more needs to be done. - if (is64BitVector) - return VLdLn; - - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT, - N->getOperand(Vec+3), - SDValue(VLdLn, Vec)); - ReplaceUses(SDValue(N, Vec), QuadVec); + if (NumVecs == 2) + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); + else + RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); } - Chain = SDValue(VLdLn, NumVecs); - ReplaceUses(SDValue(N, NumVecs), Chain); + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); return NULL; } +SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + unsigned FirstTblReg = IsExt ? 2 : 1; + + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + SDValue V0 = N->getOperand(FirstTblReg + 0); + SDValue V1 = N->getOperand(FirstTblReg + 1); + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + else { + SDValue V2 = N->getOperand(FirstTblReg + 2); + // If it's a vtbl3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(FirstTblReg + 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + // Now extract the D registers back out. 
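(For orientation, a scalar model of what VTBL/VTBX compute per byte lane; this is an illustrative sketch of the NEON semantics, not code from this patch:

  #include <stdint.h>

  // One result byte of a table lookup over TableBytes table entries.
  uint8_t TblLane(const uint8_t *Table, unsigned TableBytes,
                  uint8_t Index, uint8_t OldDest, bool IsExt) {
    if (Index < TableBytes)
      return Table[Index];       // in-range index reads the table
    return IsExt ? OldDest : 0;  // VTBX keeps the old byte, VTBL zeroes it
  }

The IsExt flag in SelectVTBL below corresponds to the VTBX variants, which is why those take the previous destination as an extra operand.)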
+ SmallVector<SDValue, 6> Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq)); + + Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); + Ops.push_back(getAL(CurDAG)); // predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register + return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size()); +} + SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) @@ -1954,8 +1922,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { - SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5); + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6); } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); @@ -2015,7 +1983,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2029,7 +1997,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2211,6 +2179,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); } + case ARMISD::BUILD_VECTOR: { + EVT VecVT = N->getValueType(0); + EVT EltVT = VecVT.getVectorElementType(); + unsigned NumElts = VecVT.getVectorNumElements(); + if (EltVT.getSimpleVT() == MVT::f64) { + assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); + return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1)); + } + assert(EltVT.getSimpleVT() == MVT::f32 && + "unexpected type for BUILD_VECTOR"); + if (NumElts == 2) + return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1)); + assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); + return QuadSRegs(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { @@ -2342,6 +2326,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vtbl2: + return SelectVTBL(N, false, 2, ARM::VTBL2); + 
case Intrinsic::arm_neon_vtbl3: + return SelectVTBL(N, false, 3, ARM::VTBL3); + case Intrinsic::arm_neon_vtbl4: + return SelectVTBL(N, false, 4, ARM::VTBL4); + + case Intrinsic::arm_neon_vtbx2: + return SelectVTBL(N, true, 2, ARM::VTBX2); + case Intrinsic::arm_neon_vtbx3: + return SelectVTBL(N, true, 3, ARM::VTBX3); + case Intrinsic::arm_neon_vtbx4: + return SelectVTBL(N, true, 4, ARM::VTBX4); + } + break; + } + case ISD::CONCAT_VECTORS: return SelectConcatVector(N); } @@ -2367,9 +2374,3 @@ FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel) { return new ARMDAGToDAGISel(TM, OptLevel); } - -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. -bool llvm::ModelWithRegSequence() { - return UseRegSeq; -} diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b8126a3..98d8b85 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" @@ -40,6 +41,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -47,9 +49,27 @@ #include <sstream> using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +// This option should go away when tail calls fully work. +static cl::opt<bool> +EnableARMTailCalls("arm-tail-calls", cl::Hidden, + cl::desc("Generate tail calls (TEMPORARY OPTION)."), + cl::init(true)); + static cl::opt<bool> EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions."), + cl::desc("Generate calls via indirect call instructions"), + cl::init(false)); + +static cl::opt<bool> +ARMInterworking("arm-interworking", cl::Hidden, + cl::desc("Enable / disable ARM interworking (for debugging only)"), + cl::init(true)); + +static cl::opt<bool> +EnableARMCodePlacement("arm-code-placement", cl::Hidden, + cl::desc("Enable code placement pass for ARM"), cl::init(false)); static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, @@ -94,10 +114,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - if (llvm::ModelWithRegSequence()) - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - else - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); @@ -393,13 +410,57 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // doesn't yet know how to not do that for SjLj. setExceptionSelectorRegister(ARM::R0); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - - // If the subtarget does not have extract instructions, sign_extend_inreg - // needs to be expanded. 
Extract is available in ARM mode on v6 and up, - // and on most Thumb2 implementations. - if ((!Subtarget->isThumb() && !Subtarget->hasV6Ops()) - || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) { + // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise + // use the default expansion. + bool canHandleAtomics = + (Subtarget->hasV7Ops() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())); + if (canHandleAtomics) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + } else { + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + // Since the libcalls include locking, fold in the fences + setShouldFoldAtomicFences(true); + } + // 64-bit versions are always libcalls (for now) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } @@ -412,8 +473,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // We want to custom lower some of our intrinsics. 
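(One note on the block of ATOMIC_* Expand calls above: the per-width enumeration is mechanical, and the same configuration could be written as a loop. A sketch under that assumption, using the same ISD opcodes and llvm::array_lengthof; not how the patch itself is written:

  const unsigned AtomicOps[] = {
    ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP,     ISD::ATOMIC_LOAD_ADD,
    ISD::ATOMIC_LOAD_SUB, ISD::ATOMIC_LOAD_AND, ISD::ATOMIC_LOAD_OR,
    ISD::ATOMIC_LOAD_XOR, ISD::ATOMIC_LOAD_NAND
  };
  const MVT AtomicVTs[] = { MVT::i8, MVT::i16, MVT::i32 };
  // Force libcalls for every op/width combination at once.
  for (unsigned i = 0, e = array_lengthof(AtomicOps); i != e; ++i)
    for (unsigned j = 0, f = array_lengthof(AtomicVTs); j != f; ++j)
      setOperationAction(AtomicOps[i], AtomicVTs[j], Expand);
)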
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
-  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+  if (Subtarget->isTargetDarwin()) {
+    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+  }

   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
@@ -474,28 +537,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   else
     setSchedulingPreference(Sched::Hybrid);

-  // FIXME: If-converter should use instruction latency to determine
-  // profitability rather than relying on fixed limits.
-  if (Subtarget->getCPUString() == "generic") {
-    // Generic (and overly aggressive) if-conversion limits.
-    setIfCvtBlockSizeLimit(10);
-    setIfCvtDupBlockSizeLimit(2);
-  } else if (Subtarget->hasV7Ops()) {
-    setIfCvtBlockSizeLimit(3);
-    setIfCvtDupBlockSizeLimit(1);
-  } else if (Subtarget->hasV6Ops()) {
-    setIfCvtBlockSizeLimit(2);
-    setIfCvtDupBlockSizeLimit(1);
-  } else {
-    setIfCvtBlockSizeLimit(3);
-    setIfCvtDupBlockSizeLimit(2);
-  }
-
   maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
-  // Do not enable CodePlacementOpt for now: it currently runs after the
-  // ARMConstantIslandPass and messes up branch relaxation and placement
-  // of constant islands.
-  // benefitFromCodePlacementOpt = true;
+
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
+  if (EnableARMCodePlacement)
+    benefitFromCodePlacementOpt = true;
 }

 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -537,6 +586,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
   case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";

+  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
+
   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
@@ -581,6 +632,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VZIP:          return "ARMISD::VZIP";
   case ARMISD::VUZP:          return "ARMISD::VUZP";
   case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
   }
@@ -603,15 +655,33 @@ TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {

 /// getFunctionAlignment - Return the Log2 alignment of this function.
 unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
-  return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1;
+  return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
 }

 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
-  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+  unsigned NumVals = N->getNumValues();
+  if (!NumVals)
+    return Sched::RegPressure;
+
+  for (unsigned i = 0; i != NumVals; ++i) {
     EVT VT = N->getValueType(i);
     if (VT.isFloatingPoint() || VT.isVector())
       return Sched::Latency;
   }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;
+
+  // Loads are scheduled for latency even if the instruction itinerary
+  // is not available.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); + if (TID.mayLoad()) + return Sched::Latency; + + const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData(); + if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2) + return Sched::Latency; return Sched::RegPressure; } @@ -964,11 +1034,28 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - // ARM target does not yet support tail call optimization. - isTailCall = false; + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Temporarily disable tail calls so things don't break. + if (!EnableARMTailCalls) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -981,9 +1068,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -996,7 +1088,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[realArgIdx].Val; + SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. @@ -1044,7 +1136,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1059,10 +1151,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. 
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag = SDValue();
   }

   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -1071,7 +1185,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   bool isDirect = false;
   bool isARMFunc = false;
   bool isLocalARMFunc = false;
-  MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

   if (EnableARMLongCalls) {
@@ -1117,7 +1230,7 @@
       getTargetMachine().getRelocationModel() != Reloc::Static;
     isARMFunc = !Subtarget->isThumb() || isStub;
     // ARM call to a local ARM function is predicable.
-    isLocalARMFunc = !Subtarget->isThumb() && !isExt;
+    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
     // tBX takes a register source operand.
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
       unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
@@ -1134,7 +1247,7 @@
       Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                            getPointerTy(), Callee, PICLabel);
     } else
-      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy());
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     isDirect = true;
     bool isStub = Subtarget->isTargetDarwin() &&
@@ -1171,11 +1284,6 @@
       ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
       : ARMISD::CALL_NOLINK;
   }
-  if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) {
-    // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK
-    Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag);
-    InFlag = Chain.getValue(1);
-  }

   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
@@ -1189,9 +1297,13 @@
   if (InFlag.getNode())
     Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  if (isTailCall)
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+
   // Returns a chain and a flag for retval copy to use.
-  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
-                      &Ops[0], Ops.size());
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);

   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
@@ -1205,10 +1317,203 @@
                          dl, DAG, InVals);
 }

+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const ARMInstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                     CallingConv::ID CalleeCC,
+                                     bool isVarArg,
+                                     bool isCalleeStructRet,
+                                     bool isCallerStructRet,
+                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Do not sibcall optimize vararg calls unless the call site passes no
+  // arguments.
+  if (isVarArg && !Outs.empty())
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
+  // emitEpilogue is not ready for them.
+  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+  // LR. This means if we need to reload LR, it takes an extra instruction,
+  // which outweighs the value of the tail call; but here we don't know yet
+  // whether LR is going to be used. Probably the right approach is to
+  // generate the tail call here and turn it back into CALL/RET in
+  // emitEpilogue if LR is used.
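(For orientation, the shape of call these eligibility checks are designed to accept; a hedged example, not taken from this patch: same calling convention, no struct return, no varargs, and every argument already in a register or a matching fixed stack slot.

  int callee(int a, int b, int c, int d);

  int caller(int a, int b, int c, int d) {
    // All four arguments travel in r0-r3 under AAPCS and line up with the
    // callee's, so LowerCall can emit ARMISD::TC_RETURN: pop the caller's
    // frame, then branch to callee instead of calling it.
    return callee(a, b, c, d);
  }
)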
+ if (Subtarget->isThumb1Only()) + return false; + + // For the moment, we can only do this to functions defined in this + // compilation, or to indirect calls. A Thumb B to an ARM function, + // or vice versa, is not easily fixed up in the linker unlike BL. + // (We could do this by loading the address of the callee into a register; + // that is an extra instruction over the direct call and burns a register + // as well, so is not likely to be a win.) + + // It might be safe to remove this restriction on non-Darwin. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + if (isa<ExternalSymbolSDNode>(Callee)) + return false; + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + if (GV->isDeclaration() || GV->isWeakForLinker()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. 
+        if (!VA.isRegLoc())
+          return false;
+        if (!ArgLocs[++i].isRegLoc())
+          return false;
+        if (RegVT == MVT::v2f64) {
+          if (!ArgLocs[++i].isRegLoc())
+            return false;
+          if (!ArgLocs[++i].isRegLoc())
+            return false;
+        }
+      } else if (!VA.isRegLoc()) {
+        if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                 MFI, MRI, TII))
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 SDValue
 ARMTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
                                DebugLoc dl, SelectionDAG &DAG) const {

   // CCValAssign - represent the assignment of the return value to a location.
@@ -1239,7 +1544,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");

-    SDValue Arg = Outs[realRVLocIdx].Val;
+    SDValue Arg = OutVals[realRVLocIdx];

     switch (VA.getLocInfo()) {
     default: llvm_unreachable("Unknown loc info!");
@@ -1477,7 +1782,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
   // pair. This is always cheaper.
   if (Subtarget->useMovt()) {
     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
-                       DAG.getTargetGlobalAddress(GV, PtrVT));
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
   } else {
     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1552,9 +1857,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
 SDValue
 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
-  SDValue Val = Subtarget->isThumb() ?
-    DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) :
-    DAG.getConstant(0, MVT::i32);
+  SDValue Val = DAG.getConstant(0, MVT::i32);
   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
                      Op.getOperand(1), Val);
 }
@@ -1568,8 +1871,7 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {

 SDValue
 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
-                                           const ARMSubtarget *Subtarget)
-                                             const {
+                                           const ARMSubtarget *Subtarget) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   DebugLoc dl = Op.getDebugLoc();
   switch (IntNo) {
@@ -1597,7 +1899,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                   PseudoSourceValue::getConstantPool(), 0,
                   false, false, 0);
-    SDValue Chain = Result.getValue(1);

     if (RelocM == Reloc::PIC_) {
       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
@@ -1609,25 +1910,21 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
 }

 static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
-                          const ARMSubtarget *Subtarget) {
+                               const ARMSubtarget *Subtarget) {
   DebugLoc dl = Op.getDebugLoc();
   SDValue Op5 = Op.getOperand(5);
-  SDValue Res;
   unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue();
-  if (isDeviceBarrier) {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
-  } else {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
-  }
-  return Res;
+  // v6 and v7 can both handle barriers directly, but need to be handled a bit
+ // differently. Thumb1 and pre-v6 ARM mode use a libcall instead and should + // never get here. + unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; + if (Subtarget->hasV7Ops()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); + else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return SDValue(); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -1712,7 +2009,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo *MFI = MF.getFrameInfo(); - int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1768,8 +2065,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { - int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, PseudoSourceValue::getFixedStack(FI), 0, @@ -1836,8 +2132,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1868,7 +2163,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, AFI->setVarArgsFrameIndex( MFI->CreateFixedObject(VARegSaveSize, ArgOffset + VARegSaveSize - VARegSize, - true, false)); + true)); SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy()); @@ -1884,8 +2179,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), 0, - false, false, 0); + PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), + 0, false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); @@ -1895,8 +2190,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. 
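(To summarize the LowerMEMBARRIER rewrite above: both barrier kinds produce one of the same two node shapes, and only the subtarget decides the form. On v7 the barrier is a dedicated instruction, so the node carries just the chain; on v6 it is the CP15-based barrier, whose pattern expects an extra zero operand. A sketch of the two shapes, assuming that operand layout:

  // v7: barrier instruction, chain only (ARMISD::SYNCBARRIER for the
  // device-barrier flavor).
  SDValue V7Bar = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Chain);
  // v6 (non-Thumb1): extra zero immediate for the MCR-based pattern.
  SDValue V6Bar = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Chain,
                              DAG.getConstant(0, MVT::i32));
)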
-    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset,
-                                                     true, false));
+    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
   }

   return Chain;
@@ -1978,9 +2272,44 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
 }

+static bool canBitcastToInt(SDNode *Op) {
+  return Op->hasOneUse() &&
+         ISD::isNormalLoad(Op) &&
+         Op->getValueType(0) == MVT::f32;
+}
+
+static SDValue bitcastToInt(SDValue Op, SelectionDAG &DAG) {
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
+    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                       Ld->getChain(), Ld->getBasePtr(),
+                       Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                       Ld->isVolatile(), Ld->isNonTemporal(),
+                       Ld->getAlignment());
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
-static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
-                         DebugLoc dl) {
+SDValue
+ARMTargetLowering::getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
+                             SDValue &ARMCC, SelectionDAG &DAG,
+                             DebugLoc dl) const {
+  if (UnsafeFPMath && FiniteOnlyFPMath() &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+       CC == ISD::SETNE || CC == ISD::SETUNE) &&
+      canBitcastToInt(LHS.getNode()) && canBitcastToInt(RHS.getNode())) {
+    // If unsafe fp math optimization is enabled and there are no other uses of
+    // the CMP operands, and the condition code is EQ or NE, we can optimize it
+    // to an integer comparison.
+    if (CC == ISD::SETOEQ)
+      CC = ISD::SETEQ;
+    else if (CC == ISD::SETUNE)
+      CC = ISD::SETNE;
+    LHS = bitcastToInt(LHS, DAG);
+    RHS = bitcastToInt(RHS, DAG);
+    return getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
+  }
+
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
@@ -2010,13 +2339,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {

     SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-    SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+    SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl);
     SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
-                                 ARMCC, CCR, Cmp);
+                                 ARMCC, CCR, Cmp);
     if (CondCode2 != ARMCC::AL) {
       SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
       // FIXME: Needs another CMP because flag can have but one use.
-      SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+      SDValue Cmp2 = getVFPCmp(LHS, RHS, CC, ARMCC2, DAG, dl);
       Result = DAG.getNode(ARMISD::CMOV, dl, VT,
                            Result, TrueVal, ARMCC2, CCR, Cmp2);
     }
@@ -2043,8 +2372,8 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {

   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
-  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
   SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
@@ -2132,7 +2461,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(Opc, dl, VT, Op);
 }

-static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   // Implement fcopysign with a fabs and a conditional fneg.
   SDValue Tmp0 = Op.getOperand(0);
   SDValue Tmp1 = Op.getOperand(1);
@@ -2140,8 +2469,10 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
   SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-  SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl);
   SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
+  SDValue FP0 = DAG.getConstantFP(0.0, SrcVT);
+  SDValue Cmp = getVFPCmp(Tmp1, FP0,
+                          ISD::SETLT, ARMCC, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
 }
@@ -2206,7 +2537,8 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
                              DAG.getConstant(0, MVT::i32));
     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                              DAG.getConstant(1, MVT::i32));
-    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT,
+                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
   }

   // Turn f64->i64 into VMOVRRD.
@@ -2516,76 +2848,149 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   return Result;
 }

-/// isVMOVSplat - Check if the specified splat value corresponds to an immediate
-/// VMOV instruction, and if so, return the constant being splatted.
-static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
-                           unsigned SplatBitSize, SelectionDAG &DAG) {
+/// isNEONModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON instruction with a "modified immediate"
+/// operand (e.g., VMOV). If so, return either the constant being
+/// splatted or the encoded value, depending on the DoEncode parameter.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 bool isVMOV, bool DoEncode) {
+  unsigned OpCmode, Imm;
+  EVT VT;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8. However, NEON modified
+  // immediate instructions other than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
   switch (SplatBitSize) {
   case 8:
-    // Any 1-byte value is OK.
+    // Any 1-byte value is OK. Op=0, Cmode=1110.
     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
-    return DAG.getTargetConstant(SplatBits, MVT::i8);
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = MVT::i8;
+    break;

   case 16:
     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
-    if ((SplatBits & ~0xff) == 0 ||
-        (SplatBits & ~0xff00) == 0)
-      return DAG.getTargetConstant(SplatBits, MVT::i16);
-    break;
+    VT = MVT::i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();

   case 32:
     // NEON's 32-bit VMOV supports splat values where:
     // * only one byte is nonzero, or
     // * the least significant byte is 0xff and the second byte is nonzero, or
     // * the least significant 2 bytes are 0xff and the third is nonzero.
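(The (OpCmode, Imm) pairs chosen in the cases above and below are ultimately packed into one integer. Assuming the ARM_AM::createNEONModImm helper used later in this patch keeps the Op/Cmode bits just above the 8-bit immediate, the packing amounts to the following sketch with a hypothetical name:

  // Assumed layout: bits [12:8] = Op/Cmode, bits [7:0] = immediate byte.
  static inline unsigned createNEONModImmSketch(unsigned OpCmode,
                                                unsigned Val) {
    return (OpCmode << 8) | (Val & 0xff);
  }
  // e.g. a v4i32 splat of 0x0000nn00 encodes as OpCmode=0x2, Imm=0xnn.
)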
- if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0 || - (SplatBits & ~0xff0000) == 0 || - (SplatBits & ~0xff000000) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i32); + VT = MVT::i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + OpCmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + OpCmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + OpCmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) - return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) - return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. - break; + return SDValue(); case 64: { // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + if (!isVMOV) + return SDValue(); uint64_t BitMask = 0xff; uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; - else if ((SplatBits & BitMask) != 0) + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { return SDValue(); + } BitMask <<= 8; + ImmMask <<= 1; } - return DAG.getTargetConstant(Val, MVT::i64); + // Op=1, Cmode=1110. + OpCmode = 0x1e; + SplatBits = Val; + VT = MVT::i64; + break; } default: - llvm_unreachable("unexpected size for isVMOVSplat"); - break; + llvm_unreachable("unexpected size for isNEONModifiedImm"); + return SDValue(); } - return SDValue(); + if (DoEncode) { + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, MVT::i32); + } + return DAG.getTargetConstant(SplatBits, VT); } -/// getVMOVImm - If this is a build_vector of constants which can be -/// formed by using a VMOV instruction of the specified element size, -/// return the constant being splatted. The ByteSize field indicates the -/// number of bytes of each element [1248]. -SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { +/// getNEONModImm - If this is a valid vector constant for a NEON instruction +/// with a "modified immediate" operand (e.g., VMOV) of the specified element +/// size, return the encoded value for that immediate. The ByteSize field +/// indicates the number of bytes of each element [1248]. 
+SDValue ARM::getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV, + SelectionDAG &DAG) { BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N); APInt SplatBits, SplatUndef; unsigned SplatBitSize; @@ -2597,8 +3002,8 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { if (SplatBitSize > ByteSize * 8) return SDValue(); - return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG); + return isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, isVMOV, true); } static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, @@ -2838,8 +3243,10 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); + // Check if an immediate VMOV works. + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), + SplatBitSize, DAG, true, false); if (Val.getNode()) return BuildSplat(Val, VT, DAG, dl); } @@ -2883,21 +3290,17 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::VDUP, dl, VT, Value); // Vectors with 32- or 64-bit elements can be built by directly assigning - // the subregisters. + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); - SDValue Val = DAG.getUNDEF(VecVT); - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() == ISD::UNDEF) - continue; - Elt = DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Elt); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, Elt, - DAG.getConstant(i, MVT::i32)); - } + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -2934,7 +3337,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, bool ReverseVEXT; unsigned Imm, WhichResult; - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || @@ -3032,59 +3437,62 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // of the same time so that they get CSEd properly. SVN->getMask(ShuffleMask); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
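(A concrete instance of the splat path here, illustrative only: a v4i32 shuffle mask of <1,1,1,1> is a splat of lane 1 and lowers to a single node, matching the code that follows; an all-undef mask is treated as a splat of lane 0.

  // <1,1,1,1> on a v4i32 input: duplicate lane 1 across the vector.
  SDValue Dup = DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                            DAG.getConstant(1, MVT::i32));
)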
+ if (Lane == -1) Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); } - return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); - } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - if (ReverseVEXT) - std::swap(V1, V2); - return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } - - if (isVREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARMISD::VREV64, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARMISD::VREV32, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - - // Check for Neon shuffles that modify both input vectors in place. - // If both results are used, i.e., if there are two shuffles with the same - // source operands and with masks corresponding to both results of one of - // these operations, DAG memoization will ensure that a single node is - // used for both shuffles. - unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. 
+ unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. @@ -3108,8 +3516,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - // Implement shuffles with 32- or 64-bit elements as subreg copies. - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. @@ -3117,17 +3524,17 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); - SDValue Val = DAG.getUNDEF(VecVT); + SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) - continue; - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, - ShuffleMask[i] < (int)NumElts ? V1 : V2, - DAG.getConstant(ShuffleMask[i] & (NumElts-1), - MVT::i32)); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, - Elt, DAG.getConstant(i, MVT::i32)); + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -3277,7 +3684,12 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... @@ -3315,7 +3727,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. 
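Every custom inserter updated here follows the same three-step shape: create the new blocks, move the tail of the current block and its CFG edges across, then erase the pseudo. A sketch of that shape against the LLVM APIs of this vintage; the helper name splitBlockAfter and the contMBB block are illustrative, not from the tree:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Split BB after the pseudo MI: instructions following MI move into a fresh
// continuation block, which also inherits BB's successors (with PHIs in
// those successors rewritten to name the new block).
static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
                                          MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  MachineBasicBlock *contMBB =
    MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(It, contMBB);

  contMBB->splice(contMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
  contMBB->transferSuccessorsAndUpdatePHIs(BB);

  // The caller wires its expansion blocks in between BB and contMBB, then
  // calls MI->eraseFromParent() once the replacement code is emitted.
  return contMBB;
}

The old transferSuccessors call moved only the edges; the splice also carries over any instructions after the pseudo, and the AndUpdatePHIs variant keeps PHI operands in the successors consistent with the new predecessor.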
return BB; } @@ -3358,7 +3770,12 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = MF->getRegInfo(); unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); @@ -3403,7 +3820,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. return BB; } @@ -3488,22 +3905,21 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) - .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -3516,11 +3932,12 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -3541,7 +3958,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? 
ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) + BuildMI(*BB, MI, dl, TII->get(CopyOpc), ARM::SP) .addReg(SrcReg, getKillRegState(SrcIsKill)); } @@ -3573,7 +3990,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, NeedPred = true; NeedCC = true; NeedOp3 = true; break; } - MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(OpOpc), ARM::SP); if (OpOpc == ARM::tAND) AddDefaultT1CC(MIB); MIB.addReg(ARM::SP); @@ -3589,10 +4006,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc)) + BuildMI(*BB, MI, dl, TII->get(CopyOpc)) .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) .addReg(ARM::SP); - MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } } @@ -3893,7 +4310,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); default: llvm_unreachable("unhandled vector shift"); @@ -4156,14 +4574,13 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { if (!Subtarget->hasV6Ops()) // Pre-v6 does not support unaligned mem access. return false; - else { - // v6+ may or may not support unaligned mem access depending on the system - // configuration. - // FIXME: This is pretty conservative. Should we provide cmdline option to - // control the behaviour? - if (!Subtarget->isTargetDarwin()) - return false; - } + + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + // FIXME: This is pretty conservative. Should we provide cmdline option to + // control the behaviour? + if (!Subtarget->isTargetDarwin()) + return false; switch (VT.getSimpleVT().SimpleTy) { default: @@ -4619,7 +5036,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(0U, ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } @@ -4669,7 +5086,6 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, /// vector. If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Constraint, - bool hasMemory, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); @@ -4818,8 +5234,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, Ops.push_back(Result); return; } - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, - Ops, DAG); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } bool diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9c7517c..3a38669 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -70,6 +70,8 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. 
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + TC_RETURN, // Tail call return pseudo. + THREAD_POINTER, DYN_ALLOC, // Dynamic allocation on the stack. @@ -133,6 +135,13 @@ namespace llvm { VUZP, // unzip (deinterleave) VTRN, // transpose + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + // Floating-point max and min: FMAX, FMIN @@ -141,11 +150,12 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace ARM { - /// getVMOVImm - If this is a build_vector of constants which can be - /// formed by using a VMOV instruction of the specified element size, - /// return the constant being splatted. The ByteSize field indicates the - /// number of bytes of each element [1248]. - SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + /// getNEONModImm - If this is a valid vector constant for a NEON + /// instruction with a "modified immediate" operand (e.g., VMOV) of the + /// specified element size, return the encoded value for that immediate. + /// The ByteSize field indicates the number of bytes of each element [1248]. + SDValue getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV, + SelectionDAG &DAG); /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd) @@ -189,9 +199,9 @@ namespace llvm { bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal - /// icmp immediate, that is the target has icmp instructions which can compare - /// a register against the immediate without having to materialize the - /// immediate into a register. + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. virtual bool isLegalICmpImmediate(int64_t Imm) const; /// getPreIndexedAddressParts - returns true by value, base pointer and @@ -232,7 +242,6 @@ namespace llvm { /// being processed is 'm'. 
virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -282,7 +291,8 @@ namespace llvm { SDValue &Root, SelectionDAG &DAG, DebugLoc dl) const; - CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, + bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, @@ -303,6 +313,7 @@ namespace llvm { SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; @@ -327,18 +338,34 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; + SDValue getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, + SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index d487df1..ac568e7 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -50,27 +50,23 @@ def VFPLdStMulFrm : Format<22>; def VFPMiscFrm : Format<23>; def ThumbFrm : Format<24>; - -def NEONFrm : Format<25>; -def NEONGetLnFrm : Format<26>; -def NEONSetLnFrm : Format<27>; -def NEONDupFrm : Format<28>; - -def MiscFrm : Format<29>; -def ThumbMiscFrm : Format<30>; - -def NLdStFrm : Format<31>; -def N1RegModImmFrm : Format<32>; -def N2RegFrm : Format<33>; -def NVCVTFrm : Format<34>; -def NVDupLnFrm : Format<35>; -def N2RegVShLFrm : Format<36>; -def N2RegVShRFrm : Format<37>; -def N3RegFrm : Format<38>; -def N3RegVShFrm : Format<39>; -def NVExtFrm : Format<40>; -def NVMulSLFrm : Format<41>; -def NVTBLFrm : Format<42>; +def MiscFrm : Format<25>; + +def NGetLnFrm : Format<26>; +def NSetLnFrm : Format<27>; +def NDupFrm : Format<28>; +def NLdStFrm : Format<29>; +def N1RegModImmFrm: Format<30>; +def N2RegFrm : Format<31>; +def NVCVTFrm : Format<32>; +def NVDupLnFrm : Format<33>; +def N2RegVShLFrm : Format<34>; +def N2RegVShRFrm : Format<35>; +def N3RegFrm : Format<36>; 
+def N3RegVShFrm : Format<37>; +def NVExtFrm : Format<38>; +def NVMulSLFrm : Format<39>; +def NVTBLFrm : Format<40>; // Misc flags. @@ -1653,17 +1649,17 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin, opc, dt, asm, pattern>; class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin, opc, dt, asm, pattern>; class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin, opc, dt, asm, pattern>; // Vector Duplicate Lane (from scalar to all elements) diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 85f6b40..ba228ff 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -63,7 +63,7 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { void ARMInstrInfo:: reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo &TRI) const { DebugLoc dl = Orig->getDebugLoc(); unsigned Opcode = Orig->getOpcode(); switch (Opcode) { diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index d4199d1..4563ffe 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ public: void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index f3156d9..c73e204 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -53,6 +53,8 @@ def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; @@ -117,6 +119,9 @@ def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6, def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. 
// @@ -858,13 +863,13 @@ def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #$label", []>; +} // neverHasSideEffects def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []> { let Inst{25} = 1; } -} // neverHasSideEffects //===----------------------------------------------------------------------===// // Control Flow Instructions. @@ -1026,6 +1031,74 @@ let isCall = 1, } } +// Tail calls. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin versions. + let Defs = [R0, R1, R2, R3, R9, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdi : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TCRETURNri : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } + + // Non-Darwin versions (the difference is R9). + let Defs = [R0, R1, R2, R3, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdiND : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TCRETURNriND : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsARM, IsNotDarwin]>; + + def TAILJMPdNDt : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsThumb, IsNotDarwin]>; + + def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsNotDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } +} + let isBranch = 1, isTerminator = 1 in { // B is "predicable" since it can be xformed into a Bcc. let isBarrier = 1 in { @@ -1397,6 +1470,14 @@ def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, let Inst{25} = 0; } +// A version for the smaller set of tail call registers. 
+let neverHasSideEffects = 1 in +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$dst), (ins tcGPR:$src), DPFrm, + IIC_iMOVr, "mov", "\t$dst, $src", []>, UnaryDP { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; +} + def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, IIC_iMOVsr, "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { @@ -2530,31 +2611,30 @@ let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\t${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\n ${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, NoVFP]>; } @@ -2621,6 +2701,24 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), // TODO: add,sub,and, 3-instr forms? 
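The eh_sjlj_setjmp expansions above implement the usual returns-twice contract: they store the address of the trailing "mov r0, #1" into word 1 of the buffer (reading pc in ARM mode yields the current instruction plus 8, hence the #8; the Thumb variants use adds #7 to reach the same spot with the Thumb bit set), return 0 on the direct path, and let a later longjmp land on that final mov so the same call appears to return 1. The observable behavior is that of the GCC-style builtins, which clang maps to these intrinsics. A runnable illustration, assuming only the documented five-word builtin jump buffer:

#include <cstdio>

// __builtin_setjmp requires a five-word buffer; word 1 receives the resume
// address, which is exactly what the expansions above compute from pc.
static void *buf[5];

static void bounce() {
  __builtin_longjmp(buf, 1); // the value passed must be 1
}

int main() {
  if (__builtin_setjmp(buf) == 0) {
    puts("direct return (r0 == 0)");
    bounce(); // does not return; control reappears at the setjmp
  }
  puts("resumed via longjmp (r0 == 1)");
  return 0;
}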
+// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; // Direct calls def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 197ec16..a84315f 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -98,17 +98,8 @@ def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; // NEON operand definitions //===----------------------------------------------------------------------===// -def h8imm : Operand<i8> { - let PrintMethod = "printHex8ImmOperand"; -} -def h16imm : Operand<i16> { - let PrintMethod = "printHex16ImmOperand"; -} -def h32imm : Operand<i32> { - let PrintMethod = "printHex32ImmOperand"; -} -def h64imm : Operand<i64> { - let PrintMethod = "printHex64ImmOperand"; +def nModImm : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; } //===----------------------------------------------------------------------===// @@ -812,11 +803,6 @@ def DSubReg_f64_reg : SDNodeXForm<imm, [{ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32); }]>; -def DSubReg_f64_other_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + (1 - N->getZExtValue()), - MVT::i32); -}]>; // Extract S sub-registers of Q/D registers. def SSubReg_f32_reg : SDNodeXForm<imm, [{ @@ -2282,7 +2268,7 @@ def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; // For disassembly only. defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$dst, $src, #0">; + "$dst, $src, #0">; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, @@ -2834,73 +2820,70 @@ def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src), // VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm. def VMOV_get_imm8 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 1, *CurDAG); + return ARM::getNEONModImm(N, 1, true, *CurDAG); }]>; def vmovImm8 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 1, true, *CurDAG).getNode() != 0; }], VMOV_get_imm8>; // VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm. def VMOV_get_imm16 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 2, *CurDAG); + return ARM::getNEONModImm(N, 2, true, *CurDAG); }]>; def vmovImm16 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 2, true, *CurDAG).getNode() != 0; }], VMOV_get_imm16>; // VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm. 
def VMOV_get_imm32 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 4, *CurDAG); + return ARM::getNEONModImm(N, 4, true, *CurDAG); }]>; def vmovImm32 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 4, true, *CurDAG).getNode() != 0; }], VMOV_get_imm32>; // VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm. def VMOV_get_imm64 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 8, *CurDAG); + return ARM::getNEONModImm(N, 8, true, *CurDAG); }]>; def vmovImm64 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 8, true, *CurDAG).getNode() != 0; }], VMOV_get_imm64>; -// Note: Some of the cmode bits in the following VMOV instructions need to -// be encoded based on the immed values. - let isReMaterializable = 1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>; def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>; -def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>; -def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>; -def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, +def VMOVv2i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>; -def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, +def VMOVv4i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>; def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>; def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>; } // isReMaterializable @@ -3122,17 +3105,6 @@ def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; -def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (i64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - (DSubReg_f64_other_reg imm:$lane))>; -def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (f64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - 
(DSubReg_f64_other_reg imm:$lane))>; - // VMOVN : Vector Narrowing Move defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn", "i", int_arm_neon_vmovn>; @@ -3319,22 +3291,16 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 - DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", []>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 - DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", []>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTB4, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, - DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", []>; } // hasExtraSrcRegAllocReq = 1 // VTBX : Vector Table Extension @@ -3348,23 +3314,18 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 - DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", []>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTBX3, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", + "$orig = $dst", []>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", - "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "$orig = $dst", []>; } // hasExtraSrcRegAllocReq = 1 //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 40f924b..bc0790d 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -894,11 +894,11 @@ def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, "adr$p\t$dst, #$label", []>, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 +} // neverHasSideEffects def tLEApcrelJT : T1I<(outs tGPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 -} // neverHasSideEffects //===----------------------------------------------------------------------===// // TLS Instructions @@ -923,18 +923,18 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. 
-// The current SP is passed in $val, and we reuse the reg as a scratch.
+// $val is a scratch register for our use.
let Defs =
- [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ] in {
+ [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], hasSideEffects = 1,
+ isBarrier = 1 in {
def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
AddrModeNone, SizeSpecial, NoItinerary,
- "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n"
- "\tmov\t$val, pc\n"
- "\tadds\t$val, #7\n"
- "\tstr\t$val, [$src, #4]\n"
- "\tmovs\tr0, #0\n"
- "\tb\t1f\n"
- "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n"
+ "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t"
+ "adds\t$val, #7\n\t"
+ "str\t$val, [$src, #4]\n\t"
+ "movs\tr0, #0\n\t"
+ "b\t1f\n\t"
+ "movs\tr0, #1\t${:comment} end eh.setjmp\n\t"
"1:", "",
[(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
}
@@ -1037,7 +1037,8 @@ def : T1Pat<(i32 imm0_255_comp:$src),
// scheduling.
let isReMaterializable = 1 in
def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
- NoItinerary, "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
+ NoItinerary,
+ "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
[(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
imm:$cp))]>,
Requires<[IsThumb1Only]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index b91c089..4692f2a 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -637,8 +637,7 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> {
multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
opc, ".w\t$dst, $src",
- [(set GPR:$dst, (opnode GPR:$src))]>,
- Requires<[HasT2ExtractPack]> {
+ [(set GPR:$dst, (opnode GPR:$src))]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
let Inst{22-20} = opcod;
@@ -649,8 +648,7 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
}
def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
opc, ".w\t$dst, $src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack]> {
+ [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
let Inst{22-20} = opcod;
@@ -661,8 +659,8 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
}
}
-// SXTB16 and UXTB16 do not need the .w qualifier.
-multiclass T2I_unary_rrot_nw<bits<3> opcod, string opc, PatFrag opnode> {
+// UXTB16 - Requires T2ExtractPack, does not need the .w qualifier.
+multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
opc, "\t$dst, $src",
[(set GPR:$dst, (opnode GPR:$src))]>,
@@ -689,9 +687,9 @@ multiclass T2I_unary_rrot_nw<bits<3> opcod, string opc, PatFrag opnode> {
}
}
-// DO variant - disassembly only, no pattern
-
-multiclass T2I_unary_rrot_DO<bits<3> opcod, string opc> {
+// SXTB16 - Requires T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, "\t$dst, $src", []> { let Inst{31-27} = 0b11111; @@ -787,6 +785,7 @@ def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } +} // neverHasSideEffects def t2LEApcrelJT : T2XI<(outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p.w\t$dst, #${label}_${id}", []> { @@ -798,7 +797,6 @@ def t2LEApcrelJT : T2XI<(outs GPR:$dst), let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } -} // neverHasSideEffects // ADD r, sp, {so_imm|i12} def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), @@ -1330,7 +1328,7 @@ defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; defm t2SXTH : T2I_unary_rrot<0b000, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; -defm t2SXTB16 : T2I_unary_rrot_DO<0b010, "sxtb16">; +defm t2SXTB16 : T2I_unary_rrot_sxtb16<0b010, "sxtb16">; defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; @@ -1347,13 +1345,13 @@ defm t2UXTB : T2I_unary_rrot<0b101, "uxtb", UnOpFrag<(and node:$Src, 0x000000FF)>>; defm t2UXTH : T2I_unary_rrot<0b001, "uxth", UnOpFrag<(and node:$Src, 0x0000FFFF)>>; -defm t2UXTB16 : T2I_unary_rrot_nw<0b011, "uxtb16", +defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 24)>; + (t2UXTB16r_rot GPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 8)>; + (t2UXTB16r_rot GPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; @@ -2389,37 +2387,36 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. -// The current SP is passed in $val, and we reuse the reg as a scratch. +// $val is a scratch register for our use. 
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, NoVFP]>; @@ -2529,6 +2526,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, // IT block +let Defs = [ITSTATE] in def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), AddrModeNone, Size2Bytes, IIC_iALUx, "it$mask\t$cc", "", []> { @@ -2691,7 +2689,8 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // scheduling. let canFoldAsLoad = 1, isReMaterializable = 1 in def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, + "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb2]>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 54474cf..84c23e1 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -255,25 +255,25 @@ def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, // Between half-precision and single-precision. For disassembly only. 
-def VCVTBSH : ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f32_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; -def VCVTBHS : ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f16_to_f32 GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def VCVTTSH : ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; -def VCVTTHS : ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h index ff332b7..f5d9eff 100644 --- a/lib/Target/ARM/ARMJITInfo.h +++ b/lib/Target/ARM/ARMJITInfo.h @@ -143,7 +143,8 @@ namespace llvm { JumpTableId2AddrMap[JTI] = Addr; } - /// getPCLabelAddr - Retrieve the address of the PC label of the specified id. + /// getPCLabelAddr - Retrieve the address of the PC label of the + /// specified id. intptr_t getPCLabelAddr(unsigned Id) const { DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id); assert(I != PCLabelMap.end()); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 8585c1e..f80e316 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -74,11 +74,14 @@ namespace { private: struct MemOpQueueEntry { int Offset; + unsigned Reg; + bool isKill; unsigned Position; MachineBasicBlock::iterator MBBI; bool Merged; - MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i) - : Offset(o), Position(p), MBBI(i), Merged(false) {} + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MachineBasicBlock::iterator i) + : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; @@ -128,30 +131,30 @@ namespace { static int getLoadStoreMultipleOpcode(int Opcode) { switch (Opcode) { case ARM::LDR: - NumLDMGened++; + ++NumLDMGened; return ARM::LDM; case ARM::STR: - NumSTMGened++; + ++NumSTMGened; return ARM::STM; case ARM::t2LDRi8: case ARM::t2LDRi12: - NumLDMGened++; + ++NumLDMGened; return ARM::t2LDM; case ARM::t2STRi8: case ARM::t2STRi12: - NumSTMGened++; + ++NumSTMGened; return ARM::t2STM; case ARM::VLDRS: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMS; case ARM::VSTRS: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMS; case ARM::VLDRD: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMD; case ARM::VSTRD: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMD; default: llvm_unreachable("Unhandled opcode!"); } @@ -264,45 +267,59 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on // success. 
-void ARMLoadStoreOpt:: -MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &memOps, - unsigned memOpsBegin, - unsigned memOpsEnd, - unsigned insertAfter, - int Offset, - unsigned Base, - bool BaseKill, - int Opcode, - ARMCC::CondCodes Pred, - unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVector<MachineBasicBlock::iterator, 4> &Merges) { +void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &memOps, + unsigned memOpsBegin, unsigned memOpsEnd, + unsigned insertAfter, int Offset, + unsigned Base, bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { // First calculate which of the registers should be killed by the merged // instruction. - SmallVector<std::pair<unsigned, bool>, 8> Regs; const unsigned insertPos = memOps[insertAfter].Position; + + SmallSet<unsigned, 4> UnavailRegs; + SmallSet<unsigned, 4> KilledRegs; + DenseMap<unsigned, unsigned> Killer; + for (unsigned i = 0; i < memOpsBegin; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + if (memOps[i].Merged) + UnavailRegs.insert(Reg); + else { + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + } + for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + + SmallVector<std::pair<unsigned, bool>, 8> Regs; for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - const MachineOperand &MO = memOps[i].MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - bool isKill = MO.isKill(); + unsigned Reg = memOps[i].Reg; + if (UnavailRegs.count(Reg)) + // Register is killed before and it's not easy / possible to update the + // kill marker on already merged instructions. Abort. + return; // If we are inserting the merged operation after an unmerged operation that // uses the same register, make sure to transfer any kill flag. - for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j) - if (memOps[j].Position<insertPos) { - const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); - if (MOJ.getReg() == Reg && MOJ.isKill()) - isKill = true; - } - + bool isKill = memOps[i].isKill || KilledRegs.count(Reg); Regs.push_back(std::make_pair(Reg, isKill)); } // Try to do the merge. MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; - Loc++; + ++Loc; if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Regs)) return; @@ -311,13 +328,13 @@ MergeOpsUpdate(MachineBasicBlock &MBB, Merges.push_back(prior(Loc)); for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { // Remove kill flags from any unmerged memops that come before insertPos. - if (Regs[i-memOpsBegin].second) - for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j) - if (memOps[j].Position<insertPos) { - MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); - if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill()) - MOJ.setIsKill(false); - } + if (Regs[i-memOpsBegin].second) { + unsigned Reg = Regs[i-memOpsBegin].first; + if (KilledRegs.count(Reg)) { + unsigned j = Killer[Reg]; + memOps[j].MBBI->getOperand(0).setIsKill(false); + } + } MBB.erase(memOps[i].MBBI); memOps[i].Merged = true; } @@ -517,8 +534,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the previous instruction. 
- if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isAM4) { if (Mode == ARM_AM::ia && isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -541,8 +561,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the next instruction. - if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (isAM4) { if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -669,8 +692,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); // Try merging with the previous instruction. - if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; AddSub = ARM_AM::sub; @@ -685,8 +711,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, } // Try merging with the next instruction. - if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (!isAM5 && isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; @@ -759,18 +788,21 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, /// isMemoryOp - Returns true if instruction is a memory operations (that this /// pass is capable of operating on). static bool isMemoryOp(const MachineInstr *MI) { - if (MI->hasOneMemOperand()) { - const MachineMemOperand *MMO = *MI->memoperands_begin(); + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI->hasOneMemOperand()) + return false; - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) - return false; + const MachineMemOperand *MMO = *MI->memoperands_begin(); - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; - } + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO->isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO->getAlignment() < 4) + return false; // str <undef> could probably be eliminated entirely, but for now we just want // to avoid making a mess of it. @@ -898,6 +930,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) return false; + MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? 
@@ -942,6 +975,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } + NewBBI = llvm::prior(MBBI); } else { // Split into two instructions. assert((!isT2 || !OffReg) && @@ -962,14 +996,15 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, Pred, PredReg, TII, isT2); } else { if (OddReg == EvenReg && EvenDeadKill) { - // If the two source operands are the same, the kill marker is probably - // on the first one. e.g. + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0 EvenDeadKill = false; OddDeadKill = true; @@ -978,6 +1013,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, @@ -989,8 +1025,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, ++NumSTRD2STR; } - MBBI = prior(MBBI); MBB.erase(MI); + MBBI = NewBBI; + return true; } return false; } @@ -1023,6 +1060,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemOp) { int Opcode = MBBI->getOpcode(); unsigned Size = getLSMultipleTransferSize(MBBI); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + bool isKill = MO.isDef() ? false : MO.isKill(); unsigned Base = MBBI->getOperand(1).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); @@ -1044,8 +1084,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); + ++NumMemOps; Advance = true; } else { if (Clobber) { @@ -1057,15 +1097,17 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // No need to match PredReg. // Continue adding to the queue. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; } else { for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; break; } else if (Offset == I->Offset) { @@ -1078,7 +1120,12 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } - if (Advance) { + if (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else if (Advance) { ++Position; ++MBBI; if (MBBI == E) @@ -1279,7 +1326,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, // some day. 
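A recurring theme in this file's changes: wherever the pass inspects a neighboring instruction or assigns positions, DBG_VALUE instructions are now stepped over, so that building at -g cannot change which loads and stores get merged. A standalone model of the neighbor scan (Instr and prevRealInstr are illustrative stand-ins for MachineInstr and the PrevMBBI loops above):

#include <cstdio>
#include <vector>

struct Instr {
  bool IsDebugValue;
  int Id;
};
typedef std::vector<Instr>::iterator InstrIter;

// Step backwards from I, skipping debug-value pseudo-instructions, without
// walking past Begin -- the same guard the PrevMBBI loops use.
static InstrIter prevRealInstr(InstrIter I, InstrIter Begin) {
  InstrIter P = I;
  --P;
  while (P != Begin && P->IsDebugValue)
    --P;
  return P;
}

int main() {
  // ldr; DBG_VALUE; DBG_VALUE; str -- the str's previous memop is the ldr.
  std::vector<Instr> Block = {{false, 0}, {true, 1}, {true, 2}, {false, 3}};
  InstrIter P = prevRealInstr(Block.begin() + 3, Block.begin());
  printf("previous real instruction: %d\n", P->Id); // prints 0
  return 0;
}

The MI2LocMap change further down applies the same idea to position numbering: debug values get no slot at all, so their presence cannot perturb the distance checks between memory operations.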
SmallSet<unsigned, 4> AddedRegPressure; while (++I != E) { - if (MemOps.count(&*I)) + if (I->isDebugValue() || MemOps.count(&*I)) continue; const TargetInstrDesc &TID = I->getDesc(); if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) @@ -1411,7 +1458,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, std::sort(Ops.begin(), Ops.end(), OffsetCompare()); // The loads / stores of the same base are in order. Scan them from first to - // last and check for the followins: + // last and check for the following: // 1. Any def of base. // 2. Any gaps. while (Ops.size() > 1) { @@ -1474,7 +1521,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, } else { // This is the new location for the loads / stores. MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; - while (InsertPos != MBB->end() && MemOps.count(InsertPos)) + while (InsertPos != MBB->end() + && (MemOps.count(InsertPos) || InsertPos->isDebugValue())) ++InsertPos; // If we are moving a pair of loads / stores, see if it makes sense @@ -1562,7 +1610,9 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { break; } - MI2LocMap[MI] = Loc++; + if (!MI->isDebugValue()) + MI2LocMap[MI] = ++Loc; + if (!isMemoryOp(MI)) continue; unsigned PredReg = 0; diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 0134276..7e57a1c 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -88,6 +88,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; + /// HasITBlocks - True if IT blocks have been inserted. + bool HasITBlocks; + public: ARMFunctionInfo() : isThumb(false), @@ -97,7 +100,8 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), @@ -108,7 +112,8 @@ public: GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -229,6 +234,9 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + bool hasITBlocks() const { return HasITBlocks; } + void setHasITBlocks(bool h) { HasITBlocks = h; } }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 6beca8b..d020f3c 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -153,11 +153,11 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // Pseudo 256-bit registers to represent pairs of Q registers. These should // never be present in the emitted code. -// These are used for NEON load / store instructions, e.g. vld4, vst3. 
-// NOTE: It's possible to define more QQ registers since technical the -// starting D register number doesn't have to be multiple of 4. e.g. -// D1, D2, D3, D4 would be a legal quad. But that would make the sub-register -// stuffs very messy. +// These are used for NEON load / store instructions, e.g., vld4, vst3. +// NOTE: It's possible to define more QQ registers since technically the +// starting D register number doesn't have to be multiple of 4, e.g., +// D1, D2, D3, D4 would be a legal quad, but that would make the subregister +// stuff very messy. let SubRegIndices = [qsub_0, qsub_1] in { let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1), (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1), @@ -183,7 +183,8 @@ let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), (ssub_8 qqsub_1, ssub_0), (ssub_9 qqsub_1, ssub_1), (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3), (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5), - (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in { + (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in +{ def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; } @@ -196,9 +197,9 @@ def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; } // Current Program Status Register. -def CPSR : ARMReg<0, "cpsr">; - -def FPSCR : ARMReg<1, "fpscr">; +def CPSR : ARMReg<0, "cpsr">; +def FPSCR : ARMReg<1, "fpscr">; +def ITSTATE : ARMReg<2, "itstate">; // Register classes. // @@ -348,6 +349,73 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { }]; } +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of +// this class and the preceding one(!) This is what we want. +def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // R9 is available. + static const unsigned ARM_GPR_R9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R9, ARM::R12 }; + // R9 is not available. + static const unsigned ARM_GPR_NOR9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12 }; + + // For Thumb1 mode, we don't want to allocate hi regs at all, as we + // don't know how to spill them. If we make our prologue/epilogue code + // smarter at some point, we can go back to using the above allocation + // orders for the Thumb1 instructions that know how to use hi regs. + static const unsigned THUMB_GPR_AO_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + tcGPRClass::iterator + tcGPRClass::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + if (Subtarget.isThumb1Only()) + return THUMB_GPR_AO_TC; + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + return ARM_GPR_NOR9_TC; + else + return ARM_GPR_R9_TC; + } else + // R9 is either callee-saved or reserved; can't use it. 
+ return ARM_GPR_NOR9_TC; + } + + tcGPRClass::iterator + tcGPRClass::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + GPRClass::iterator I; + + if (Subtarget.isThumb1Only()) { + I = THUMB_GPR_AO_TC + (sizeof(THUMB_GPR_AO_TC)/sizeof(unsigned)); + return I; + } + + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + else + I = ARM_GPR_R9_TC + (sizeof(ARM_GPR_R9_TC)/sizeof(unsigned)); + } else + // R9 is either callee-saved or reserved; can't use it. + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + return I; + } + }]; +} + + // Scalar single precision floating point register class.. def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, @@ -479,4 +547,3 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], // Condition code registers. def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; - diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index bbfc0b2..282abca 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -1,10 +1,10 @@ //=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A8 processors. @@ -32,50 +32,50 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, // // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, - InstrItinData<IIC_iALUsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, - InstrItinData<IIC_iALUsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, // // Unary Instructions that produce a result - InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iUNAsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iUNAsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iCMPsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi, 
[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, // // Move instructions, conditional - InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // Integer multiply pipeline // Result written in E5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - + // Integer load pipeline // // loads have an extra cycle of latency, but are fully pipelined @@ -166,7 +166,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>]>, - + // Branch // // no delay slots, so the latency of a branch is unimportant @@ -276,14 +276,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, + InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -292,7 +292,7 @@ def CortexA8Itineraries : ProcessorItineraries< // // FP 
Load Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, + InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -301,14 +301,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, + InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -317,7 +317,7 @@ def CortexA8Itineraries : ProcessorItineraries< // // FP Store Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, + InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -329,35 +329,35 @@ def CortexA8Itineraries : ProcessorItineraries< // // VLD1 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // VLD2 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>, // // VLD3 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>, // // VLD4 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>, // // VST // FIXME: We don't model this instruction properly - InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, @@ -600,7 +600,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 3, 1]>, + InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -610,9 +610,9 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 1]>, + InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>, InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, 
InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 75320d9..df2f896 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1,10 +1,10 @@ //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A9 processors. @@ -16,7 +16,6 @@ // Reference Manual". // // Functional units -def A9_Issue : FuncUnit; // issue def A9_Pipe0 : FuncUnit; // pipeline 0 def A9_Pipe1 : FuncUnit; // pipeline 1 def A9_LSPipe : FuncUnit; // LS pipe @@ -27,7 +26,121 @@ def A9_DRegsN : FuncUnit; // FP register set, NEON side // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 // def CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [ + [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [ + // Two fully-pipelined integer ALU pipelines + // FIXME: There are no operand latencies for these instructions at all! + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Pipe1], 0>, + 
InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [A9_Pipe1], 0>, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [A9_Pipe1], 0>, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + // Integer load pipeline + // FIXME: The timings are some rough approximations + // + // Immediate offset + InstrItinData<IIC_iLoadi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoadr , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData<IIC_iLoadsi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoadiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoadru , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update + InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple + InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>]>, + + // Integer store pipeline + /// + // Immediate offset + InstrItinData<IIC_iStorei , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iStorer , [InstrStage<1, [ A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData<IIC_iStoresi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStoreiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStoreru , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update + InstrItinData<IIC_iStoresiu, [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>, + // + // Store multiple + InstrItinData<IIC_iStorem , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>]>, + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON // instruction and vice-versa. We model this behavior with two artificial FUs: @@ -39,8 +152,8 @@ def CortexA9Itineraries : ProcessorItineraries< // register file writeback!). // Every NEON instruction does the same but with FUs swapped. // - // Since the reserved FU cannot be acquired this models precisly "cross-domain" - // stalls. + // Since the reserved FU cannot be acquired, this models precisely + // "cross-domain" stalls. // VFP // Issue through integer pipeline, and execute in NEON unit. 
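The Required/Reserved pairing described in the comment above can be read as a small resource game: a VFP instruction acquires A9_DRegsVFP and reserves A9_DRegsN until its writeback, a NEON instruction does the mirror image, so an instruction stalls exactly while the other domain still holds its register-file unit. A toy model of that interaction, not LLVM's scheduler (all names are illustrative):

#include <algorithm>

// One artificial register-file unit per domain; blockedUntil is the
// first cycle at which the unit is free again.
struct RegFileFU {
  RegFileFU() : blockedUntil(0) {}
  int blockedUntil;
};

// An op must acquire its own domain's unit (Required) and blocks the
// other domain's unit (Reserved) until its writeback completes.
// Returns the cycle at which the op actually issues.
int issueOp(int now, RegFileFU &mySide, RegFileFU &otherSide,
            int writebackLatency) {
  int start = std::max(now, mySide.blockedUntil);
  otherSide.blockedUntil = start + writebackLatency;
  return start;
}

A VFP op would issue with issueOp(now, DRegsVFP, DRegsN, latency) and a NEON op with the arguments swapped, which is precisely the cross-domain stall the itinerary entries below encode with Required and Reserved stages.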
@@ -48,21 +161,21 @@ def CortexA9Itineraries : ProcessorItineraries< // FP Special Register to Integer Register File Move InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Unary InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Unary InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // @@ -70,124 +183,124 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Compare InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Single to Double FP Convert InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double to Single FP Convert InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single to Half FP Convert InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Half to Single FP Convert InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Single-Precision FP to Integer Convert InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double-Precision FP to Integer Convert InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Single-Precision FP Convert InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Double-Precision FP Convert InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + 
InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single-precision FP ALU InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-precision FP ALU InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Single-precision FP Multiply InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<6, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, // // Double-precision FP Multiply InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<7, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, // // Single-precision FP MAC InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<9, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, // // Double-precision FP MAC InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<10, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<16, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, // // Double-precision FP DIV InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<26, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, // // Single-precision FP SQRT InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<18, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<13, [A9_NPipe]>], [17, 1]>, + InstrStage<1, [A9_Pipe1]>, + InstrStage<13, [A9_NPipe]>], [17, 1]>, // // Double-precision FP SQRT InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<33, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<28, [A9_NPipe]>], [32, 1]>, // @@ -195,92 +308,79 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Integer to Double-precision Move InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision to Integer Move InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // 
Double-precision to Integer Move InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Load Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Store Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // NEON // Issue through integer pipeline, and execute in NEON unit. - // FIXME: Neon pipeline and LdSt unit are multiplexed. + // FIXME: Neon pipeline and LdSt unit are multiplexed. // Add some syntactic sugar to model this! 
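One detail worth calling out in the rewritten load/store entries above: the trailing 0 in a stage such as InstrStage<1, [A9_Pipe1], 0> is the time increment to the next stage, so a zero means the following stage (here A9_LSPipe) starts in the same cycle even though both units are occupied. Under that simplified reading (a sketch of the semantics, not LLVM's implementation), stage start cycles compose as:

#include <vector>

// Simplified itinerary stage: hold the listed units for `cycles`, then
// start the next stage `timeInc` cycles after this one started. In the
// .td syntax, timeInc defaults to `cycles` when omitted, and a timeInc
// of 0 overlaps this stage with the next one in the same cycle.
struct Stage {
  int cycles;
  int timeInc;
};

int stageStartCycle(const std::vector<Stage> &stages, unsigned index) {
  int cycle = 0;
  for (unsigned i = 0; i < index; ++i)
    cycle += stages[i].timeInc;
  return cycle;
}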
// VLD1 // FIXME: We don't model this instruction properly InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // VLD2 @@ -288,9 +388,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // VLD3 @@ -298,9 +397,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, // // VLD4 @@ -308,9 +406,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, // // VST @@ -318,121 +415,120 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-register Integer Unary InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Quad-register Integer Unary InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Double-register Integer Q-Unary InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Quad-register Integer CountQ-Unary InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double-register Integer Binary InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, 
A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Binary InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Double-register Integer Subtract InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Quad-register Integer Subtract InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Double-register Integer Shift InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Quad-register Integer Shift InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Double-register Integer Shift (4 cycle) InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-register Integer Binary (4 cycle) InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Quad-register Integer Binary (4 cycle) InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Integer Subtract (4 cycle) InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // // Quad-register Integer Subtract (4 cycle) InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // @@ -440,7 +536,7 @@ def CortexA9Itineraries : ProcessorItineraries< 
InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Count @@ -449,35 +545,35 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Double-register Integer Pair Add Long InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, // // Quad-register Integer Pair Add Long InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, // @@ -485,14 +581,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Integer Multiply (.8, .16) InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, // @@ -500,56 +596,56 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, // // Quad-register Integer Multiply (.32) InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, // // 
Double-register Integer Multiply-Accumulate (.32) InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, // // Move Immediate InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3]>, // // Double-register Permute Move InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_LSPipe]>], [2, 1]>, // // Quad-register Permute Move @@ -558,42 +654,42 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1]>, // // Integer to Single-precision Move InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Integer to Double-precision Move InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Single-precision to Integer Move InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Double-precision to Integer Move InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // Integer to Lane Move InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, 
[A9_NPipe]>], [3, 1, 1]>, // @@ -601,7 +697,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2]>, // // Quad-register FP Unary @@ -610,7 +706,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2]>, // // Double-register FP Binary @@ -619,7 +715,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, // // Quad-register FP Binary @@ -630,14 +726,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Double-register FP Multiple-Accumulate InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register FP Multiple-Accumulate @@ -646,28 +742,28 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // // Double-register Reciprical Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Reciprical Step InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [8, 2, 2]>, // // Double-register Permute InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, // // Quad-register Permute @@ -676,7 +772,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) @@ -685,7 +781,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VPERMQ3, 
[InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, // @@ -693,57 +789,57 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Quad-register VEXT InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, // // VTB InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index f813022..08b560c 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -16,7 +16,7 @@ // Functional Units def V6_Pipe : FuncUnit; // pipeline -// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual". 
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" // def ARMV6Itineraries : ProcessorItineraries< [V6_Pipe], [ diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index b4a9252..09203f9 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -60,8 +60,10 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, const std::string &FS) : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? - std::string("e-p:32:32-f64:32:32-i64:32:32-n32") : - std::string("e-p:32:32-f64:64:64-i64:64:64-n32")), + std::string("e-p:32:32-f64:32:32-i64:32:32-" + "v128:32:128-v64:32:64-n32") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "v128:64:128-v64:64:64-n32")), TLInfo(*this), TSInfo(*this) { } @@ -74,9 +76,11 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:32-i64:32:32-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32") : + "i16:16:32-i8:8:32-i1:8:32-" + "v128:32:128-v64:32:64-a:0:32-n32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32")), + "i16:16:32-i8:8:32-i1:8:32-" + "v128:64:128-v64:64:64-a:0:32-n32")), TLInfo(*this), TSInfo(*this) { } @@ -98,6 +102,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); + return true; } @@ -115,21 +120,20 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, // proper scheduling. PM.add(createARMExpandPseudoPass()); - return true; -} - -bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None) { if (!Subtarget.isThumb1Only()) PM.add(createIfConverterPass()); } - - if (Subtarget.isThumb2()) { + if (Subtarget.isThumb2()) PM.add(createThumb2ITBlockPass()); + + return true; +} + +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (Subtarget.isThumb2()) PM.add(createThumb2SizeReductionPass()); - } PM.add(createARMConstantIslandPass()); return true; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index bfa89c4..8415d1a 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -425,7 +425,7 @@ bool ARMAsmParser::ParseMemory(OwningPtr<ARMOperand> &Op) { const AsmToken &NextTok = Parser.getTok(); if (NextTok.isNot(AsmToken::EndOfStatement)) { if (NextTok.isNot(AsmToken::Comma)) - return Error(NextTok.getLoc(), "',' expected"); + return Error(NextTok.getLoc(), "',' expected"); Parser.Lex(); // Eat comma token. 
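One more note on the ARMTargetMachine.cpp hunk above: the new v128/v64 datalayout entries give vectors a 32-bit ABI alignment under APCS while keeping a wider preferred alignment (each entry reads v<bits>:<abi-align>:<pref-align>, in bits). A hedged sketch of reading those entries back, assuming the TargetData API of this LLVM vintage (later renamed DataLayout):

#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
#include "llvm/Target/TargetData.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  // The APCS layout string from the hunk above: 64-bit vectors get
  // 32-bit ABI alignment but 64-bit preferred alignment.
  TargetData TD("e-p:32:32-f64:32:32-i64:32:32-v128:32:128-v64:32:64-n32");
  const VectorType *V2i32 = VectorType::get(Type::getInt32Ty(Ctx), 2);
  unsigned ABIBytes  = TD.getABITypeAlignment(V2i32);  // expect 4
  unsigned PrefBytes = TD.getPrefTypeAlignment(V2i32); // expect 8
  return (ABIBytes == 4 && PrefBytes == 8) ? 0 : 1;
}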
if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, @@ -488,7 +488,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, const AsmToken &Tok = Parser.getTok(); if (ParseShift(ShiftType, ShiftAmount, E)) - return Error(Tok.getLoc(), "shift expected"); + return Error(Tok.getLoc(), "shift expected"); OffsetRegShifted = true; } } @@ -665,7 +665,6 @@ bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc, Operands.push_back(Op.take()); - SMLoc Loc = Parser.getTok().getLoc(); if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. @@ -763,15 +762,10 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) { if (Tok.isNot(AsmToken::Identifier)) return Error(L, "unexpected token in .syntax directive"); const StringRef &Mode = Tok.getString(); - bool unified_syntax; - if (Mode == "unified" || Mode == "UNIFIED") { + if (Mode == "unified" || Mode == "UNIFIED") Parser.Lex(); - unified_syntax = true; - } - else if (Mode == "divided" || Mode == "DIVIDED") { + else if (Mode == "divided" || Mode == "DIVIDED") Parser.Lex(); - unified_syntax = false; - } else return Error(L, "unrecognized syntax mode in .syntax directive"); @@ -791,15 +785,10 @@ bool ARMAsmParser::ParseDirectiveCode(SMLoc L) { if (Tok.isNot(AsmToken::Integer)) return Error(L, "unexpected token in .code directive"); int64_t Val = Parser.getTok().getIntVal(); - bool thumb_mode; - if (Val == 16) { + if (Val == 16) Parser.Lex(); - thumb_mode = true; - } - else if (Val == 32) { + else if (Val == 32) Parser.Lex(); - thumb_mode = false; - } else return Error(L, "invalid operand to .code directive"); diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index d95efdb..6a40cf3 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -175,23 +175,8 @@ namespace { raw_ostream &O); void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - - void printHex8ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); - } - void printHex16ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); - } - void printHex32ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); - } - void printHex64ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); - } + void printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, @@ -322,7 +307,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0); unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1); O << '{' - << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) + << getRegisterName(DRegLo) << ", " << getRegisterName(DRegHi) << '}'; } else if (Modifier && strcmp(Modifier, "lane") == 0) { unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); @@ -617,8 +602,12 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op, O << "[" << getRegisterName(MO1.getReg()); if (MO2.getImm()) { + unsigned Align = MO2.getImm(); + assert((Align == 8 || Align == 
16 || Align == 32) && + "unexpected NEON load/store alignment"); + Align <<= 3; // FIXME: Both darwin as and GNU as violate ARM docs here. - O << ", :" << MO2.getImm(); + O << ", :" << Align; } O << "]"; } @@ -1039,6 +1028,14 @@ void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, } } +void ARMAsmPrinter::printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); +} + bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -1064,20 +1061,10 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, printOperand(MI, OpNum, O); return false; case 'Q': - if (TM.getTargetData()->isLittleEndian()) - break; - // Fallthrough case 'R': - if (TM.getTargetData()->isBigEndian()) - break; - // Fallthrough - case 'H': // Write second word of DI / DF reference. - // Verify that this operand has two consecutive registers. - if (!MI->getOperand(OpNum).isReg() || - OpNum+1 == MI->getNumOperands() || - !MI->getOperand(OpNum+1).isReg()) - return true; - ++OpNum; // Return the high-part. + case 'H': + report_fatal_error("llvm does not support 'Q', 'R', and 'H' modifiers!"); + return true; } } @@ -1384,11 +1371,11 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { } else if (MO.isGlobal()) { MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO); const MCSymbolRefExpr *SymRef1 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_LO16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_LO16, OutContext); const MCSymbolRefExpr *SymRef2 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_HI16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_HI16, OutContext); V1 = MCOperand::CreateExpr(SymRef1); V2 = MCOperand::CreateExpr(SymRef2); } else { diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp index 2b94b76..170819a 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -779,22 +779,10 @@ void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, O << '#' << MI->getOperand(OpNum).getImm(); } -void ARMInstPrinter::printHex8ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); -} - -void ARMInstPrinter::printHex16ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); -} - -void ARMInstPrinter::printHex32ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); -} - -void ARMInstPrinter::printHex64ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); } diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h index be0b7c1..ddf5047 100644 --- 
a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -104,10 +104,7 @@ public: void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex8ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex16ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); // FIXME: Implement. diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 29e66e1..0df3466 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(ARMCodeGen NEONPreAllocPass.cpp Thumb1InstrInfo.cpp Thumb1RegisterInfo.cpp + Thumb2HazardRecognizer.cpp Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp Thumb2RegisterInfo.cpp diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index adb7795..a07ff28 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -34,7 +34,7 @@ /// Uses and Defs by this instr. For the Uses part, the pred:$p operand is /// defined with two components: /// -/// def pred { // Operand PredicateOperand +/// def pred { // Operand PredicateOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printPredicateOperand"; /// string AsmOperandLowerMethod = ?; @@ -54,7 +54,7 @@ /// /// For the Defs part, in the simple case of only cc_out:$s, we have: /// -/// def cc_out { // Operand OptionalDefOperand +/// def cc_out { // Operand OptionalDefOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printSBitModifierOperand"; /// string AsmOperandLowerMethod = ?; @@ -765,7 +765,7 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, || Opcode == ARM::SMC || Opcode == ARM::SVC) && "Unexpected Opcode"); - assert(NumOps >= 1 && OpInfo[0].RegClass == 0 && "Reg operand expected"); + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Reg operand expected"); int Imm32 = 0; if (Opcode == ARM::SMC) { @@ -1106,7 +1106,7 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+2].RegClass == 0) && + (OpInfo[OpIdx+2].RegClass < 0) && "Expect 3 reg operands"); // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1. @@ -1201,7 +1201,7 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? 
ARM_AM::add : ARM_AM::sub; @@ -1323,7 +1323,7 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; @@ -1494,7 +1494,7 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional imm5 LSL/ASR operand. - if (ThreeReg && OpInfo[OpIdx].RegClass == 0 + if (ThreeReg && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 5-bit immediate field Inst{11-7}. unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F; @@ -1540,7 +1540,7 @@ static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional rotate immediate operand. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 2-bit rotate field Inst{11-10}. unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3; @@ -1725,7 +1725,7 @@ static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, "Tied to operand expected"); MI.addOperand(MI.getOperand(0)); - assert(OpInfo[2].RegClass == 0 && !OpInfo[2].isPredicate() && + assert(OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef() && "Imm operand expected"); MI.addOperand(MCOperand::CreateImm(fbits)); @@ -1984,7 +1984,7 @@ static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; // Extract/decode the f64/f32 immediate. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // The asm syntax specifies the before-expanded <imm>. // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), @@ -2077,42 +2077,12 @@ static unsigned decodeLaneIndex(uint32_t insn) { // imm3 = Inst{18-16}, imm4 = Inst{3-0} // Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions. static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) { + unsigned char op = (insn >> 5) & 1; unsigned char cmode = (insn >> 8) & 0xF; unsigned char Imm8 = ((insn >> 24) & 1) << 7 | ((insn >> 16) & 7) << 4 | (insn & 0xF); - uint64_t Imm64 = 0; - - switch (esize) { - case ESize8: - Imm64 = Imm8; - break; - case ESize16: - Imm64 = Imm8 << 8*(cmode >> 1 & 1); - break; - case ESize32: { - if (cmode == 12) - Imm64 = (Imm8 << 8) | 0xFF; - else if (cmode == 13) - Imm64 = (Imm8 << 16) | 0xFFFF; - else { - // Imm8 to be shifted left by how many bytes... 
- Imm64 = Imm8 << 8*(cmode >> 1 & 3); - } - break; - } - case ESize64: { - for (unsigned i = 0; i < 8; ++i) - if ((Imm8 >> i) & 1) - Imm64 |= (uint64_t)0xFF << 8*i; - break; - } - default: - assert(0 && "Unreachable code!"); - return 0; - } - - return Imm64; + return (op << 12) | (cmode << 8) | Imm8; } // A8.6.339 VMUL, VMULL (by scalar) @@ -2303,7 +2273,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2320,7 +2290,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, "Reg operand expected"); RegClass = OpInfo[OpIdx].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2329,7 +2299,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2340,7 +2310,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, // possible TIED_TO DPR/QPR's (ignored), then possible lane index. RegClass = OpInfo[0].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2355,7 +2325,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2366,7 +2336,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; } - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { assert(TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1 && "Tied to operand expected"); MI.addOperand(MCOperand::CreateReg(0)); @@ -2374,7 +2344,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. 
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2438,7 +2408,7 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && (OpInfo[0].RegClass == ARM::DPRRegClassID || OpInfo[0].RegClass == ARM::QPRRegClassID) && - (OpInfo[1].RegClass == 0) && + (OpInfo[1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); // Qd/Dd = Inst{22:15-12} => NEON Rd @@ -2552,7 +2522,7 @@ static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn, } // Add the imm operand, if required. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { unsigned imm = 0xFFFFFFFF; @@ -2632,7 +2602,7 @@ static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, decodeNEONRm(insn)))); ++OpIdx; - assert(OpInfo[OpIdx].RegClass == 0 && "Imm operand expected"); + assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected"); // Add the imm operand. @@ -2762,7 +2732,7 @@ static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, getRegisterEnum(B, OpInfo[OpIdx].RegClass, m))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the imm operand. unsigned Imm = 0; @@ -2869,15 +2839,9 @@ static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -static bool DisassembleNEONFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - assert(0 && "Unreachable code!"); - return false; -} - // Vector Get Lane (move scalar to ARM core register) Instructions. // VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index -static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2887,7 +2851,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(TID.getNumDefs() == 1 && NumOps >= 3 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::DPRRegClassID && - OpInfo[2].RegClass == 0 && + OpInfo[2].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2911,7 +2875,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Set Lane (move ARM core register to scalar) Instructions. 
// VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index -static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2923,7 +2887,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpInfo[1].RegClass == ARM::DPRRegClassID && TID.getOperandConstraint(1, TOI::TIED_TO) != -1 && OpInfo[2].RegClass == ARM::GPRRegClassID && - OpInfo[3].RegClass == 0 && + OpInfo[3].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2950,7 +2914,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Duplicate Instructions (from ARM core register to all elements). // VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt -static bool DisassembleNEONDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; @@ -3090,13 +3054,6 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; } -static bool DisassembleThumbMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - - assert(0 && "Unexpected thumb misc. instruction!"); - return false; -} - /// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP. /// We divide the disassembly task into different categories, with each one /// corresponding to a specific instruction encoding format. There could be @@ -3128,12 +3085,10 @@ static const DisassembleFP FuncPtrs[] = { &DisassembleVFPLdStMulFrm, &DisassembleVFPMiscFrm, &DisassembleThumbFrm, - &DisassembleNEONFrm, - &DisassembleNEONGetLnFrm, - &DisassembleNEONSetLnFrm, - &DisassembleNEONDupFrm, &DisassembleMiscFrm, - &DisassembleThumbMiscFrm, + &DisassembleNGetLnFrm, + &DisassembleNSetLnFrm, + &DisassembleNDupFrm, // VLD and VST (including one lane) Instructions. &DisassembleNLdSt, @@ -3233,7 +3188,8 @@ bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). @@ -3265,7 +3221,8 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). 
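Two recurring patterns in the disassembler diff above deserve a note. The many RegClass == 0 to RegClass < 0 rewrites track a change in how operand info flags immediates: a non-register operand now carries a negative register-class value, so testing for zero no longer identifies it. Separately, decodeN1VImm no longer expands the NEON modified immediate itself; it returns the packed op:cmode:imm8 value, and the new printNEONModImmOperand shown earlier expands it with ARM_AM::decodeNEONModImm at print time. The sketch below reconstructs, as a standalone C++ function, the expansion the deleted switch used to perform; expandNEONModImm is an illustrative name rather than the actual ARM_AM helper, and the element size is assumed to have been derived from cmode already.

#include <cassert>
#include <cstdint>

// Expand an 8-bit NEON modified immediate to its full value, mirroring the
// per-element-size cases of the decodeN1VImm switch removed above.
static uint64_t expandNEONModImm(uint8_t Imm8, unsigned EltBits,
                                 unsigned Cmode) {
  switch (EltBits) {
  case 8:   // i8: the immediate is used as-is.
    return Imm8;
  case 16:  // i16: imm8 in the low or the high byte of each halfword.
    return uint64_t(Imm8) << (8 * ((Cmode >> 1) & 1));
  case 32:  // i32: byte-shifted imm8, or the "shift ones" forms.
    if (Cmode == 12)
      return (uint64_t(Imm8) << 8) | 0xFF;
    if (Cmode == 13)
      return (uint64_t(Imm8) << 16) | 0xFFFF;
    return uint64_t(Imm8) << (8 * ((Cmode >> 1) & 3));
  case 64: { // i64: each imm8 bit selects an all-ones byte.
    uint64_t Imm64 = 0;
    for (unsigned i = 0; i != 8; ++i)
      if ((Imm8 >> i) & 1)
        Imm64 |= uint64_t(0xFF) << (8 * i);
    return Imm64;
  }
  }
  assert(0 && "unexpected element size");
  return 0;
}

Keeping the packed 13-bit form in the MCInst means a single immediate operand is enough for the printer to recover both the value and the element width, which is presumably why decodeNEONModImm returns EltBits through an out-parameter.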
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h index b1d90df..7d21256 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -137,25 +137,25 @@ static inline void setSlice(uint32_t &Bits, unsigned From, unsigned To, /// Various utilities for checking the target specific flags. /// A unary data processing instruction doesn't have an Rn operand. -static inline bool isUnaryDP(unsigned TSFlags) { +static inline bool isUnaryDP(uint64_t TSFlags) { return (TSFlags & ARMII::UnaryDP); } /// This four-bit field describes the addressing mode used. /// See also ARMBaseInstrInfo.h. -static inline unsigned getAddrMode(unsigned TSFlags) { +static inline unsigned getAddrMode(uint64_t TSFlags) { return (TSFlags & ARMII::AddrModeMask); } /// {IndexModePre, IndexModePost} /// Only valid for load and store ops. /// See also ARMBaseInstrInfo.h. -static inline unsigned getIndexMode(unsigned TSFlags) { +static inline unsigned getIndexMode(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; } /// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList. -static inline bool isPrePostLdSt(unsigned TSFlags) { +static inline bool isPrePostLdSt(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) != 0; } diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 4b2e308..4b7a0bf 100644 --- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -395,7 +395,7 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, getT1tRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) @@ -531,7 +531,7 @@ static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -598,7 +598,7 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, assert(OpIdx < NumOps && "More operands expected"); - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() && + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(Imm5 ? 
getT1Imm5(insn) : 0)); @@ -632,7 +632,7 @@ static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -658,7 +658,7 @@ static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -685,7 +685,7 @@ static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -761,7 +761,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, // Predicate operands are handled elsewhere. if (NumOps == 2 && OpInfo[0].isPredicate() && OpInfo[1].isPredicate() && - OpInfo[0].RegClass == 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { + OpInfo[0].RegClass < 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { return true; } @@ -808,7 +808,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, } assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass==0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) + (OpInfo[1].RegClass < 0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) && "Expect >=2 operands"); // Add the destination operand. @@ -913,7 +913,7 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 3 && OpInfo[0].RegClass == 0 && + assert(NumOps == 3 && OpInfo[0].RegClass < 0 && OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID && "Exactly 3 operands expected"); @@ -939,7 +939,7 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 1 && OpInfo[0].RegClass == 0 && "1 imm operand expected"); + assert(NumOps == 1 && OpInfo[0].RegClass < 0 && "1 imm operand expected"); unsigned Imm11 = getT1Imm11(insn); @@ -1239,7 +1239,7 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && OpInfo[2].RegClass == ARM::GPRRegClassID - && OpInfo[3].RegClass == 0 + && OpInfo[3].RegClass < 0 && "Expect >= 4 operands and first 3 as reg operands"); // Add the <Rt> <Rt2> operands. @@ -1322,8 +1322,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps == 4 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID - && OpInfo[2].RegClass == 0 - && OpInfo[3].RegClass == 0 + && OpInfo[2].RegClass < 0 + && OpInfo[3].RegClass < 0 && "Exactly 4 operands expected and first two as reg operands"); // Only need to populate the src reg operand.
MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -1375,7 +1375,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == OpIdx) return true; - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { if (Thumb2ShiftOpcode(Opcode)) @@ -1440,7 +1440,7 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, } // The modified immediate operand should come next. - assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1555,7 +1555,7 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1772,7 +1772,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); int Offset = 0; @@ -1792,7 +1792,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, } ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs. MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4))); @@ -1818,7 +1818,7 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == 0 && + OpInfo[1].RegClass < 0 && "Expect >= 2 operands, first as reg, and second as imm operand"); // Build the register operand, followed by the (+/-)imm12 immediate. @@ -1930,7 +1930,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1981,7 +1981,7 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, decodeRm(insn)))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the rotation amount immediate. MI.addOperand(MCOperand::CreateImm(decodeRotate(insn))); diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp index 0a4400c..bbdd3c7 100644 --- a/lib/Target/ARM/NEONMoveFix.cpp +++ b/lib/Target/ARM/NEONMoveFix.cpp @@ -105,8 +105,8 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) { unsigned MOReg = MO.getReg(); Defs[MOReg] = MI; - // Catch subregs as well. - for (const unsigned *R = TRI->getSubRegisters(MOReg); *R; ++R) + // Catch aliases as well. 
+ for (const unsigned *R = TRI->getAliasSet(MOReg); *R; ++R) Defs[*R] = MI; } } diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index a725898..f67717c 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -407,7 +407,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, "expected a virtual register"); // Extracting from a Q or QQ register. MachineInstr *DefMI = MRI->getVRegDef(VirtReg); - if (!DefMI || !DefMI->isExtractSubreg()) + if (!DefMI || !DefMI->isCopy() || !DefMI->getOperand(1).getSubReg()) return false; VirtReg = DefMI->getOperand(1).getReg(); if (LastSrcReg && LastSrcReg != VirtReg) @@ -418,7 +418,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, RC != ARM::QQPRRegisterClass && RC != ARM::QQQQPRRegisterClass) return false; - unsigned SubIdx = DefMI->getOperand(2).getImm(); + unsigned SubIdx = DefMI->getOperand(1).getSubReg(); if (LastSubIdx) { if (LastSubIdx != SubIdx-Stride) return false; @@ -434,22 +434,21 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is // currently required for correctness. e.g. - // %reg1041;<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 + // %reg1041<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 // %reg1042<def> = EXTRACT_SUBREG %reg1041, 6 // %reg1043<def> = EXTRACT_SUBREG %reg1041, 5 // VST1q16 %reg1025<kill>, 0, %reg1043<kill>, %reg1042<kill>, - // reg1025 and reg1043 should be replaced with reg1041:6 and reg1041:5 + // reg1042 and reg1043 should be replaced with reg1041:6 and reg1041:5 // respectively. // We need to change how we model uses of REG_SEQUENCE. for (unsigned R = 0; R < NumRegs; ++R) { MachineOperand &MO = MI->getOperand(FirstOpnd + R); unsigned OldReg = MO.getReg(); MachineInstr *DefMI = MRI->getVRegDef(OldReg); - assert(DefMI->isExtractSubreg()); + assert(DefMI->isCopy()); MO.setReg(LastSrcReg); MO.setSubReg(SubIds[R]); - if (R != 0) - MO.setIsKill(false); + MO.setIsKill(false); // Delete the EXTRACT_SUBREG if its result is now dead. if (MRI->use_empty(OldReg)) DefMI->eraseFromParent(); @@ -467,43 +466,9 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { unsigned FirstOpnd, NumRegs, Offset, Stride; if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) continue; - if (llvm::ModelWithRegSequence() && - FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) + if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) continue; - - MachineBasicBlock::iterator NextI = llvm::next(MBBI); - for (unsigned R = 0; R < NumRegs; ++R) { - MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - - // For now, just assign a fixed set of adjacent registers. - // This leaves plenty of room for future improvements. - static const unsigned NEONDRegs[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 - }; - MO.setReg(NEONDRegs[Offset + R * Stride]); - - if (MO.isUse()) { - // Insert a copy from VirtReg. 
- TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg, - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - if (MO.isKill()) { - MachineInstr *CopyMI = prior(MBBI); - CopyMI->findRegisterUseOperand(VirtReg)->setIsKill(); - } - MO.setIsKill(); - } else if (MO.isDef() && !MO.isDead()) { - // Add a copy to VirtReg. - TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(), - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - } - } + llvm_unreachable("expected a REG_SEQUENCE"); } return Modified; diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index fae84d4..af630ac 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -33,64 +33,24 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; - } - } - - return false; -} - -bool Thumb1InstrInfo:: -canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - return false; - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- return false; - } - return true; - } - } - - return false; +void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && + "Thumb1 can only copy GPR registers"); } void Thumb1InstrInfo:: @@ -175,10 +135,10 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, isKill = false; } - if (isKill) { + if (isKill) MBB.addLiveIn(Reg); - MIB.addReg(Reg, RegState::Kill); - } + + MIB.addReg(Reg, getKillRegState(isKill)); } return true; } @@ -221,46 +181,3 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } - -MachineInstr *Thumb1InstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - break; - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0)); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- break; - bool isDead = MI->getOperand(0).isDead(); - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) - .addReg(DstReg, - RegState::Define | getDeadRegState(isDead)) - .addFrameIndex(FI).addImm(0)); - } - break; - } - } - - return NewMI; -} diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index c937296..555135a 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -46,12 +46,10 @@ public: const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const; - bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -64,20 +62,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } }; } diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 2f635fe..39b70b4 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -68,21 +68,6 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } -const TargetRegisterClass* -Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const { - if (isARMLowRegister(Reg)) - return ARM::tGPRRegisterClass; - switch (Reg) { - default: - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC: - return ARM::GPRRegisterClass; - } - - return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT); -} - bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); @@ -410,6 +395,8 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, // before that instead and adjust the UseMI. bool done = false; for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + if (II->isDebugValue()) + continue; // If this instruction affects R12, adjust our restore point. for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { const MachineOperand &MO = II->getOperand(i); diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 4eca367..9a0308af 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -38,9 +38,6 @@ public: unsigned PredReg = 0) const; /// Code Generation virtual methods... - const TargetRegisterClass * - getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const; - bool hasReservedCallFrame(MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, @@ -51,7 +48,8 @@ public: // could not be handled directly in MI. 
int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int Offset, - unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const; + unsigned MOVOpc, unsigned ADDriOpc, + unsigned SUBriOpc) const; bool saveScavengerRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/lib/Target/ARM/Thumb2HazardRecognizer.cpp new file mode 100644 index 0000000..172908d --- /dev/null +++ b/lib/Target/ARM/Thumb2HazardRecognizer.cpp @@ -0,0 +1,53 @@ +//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "Thumb2HazardRecognizer.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +using namespace llvm; + +ScheduleHazardRecognizer::HazardType +Thumb2HazardRecognizer::getHazardType(SUnit *SU) { + if (ITBlockSize) { + MachineInstr *MI = SU->getInstr(); + if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + } + + return PostRAHazardRecognizer::getHazardType(SU); +} + +void Thumb2HazardRecognizer::Reset() { + ITBlockSize = 0; + PostRAHazardRecognizer::Reset(); +} + +void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. + do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + PostRAHazardRecognizer::EmitInstruction(SU); +} diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.h b/lib/Target/ARM/Thumb2HazardRecognizer.h new file mode 100644 index 0000000..4726658 --- /dev/null +++ b/lib/Target/ARM/Thumb2HazardRecognizer.h @@ -0,0 +1,40 @@ +//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling Thumb2 functions on +// ARM processors. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2HAZARDRECOGNIZER_H +#define THUMB2HAZARDRECOGNIZER_H + +#include "llvm/CodeGen/PostRAHazardRecognizer.h" + +namespace llvm { + +class MachineInstr; + +class Thumb2HazardRecognizer : public PostRAHazardRecognizer { + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. 
+ MachineInstr *ITBlockMIs[4]; + +public: + Thumb2HazardRecognizer(const InstrItineraryData &ItinData) : + PostRAHazardRecognizer(ItinData) {} + + virtual HazardType getHazardType(SUnit *SU); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); +}; + + +} // end namespace llvm + +#endif // THUMB2HAZARDRECOGNIZER_H diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index f36d4ef..cd15bbe 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -14,17 +14,23 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); namespace { - struct Thumb2ITBlockPass : public MachineFunctionPass { + class Thumb2ITBlockPass : public MachineFunctionPass { + bool PreRegAlloc; + + public: static char ID; Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; ARMFunctionInfo *AFI; virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -34,61 +40,167 @@ namespace { } private: - bool InsertITBlocks(MachineBasicBlock &MBB); + bool MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses); + bool InsertITInstructions(MachineBasicBlock &MBB); }; char Thumb2ITBlockPass::ID = 0; } -static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){ - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) - return ARMCC::AL; - return llvm::getInstrPredicate(MI, PredReg); +/// TrackDefUses - Track what registers are being defined and used by +/// instructions in the IT block. This also tracks "dependencies", i.e. uses +/// in the IT block that are defined before the IT instruction. +static void TrackDefUses(MachineInstr *MI, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 4> LocalDefs; + SmallVector<unsigned, 4> LocalUses; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) + continue; + if (MO.isUse()) + LocalUses.push_back(Reg); + else + LocalDefs.push_back(Reg); + } + + for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { + unsigned Reg = LocalUses[i]; + Uses.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Uses.insert(*Subreg); + } + + for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Reg = LocalDefs[i]; + Defs.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Defs.insert(*Subreg); + if (Reg == ARM::CPSR) + continue; + } +} + +bool +Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses) { + unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; + if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { + assert(SrcSubIdx == 0 && DstSubIdx == 0 && + "Sub-register indices still around?"); + // llvm models selects as two-address instructions.
That means a copy + // is inserted before a t2MOVccr, etc. If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy. + MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; + } + } + return false; } -bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) { +bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { bool Modified = false; + SmallSet<unsigned, 4> Defs; + SmallSet<unsigned, 4> Uses; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); unsigned PredReg = 0; - ARMCC::CondCodes CC = getPredicate(MI, PredReg); - + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); if (CC == ARMCC::AL) { ++MBBI; continue; } + Defs.clear(); + Uses.clear(); + TrackDefUses(MI, Defs, Uses, TRI); + // Insert an IT instruction. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) .addImm(CC); + + // Add implicit use of ITSTATE to IT block instructions. + MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*isDef*/, + true/*isImp*/, false/*isKill*/)); + + MachineInstr *LastITMI = MI; + MachineBasicBlock::iterator InsertPos = MIB; ++MBBI; - // Finalize IT mask. + // Form IT block. ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); unsigned Mask = 0, Pos = 3; // Branches, including tricky ones like LDM_RET, need to end an IT // block so check the instruction we just put in the block. - while (MBBI != E && Pos && - (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) { + for (; MBBI != E && Pos && + (!MI->getDesc().isBranch() && !MI->getDesc().isReturn()) ; ++MBBI) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; MI = NMI; - DebugLoc ndl = NMI->getDebugLoc(); + unsigned NPredReg = 0; - ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg); - if (NCC == CC || NCC == OCC) + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) { Mask |= (NCC & 1) << Pos; - else + // Add implicit use of ITSTATE. + NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*isDef*/, + true/*isImp*/, false/*isKill*/)); + LastITMI = NMI; + } else { + if (NCC == ARMCC::AL && + MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) { + --MBBI; + MBB.remove(NMI); + MBB.insert(InsertPos, NMI); + ++NumMovedInsts; + continue; + } break; + } + TrackDefUses(NMI, Defs, Uses, TRI); --Pos; - ++MBBI; } + + // Finalize IT mask. Mask |= (1 << Pos); // Tag along (firstcond[0] << 4) with the mask. Mask |= (CC & 1) << 4; MIB.addImm(Mask); + + // Last instruction in IT block kills ITSTATE.
+ LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + Modified = true; ++NumITs; } @@ -100,17 +212,21 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); AFI = Fn.getInfo<ARMFunctionInfo>(); TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); if (!AFI->isThumbFunction()) return false; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { MachineBasicBlock &MBB = *MFI; - Modified |= InsertITBlocks(MBB); + ++MFI; + Modified |= InsertITInstructions(MBB); } + if (Modified) + AFI->setHasITBlocks(true); + return Modified; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 531d5e9..ee51727 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -17,15 +17,27 @@ #include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" +#include "Thumb2HazardRecognizer.h" +#include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/ADT/SmallVector.h" -#include "Thumb2InstrInfo.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +static cl::opt<unsigned> +IfCvtLimit("thumb2-ifcvt-limit", cl::Hidden, + cl::desc("Thumb2 if-conversion limit (default 3)"), + cl::init(3)); + +static cl::opt<unsigned> +IfCvtDiamondLimit("thumb2-ifcvt-diamond-limit", cl::Hidden, + cl::desc("Thumb2 diamond if-conversion limit (default 3)"), + cl::init(3)); + Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI(*this, STI) { } @@ -35,33 +47,99 @@ unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool -Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; +void +Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); + if (!AFI->hasITBlocks()) { + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + return; + } + + // If the first instruction of Tail is predicated, we may have to update + // the IT instruction. + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(Tail, PredReg); + MachineBasicBlock::iterator MBBI = Tail; + if (CC != ARMCC::AL) + // Expecting at least the t2IT instruction before it. + --MBBI; + + // Actually replace the tail. 
+ TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + + // Fix up IT. + if (CC != ARMCC::AL) { + MachineBasicBlock::iterator E = MBB->begin(); + unsigned Count = 4; // At most 4 instructions in an IT block. + while (Count && MBBI != E) { + if (MBBI->isDebugValue()) { + --MBBI; + continue; + } + if (MBBI->getOpcode() == ARM::t2IT) { + unsigned Mask = MBBI->getOperand(1).getImm(); + if (Count == 4) + MBBI->eraseFromParent(); + else { + unsigned MaskOn = 1 << Count; + unsigned MaskOff = ~(MaskOn - 1); + MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn); + } + return; + } + --MBBI; + --Count; } + + // Control flow can reach here if branch folding is run before the IT block + // formation pass. } +} + +bool +Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + unsigned PredReg = 0; + return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; +} +bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs <= IfCvtLimit; +} + +bool Thumb2InstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + // FIXME: Catch optimizations such as: + // r0 = movne + // r0 = moveq + return NumT && NumF && + NumT <= (IfCvtDiamondLimit) && NumF <= (IfCvtDiamondLimit); +} + +void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { // Handle SPR, DPR, and QPR copies. - return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC, DL); + if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) + return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); + + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void Thumb2InstrInfo:: @@ -69,7 +147,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -94,7 +173,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -113,6 +193,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } +ScheduleHazardRecognizer *Thumb2InstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { + return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II); +} + void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, @@ -131,14 +216,14 @@
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
 // Use a movw to materialize the 16-bit constant.
 BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
 .addImm(NumBytes)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ .addImm((unsigned)Pred).addReg(PredReg);
 Fits = true;
 } else if ((NumBytes & 0xffff) == 0) {
 // Use a movt to materialize the 32-bit constant.
 BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
 .addReg(DestReg)
 .addImm(NumBytes >> 16)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+ .addImm((unsigned)Pred).addReg(PredReg);
 Fits = true;
 }
@@ -502,3 +587,54 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
 Offset = (isSub) ? -Offset : Offset;
 return Offset == 0;
 }
+
+/// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the
+/// two-address instruction inserted by the two-address pass.
+void
+Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI,
+ MachineInstr *UseMI,
+ const TargetRegisterInfo &TRI) const {
+ if (SrcMI->getOpcode() != ARM::tMOVgpr2gpr ||
+ SrcMI->getOperand(1).isKill())
+ return;
+
+ unsigned PredReg = 0;
+ ARMCC::CondCodes CC = llvm::getInstrPredicate(UseMI, PredReg);
+ if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+ return;
+
+ // Schedule the copy so it doesn't come between previous instructions
+ // and UseMI which can form an IT block.
+ unsigned SrcReg = SrcMI->getOperand(1).getReg();
+ ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+ MachineBasicBlock *MBB = UseMI->getParent();
+ MachineBasicBlock::iterator MBBI = SrcMI;
+ unsigned NumInsts = 0;
+ while (--MBBI != MBB->begin()) {
+ if (MBBI->isDebugValue())
+ continue;
+
+ MachineInstr *NMI = &*MBBI;
+ ARMCC::CondCodes NCC = llvm::getInstrPredicate(NMI, PredReg);
+ if (!(NCC == CC || NCC == OCC) ||
+ NMI->modifiesRegister(SrcReg, &TRI) ||
+ NMI->definesRegister(ARM::CPSR))
+ break;
+ if (++NumInsts == 4)
+ // Too many in a row!
+ return;
+ }
+
+ if (NumInsts) {
+ MBB->remove(SrcMI);
+ MBB->insert(++MBBI, SrcMI);
+ }
+}
+
+ARMCC::CondCodes
+llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+ return ARMCC::AL;
+ return llvm::getInstrPredicate(MI, PredReg);
+}
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 2948770..3a9f8b1 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -20,7 +20,8 @@
 #include "Thumb2RegisterInfo.h"
 namespace llvm {
- class ARMSubtarget;
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
 class Thumb2InstrInfo : public ARMBaseInstrInfo {
 Thumb2RegisterInfo RI;
@@ -31,12 +32,21 @@ public:
 // if there is not such an opcode. 
 unsigned getUnindexedOpcode(unsigned Opc) const;
- bool copyRegToReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SrcReg,
- const TargetRegisterClass *DestRC,
- const TargetRegisterClass *SrcRC,
- DebugLoc DL) const;
+ void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+ MachineBasicBlock *NewDest) const;
+
+ bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
+ MachineBasicBlock &FMBB, unsigned NumFInstrs) const;
+
+ void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const;
 void storeRegToStackSlot(MachineBasicBlock &MBB,
 MachineBasicBlock::iterator MBBI,
@@ -50,12 +60,27 @@ public:
 const TargetRegisterClass *RC,
 const TargetRegisterInfo *TRI) const;
+
+ /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the
+ /// two-address instruction inserted by the two-address pass.
+ void scheduleTwoAddrSource(MachineInstr *SrcMI, MachineInstr *UseMI,
+ const TargetRegisterInfo &TRI) const;
+
 /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
 /// such, whenever a client has an instance of instruction info, it should
 /// always be able to get register info as well (through this method).
 ///
 const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
+
+ ScheduleHazardRecognizer *
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const;
 };
+
+/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
+/// to llvm::getInstrPredicate except it returns AL for conditional branch
+/// instructions which are "predicated", but are not in IT blocks.
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+
 }
 #endif // THUMB2INSTRUCTIONINFO_H
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 8fe2e42..ba392f3 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -451,11 +451,18 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
 if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
 return false;
- const TargetInstrDesc &TID = MI->getDesc();
 unsigned Reg0 = MI->getOperand(0).getReg();
 unsigned Reg1 = MI->getOperand(1).getReg();
- if (Reg0 != Reg1)
- return false;
+ if (Reg0 != Reg1) {
+ // Try to commute the operands to make it a 2-address instruction.
+ unsigned CommOpIdx1, CommOpIdx2;
+ if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+ CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+ return false;
+ MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+ if (!CommutedMI)
+ return false;
+ }
 if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
 return false;
 if (Entry.Imm2Limit) {
@@ -484,6 +491,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
 bool HasCC = false;
 bool CCDead = false;
+ const TargetInstrDesc &TID = MI->getDesc();
 if (TID.hasOptionalDef()) {
 unsigned NumOps = TID.getNumOperands();
 HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
@@ -689,7 +697,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 goto ProcessNext;
 }
- // Try to transform ro a 16-bit non-two-address instruction.
+ // Try to transform to a 16-bit non-two-address instruction. 
if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp index 1d85f12..ea78bf3 100644 --- a/lib/Target/Alpha/AlphaISelLowering.cpp +++ b/lib/Target/Alpha/AlphaISelLowering.cpp @@ -224,6 +224,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -251,7 +252,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -425,7 +426,7 @@ AlphaTargetLowering::LowerFormalArguments(SDValue Chain, } } else { //more args // Create the frame index object for this incoming parameter... - int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6), true, false); + int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6), true); // Create the SelectionDAG nodes corresponding to a load //from this parameter @@ -444,7 +445,7 @@ AlphaTargetLowering::LowerFormalArguments(SDValue Chain, if (TargetRegisterInfo::isPhysicalRegister(args_int[i])) args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass); SDValue argt = DAG.getCopyFromReg(Chain, dl, args_int[i], MVT::i64); - int FI = MFI->CreateFixedObject(8, -8 * (6 - i), true, false); + int FI = MFI->CreateFixedObject(8, -8 * (6 - i), true); if (i == 0) FuncInfo->setVarArgsBase(FI); SDValue SDFI = DAG.getFrameIndex(FI, MVT::i64); LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0, @@ -453,7 +454,7 @@ AlphaTargetLowering::LowerFormalArguments(SDValue Chain, if (TargetRegisterInfo::isPhysicalRegister(args_float[i])) args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass); argt = DAG.getCopyFromReg(Chain, dl, args_float[i], MVT::f64); - FI = MFI->CreateFixedObject(8, - 8 * (12 - i), true, false); + FI = MFI->CreateFixedObject(8, - 8 * (12 - i), true); SDFI = DAG.getFrameIndex(FI, MVT::i64); LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0, false, false, 0)); @@ -470,6 +471,7 @@ SDValue AlphaTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { SDValue Copy = DAG.getCopyToReg(Chain, dl, Alpha::R26, @@ -483,7 +485,7 @@ AlphaTargetLowering::LowerReturn(SDValue Chain, break; //return SDValue(); // ret void is legal case 1: { - EVT ArgVT = Outs[0].Val.getValueType(); + EVT ArgVT = Outs[0].VT; unsigned ArgReg; if (ArgVT.isInteger()) ArgReg = Alpha::R0; @@ -492,13 +494,13 @@ AlphaTargetLowering::LowerReturn(SDValue Chain, ArgReg = Alpha::F0; } Copy = DAG.getCopyToReg(Copy, dl, ArgReg, - Outs[0].Val, Copy.getValue(1)); + OutVals[0], Copy.getValue(1)); if (DAG.getMachineFunction().getRegInfo().liveout_empty()) DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg); break; } case 2: { - EVT ArgVT = Outs[0].Val.getValueType(); + EVT ArgVT = Outs[0].VT; unsigned ArgReg1, ArgReg2; if (ArgVT.isInteger()) { ArgReg1 = Alpha::R0; @@ -509,13 +511,13 @@ AlphaTargetLowering::LowerReturn(SDValue Chain, ArgReg2 = Alpha::F1; } Copy = DAG.getCopyToReg(Copy, dl, 
ArgReg1, - Outs[0].Val, Copy.getValue(1)); + OutVals[0], Copy.getValue(1)); if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(), DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg1) == DAG.getMachineFunction().getRegInfo().liveout_end()) DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg1); Copy = DAG.getCopyToReg(Copy, dl, ArgReg2, - Outs[1].Val, Copy.getValue(1)); + OutVals[1], Copy.getValue(1)); if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(), DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg2) == DAG.getMachineFunction().getRegInfo().liveout_end()) @@ -539,7 +541,7 @@ void AlphaTargetLowering::LowerVAARG(SDNode *N, SDValue &Chain, false, false, 0); SDValue Tmp = DAG.getNode(ISD::ADD, dl, MVT::i64, VAListP, DAG.getConstant(8, MVT::i64)); - SDValue Offset = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Base.getValue(1), + SDValue Offset = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, dl, Base.getValue(1), Tmp, NULL, 0, MVT::i32, false, false, 0); DataPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Base, Offset); if (N->getValueType(0).isFloatingPoint()) @@ -643,10 +645,12 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, case ISD::GlobalAddress: { GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSDN->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i64, GSDN->getOffset()); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i64, + GSDN->getOffset()); // FIXME there isn't really any debug info here - // if (!GV->hasWeakLinkage() && !GV->isDeclaration() && !GV->hasLinkOnceLinkage()) { + // if (!GV->hasWeakLinkage() && !GV->isDeclaration() + // && !GV->hasLinkOnceLinkage()) { if (GV->hasLocalLinkage()) { SDValue Hi = DAG.getNode(AlphaISD::GPRelHi, dl, MVT::i64, GA, DAG.getGLOBAL_OFFSET_TABLE(MVT::i64)); @@ -702,7 +706,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SDValue Result; if (Op.getValueType() == MVT::i32) - Result = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Chain, DataPtr, + Result = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, dl, Chain, DataPtr, NULL, 0, MVT::i32, false, false, 0); else Result = DAG.getLoad(Op.getValueType(), dl, Chain, DataPtr, NULL, 0, @@ -722,7 +726,7 @@ SDValue AlphaTargetLowering::LowerOperation(SDValue Op, false, false, 0); SDValue NP = DAG.getNode(ISD::ADD, dl, MVT::i64, SrcP, DAG.getConstant(8, MVT::i64)); - Val = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Result, + Val = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, dl, Result, NP, NULL,0, MVT::i32, false, false, 0); SDValue NPD = DAG.getNode(ISD::ADD, dl, MVT::i64, DestP, DAG.getConstant(8, MVT::i64)); @@ -863,7 +867,10 @@ AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *llscMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - sinkMBB->transferSuccessors(thisMBB); + sinkMBB->splice(sinkMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(MI)), + thisMBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(thisMBB); F->insert(It, llscMBB); F->insert(It, sinkMBB); @@ -912,7 +919,7 @@ AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, thisMBB->addSuccessor(llscMBB); llscMBB->addSuccessor(llscMBB); llscMBB->addSuccessor(sinkMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. 
return sinkMBB; } diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h index 7ee823a..46e0c7d 100644 --- a/lib/Target/Alpha/AlphaISelLowering.h +++ b/lib/Target/Alpha/AlphaISelLowering.h @@ -121,6 +121,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -129,6 +130,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; }; } diff --git a/lib/Target/Alpha/AlphaInstrFormats.td b/lib/Target/Alpha/AlphaInstrFormats.td index d984556..6f4ebf2 100644 --- a/lib/Target/Alpha/AlphaInstrFormats.td +++ b/lib/Target/Alpha/AlphaInstrFormats.td @@ -182,7 +182,7 @@ class OForm4<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, Inst bits<5> Rb; bits<7> Function = fun; -// let isTwoAddress = 1; +// let Constraints = "$RFALSE = $RDEST"; let Inst{25-21} = Ra; let Inst{20-16} = Rb; let Inst{15-13} = 0; @@ -223,7 +223,7 @@ class OForm4L<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, Ins bits<8> LIT; bits<7> Function = fun; -// let isTwoAddress = 1; +// let Constraints = "$RFALSE = $RDEST"; let Inst{25-21} = Ra; let Inst{20-13} = LIT; let Inst{12} = 1; diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp index 3aba363..ad625a2 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.cpp +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -110,9 +110,8 @@ static bool isAlphaIntCondCode(unsigned Opcode) { unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 2 || Cond.size() == 0) && "Alpha branch conditions have two components!"); @@ -120,58 +119,47 @@ unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB, // One-way branch. if (FBB == 0) { if (Cond.empty()) // Unconditional branch - BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(TBB); + BuildMI(&MBB, DL, get(Alpha::BR)).addMBB(TBB); else // Conditional branch if (isAlphaIntCondCode(Cond[0].getImm())) - BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I)) + BuildMI(&MBB, DL, get(Alpha::COND_BRANCH_I)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); else - BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F)) + BuildMI(&MBB, DL, get(Alpha::COND_BRANCH_F)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); return 1; } // Two-way Conditional Branch. 
if (isAlphaIntCondCode(Cond[0].getImm())) - BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I)) + BuildMI(&MBB, DL, get(Alpha::COND_BRANCH_I)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); else - BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F)) + BuildMI(&MBB, DL, get(Alpha::COND_BRANCH_F)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); - BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(FBB); + BuildMI(&MBB, DL, get(Alpha::BR)).addMBB(FBB); return 2; } -bool AlphaInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - //cerr << "copyRegToReg " << DestReg << " <- " << SrcReg << "\n"; - if (DestRC != SrcRC) { - // Not yet supported! - return false; - } - - if (DestRC == Alpha::GPRCRegisterClass) { +void AlphaInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (Alpha::GPRCRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MI, DL, get(Alpha::BISr), DestReg) .addReg(SrcReg) - .addReg(SrcReg); - } else if (DestRC == Alpha::F4RCRegisterClass) { + .addReg(SrcReg, getKillRegState(KillSrc)); + } else if (Alpha::F4RCRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MI, DL, get(Alpha::CPYSS), DestReg) .addReg(SrcReg) - .addReg(SrcReg); - } else if (DestRC == Alpha::F8RCRegisterClass) { + .addReg(SrcReg, getKillRegState(KillSrc)); + } else if (Alpha::F8RCRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MI, DL, get(Alpha::CPYST), DestReg) .addReg(SrcReg) - .addReg(SrcReg); + .addReg(SrcReg, getKillRegState(KillSrc)); } else { - // Attempt to copy register that is not GPR or FPR - return false; + llvm_unreachable("Attempt to copy register that is not GPR or FPR"); } - - return true; } void @@ -227,51 +215,6 @@ AlphaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Unhandled register class"); } -MachineInstr *AlphaInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { - if (Ops.size() != 1) return NULL; - - // Make sure this is a reg-reg copy. - unsigned Opc = MI->getOpcode(); - - MachineInstr *NewMI = NULL; - switch(Opc) { - default: - break; - case Alpha::BISr: - case Alpha::CPYSS: - case Alpha::CPYST: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - if (Ops[0] == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - Opc = (Opc == Alpha::BISr) ? Alpha::STQ : - ((Opc == Alpha::CPYSS) ? Alpha::STS : Alpha::STT); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(InReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addFrameIndex(FrameIndex) - .addReg(Alpha::F31); - } else { // load -> move - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - Opc = (Opc == Alpha::BISr) ? Alpha::LDQ : - ((Opc == Alpha::CPYSS) ? 
Alpha::LDS : Alpha::LDT); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(OutReg, RegState::Define | getDeadRegState(isDead) | - getUndefRegState(isUndef)) - .addFrameIndex(FrameIndex) - .addReg(Alpha::F31); - } - } - break; - } - return NewMI; -} - static unsigned AlphaRevCondCode(unsigned Opcode) { switch (Opcode) { case Alpha::BEQ: return Alpha::BNE; @@ -428,11 +371,8 @@ unsigned AlphaInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); GlobalBaseReg = RegInfo.createVirtualRegister(&Alpha::GPRCRegClass); - bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, Alpha::R29, - &Alpha::GPRCRegClass, &Alpha::GPRCRegClass, - DebugLoc()); - assert(Ok && "Couldn't assign to global base register!"); - Ok = Ok; // Silence warning when assertions are turned off. + BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), + GlobalBaseReg).addReg(Alpha::R29); RegInfo.addLiveIn(Alpha::R29); AlphaFI->setGlobalBaseReg(GlobalBaseReg); @@ -456,11 +396,8 @@ unsigned AlphaInstrInfo::getGlobalRetAddr(MachineFunction *MF) const { const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); GlobalRetAddr = RegInfo.createVirtualRegister(&Alpha::GPRCRegClass); - bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalRetAddr, Alpha::R26, - &Alpha::GPRCRegClass, &Alpha::GPRCRegClass, - DebugLoc()); - assert(Ok && "Couldn't assign to global return address register!"); - Ok = Ok; // Silence warning when assertions are turned off. + BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), + GlobalRetAddr).addReg(Alpha::R26); RegInfo.addLiveIn(Alpha::R26); AlphaFI->setGlobalRetAddr(GlobalRetAddr); diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h index 7d7365b..e20e832 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.h +++ b/lib/Target/Alpha/AlphaInstrInfo.h @@ -42,14 +42,13 @@ public: int &FrameIndex) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -62,18 +61,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } - bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td index a47a29b..92de78a 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.td +++ b/lib/Target/Alpha/AlphaInstrInfo.td @@ -680,18 +680,32 @@ def CPYSNSt : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC", } //conditional 
moves, floats -let OutOperandList = (outs F4RC:$RDEST), InOperandList = (ins F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND), - isTwoAddress = 1 in { -def FCMOVEQS : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if = zero -def FCMOVGES : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if >= zero -def FCMOVGTS : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if > zero -def FCMOVLES : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if <= zero -def FCMOVLTS : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; // FCMOVE if < zero -def FCMOVNES : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if != zero +let OutOperandList = (outs F4RC:$RDEST), + InOperandList = (ins F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND), + Constraints = "$RTRUE = $RDEST" in { +def FCMOVEQS : FPForm<0x17, 0x02A, + "fcmoveq $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; //FCMOVE if = zero +def FCMOVGES : FPForm<0x17, 0x02D, + "fcmovge $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; //FCMOVE if >= zero +def FCMOVGTS : FPForm<0x17, 0x02F, + "fcmovgt $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; //FCMOVE if > zero +def FCMOVLES : FPForm<0x17, 0x02E, + "fcmovle $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; //FCMOVE if <= zero +def FCMOVLTS : FPForm<0x17, 0x02C, + "fcmovlt $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; // FCMOVE if < zero +def FCMOVNES : FPForm<0x17, 0x02B, + "fcmovne $RCOND,$RTRUE,$RDEST", + [], s_fcmov>; //FCMOVE if != zero } //conditional moves, doubles -let OutOperandList = (outs F8RC:$RDEST), InOperandList = (ins F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND), - isTwoAddress = 1 in { +let OutOperandList = (outs F8RC:$RDEST), + InOperandList = (ins F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND), + Constraints = "$RTRUE = $RDEST" in { def FCMOVEQT : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST", [], s_fcmov>; def FCMOVGET : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST", [], s_fcmov>; def FCMOVGTT : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST", [], s_fcmov>; diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp index c083d8c..dc9d935 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.cpp +++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp @@ -74,20 +74,6 @@ const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) return CalleeSavedRegs; } -const TargetRegisterClass* const* -AlphaRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &Alpha::GPRCRegClass, &Alpha::GPRCRegClass, - &Alpha::GPRCRegClass, &Alpha::GPRCRegClass, - &Alpha::GPRCRegClass, &Alpha::GPRCRegClass, - &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, - &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, - &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, - &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, 0 - }; - return CalleeSavedRegClasses; -} - BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(Alpha::R15); diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h index 720367a..f9fd87a 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.h +++ b/lib/Target/Alpha/AlphaRegisterInfo.h @@ -30,9 +30,6 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo { /// Code Generation virtual methods... 
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* getCalleeSavedRegClasses( - const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; bool hasFP(const MachineFunction &MF) const; diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp index b4da96c..80ee107 100644 --- a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp +++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp @@ -132,8 +132,8 @@ static void UpdateNodeOperand(SelectionDAG &DAG, SDValue Val) { SmallVector<SDValue, 8> ops(N->op_begin(), N->op_end()); ops[Num] = Val; - SDValue New = DAG.UpdateNodeOperands(SDValue(N, 0), ops.data(), ops.size()); - DAG.ReplaceAllUsesWith(N, New.getNode()); + SDNode *New = DAG.UpdateNodeOperands(N, ops.data(), ops.size()); + DAG.ReplaceAllUsesWith(N, New); } // After instruction selection, insert COPY_TO_REGCLASS nodes to help in diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp index adf2118..6e828e1 100644 --- a/lib/Target/Blackfin/BlackfinISelLowering.cpp +++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp @@ -143,7 +143,7 @@ SDValue BlackfinTargetLowering::LowerGlobalAddress(SDValue Op, DebugLoc DL = Op.getDebugLoc(); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - Op = DAG.getTargetGlobalAddress(GV, MVT::i32); + Op = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); return DAG.getNode(BFISD::Wrapper, DL, MVT::i32, Op); } @@ -205,8 +205,7 @@ BlackfinTargetLowering::LowerFormalArguments(SDValue Chain, } else { assert(VA.isMemLoc() && "CCValAssign must be RegLoc or MemLoc"); unsigned ObjSize = VA.getLocVT().getStoreSize(); - int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0, false, false, 0)); @@ -220,6 +219,7 @@ SDValue BlackfinTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to locations. @@ -245,7 +245,7 @@ BlackfinTargetLowering::LowerReturn(SDValue Chain, for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Opi = Outs[i].Val; + SDValue Opi = OutVals[i]; // Expand to i32 if necessary switch (VA.getLocInfo()) { @@ -278,6 +278,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -301,7 +302,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -357,7 +358,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 
// Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32); diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h index a784248..6bebcc3 100644 --- a/lib/Target/Blackfin/BlackfinISelLowering.h +++ b/lib/Target/Blackfin/BlackfinISelLowering.h @@ -63,6 +63,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -71,6 +72,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; }; } // end namespace llvm diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp index 73924b7..a74d42d 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp +++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp @@ -104,10 +104,8 @@ unsigned BlackfinInstrInfo:: InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc operand - DebugLoc DL; - + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. 
assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && @@ -124,69 +122,73 @@ InsertBranch(MachineBasicBlock &MBB, llvm_unreachable("Implement conditional branches!"); } -static bool inClass(const TargetRegisterClass &Test, - unsigned Reg, - const TargetRegisterClass *RC) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return Test.contains(Reg); - else - return &Test==RC || Test.hasSubClass(RC); -} - -bool BlackfinInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, - unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (inClass(BF::ALLRegClass, DestReg, DestRC) && - inClass(BF::ALLRegClass, SrcReg, SrcRC)) { - BuildMI(MBB, I, DL, get(BF::MOVE), DestReg).addReg(SrcReg); - return true; +void BlackfinInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (BF::ALLRegClass.contains(DestReg, SrcReg)) { + BuildMI(MBB, I, DL, get(BF::MOVE), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - if (inClass(BF::D16RegClass, DestReg, DestRC) && - inClass(BF::D16RegClass, SrcReg, SrcRC)) { - BuildMI(MBB, I, DL, get(BF::SLL16i), DestReg).addReg(SrcReg).addImm(0); - return true; + if (BF::D16RegClass.contains(DestReg, SrcReg)) { + BuildMI(MBB, I, DL, get(BF::SLL16i), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + return; } - if (inClass(BF::AnyCCRegClass, SrcReg, SrcRC) && - inClass(BF::DRegClass, DestReg, DestRC)) { - if (inClass(BF::NotCCRegClass, SrcReg, SrcRC)) { - BuildMI(MBB, I, DL, get(BF::MOVENCC_z), DestReg).addReg(SrcReg); + if (BF::DRegClass.contains(DestReg)) { + if (SrcReg == BF::NCC) { + BuildMI(MBB, I, DL, get(BF::MOVENCC_z), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); BuildMI(MBB, I, DL, get(BF::BITTGL), DestReg).addReg(DestReg).addImm(0); - } else { - BuildMI(MBB, I, DL, get(BF::MOVECC_zext), DestReg).addReg(SrcReg); + return; + } + if (SrcReg == BF::CC) { + BuildMI(MBB, I, DL, get(BF::MOVECC_zext), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - return true; } - if (inClass(BF::AnyCCRegClass, DestReg, DestRC) && - inClass(BF::DRegClass, SrcReg, SrcRC)) { - if (inClass(BF::NotCCRegClass, DestReg, DestRC)) - BuildMI(MBB, I, DL, get(BF::SETEQri_not), DestReg).addReg(SrcReg); - else - BuildMI(MBB, I, DL, get(BF::MOVECC_nz), DestReg).addReg(SrcReg); - return true; + if (BF::DRegClass.contains(SrcReg)) { + if (DestReg == BF::NCC) { + BuildMI(MBB, I, DL, get(BF::SETEQri_not), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(0); + return; + } + if (DestReg == BF::CC) { + BuildMI(MBB, I, DL, get(BF::MOVECC_nz), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } } - if (inClass(BF::NotCCRegClass, DestReg, DestRC) && - inClass(BF::JustCCRegClass, SrcReg, SrcRC)) { - BuildMI(MBB, I, DL, get(BF::MOVE_ncccc), DestReg).addReg(SrcReg); - return true; + + if (DestReg == BF::NCC && SrcReg == BF::CC) { + BuildMI(MBB, I, DL, get(BF::MOVE_ncccc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - if (inClass(BF::JustCCRegClass, DestReg, DestRC) && - inClass(BF::NotCCRegClass, SrcReg, SrcRC)) { - BuildMI(MBB, I, DL, get(BF::MOVE_ccncc), DestReg).addReg(SrcReg); - return true; + if (DestReg == BF::CC && SrcReg == BF::NCC) { + BuildMI(MBB, I, DL, get(BF::MOVE_ccncc), DestReg) + .addReg(SrcReg, 
getKillRegState(KillSrc)); + return; } - llvm_unreachable((std::string("Bad regclasses for reg-to-reg copy: ")+ - SrcRC->getName() + " -> " + DestRC->getName()).c_str()); - return false; + llvm_unreachable("Bad reg-to-reg copy"); +} + +static bool inClass(const TargetRegisterClass &Test, + unsigned Reg, + const TargetRegisterClass *RC) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return Test.contains(Reg); + else + return &Test==RC || Test.hasSubClass(RC); } void diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.h b/lib/Target/Blackfin/BlackfinInstrInfo.h index c1dcd58..6c35917 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.h +++ b/lib/Target/Blackfin/BlackfinInstrInfo.h @@ -44,14 +44,13 @@ namespace llvm { InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; - - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td index 5cf350a..8034a7f 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.td +++ b/lib/Target/Blackfin/BlackfinInstrInfo.td @@ -488,7 +488,7 @@ def MOVE: F1<(outs ALL:$dst), (ins ALL:$src), "$dst = $src;", []>; -let isTwoAddress = 1 in +let Constraints = "$src1 = $dst" in def MOVEcc: F1<(outs DP:$dst), (ins DP:$src1, DP:$src2, AnyCC:$cc), "if $cc $dst = $src2;", [(set DP:$dst, (select AnyCC:$cc, DP:$src2, DP:$src1))]>; @@ -645,7 +645,7 @@ def XOR: F1<(outs D:$dst), (ins D:$src1, D:$src2), // Table C-15. 
Bit Operations Instructions //===----------------------------------------------------------------------===// -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { def BITCLR: F1<(outs D:$dst), (ins D:$src1, uimm5imask:$src2), "bitclr($dst, $src2);", [(set D:$dst, (and D:$src1, uimm5imask:$src2))]>; @@ -691,7 +691,7 @@ multiclass SHIFT32<SDNode opnode, string ops> { } let Defs = [AZ, AN, V, VS], - isTwoAddress = 1 in { + Constraints = "$src = $dst" in { defm SRA : SHIFT32<sra, ">>>">; defm SRL : SHIFT32<srl, ">>">; defm SLL : SHIFT32<shl, "<<">; @@ -748,7 +748,7 @@ def ADD16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2), "$dst = $src1 + $src2;", [(set D16:$dst, (add D16:$src1, D16:$src2))]>; -let isTwoAddress = 1 in +let Constraints = "$src1 = $dst" in def ADDimm7: F1<(outs D:$dst), (ins D:$src1, i32imm:$src2), "$dst += $src2;", [(set D:$dst, (add D:$src1, imm7:$src2))]>; @@ -775,7 +775,7 @@ def NEG: F1<(outs D:$dst), (ins D:$src), def ADDpp: F1<(outs P:$dst), (ins P:$src1, P:$src2), "$dst = $src1 + $src2;", []>; -let isTwoAddress = 1 in +let Constraints = "$src1 = $dst" in def ADDpp_imm7: F1<(outs P:$dst), (ins P:$src1, i32imm:$src2), "$dst += $src2;", []>; @@ -802,7 +802,7 @@ def MULhh32u: F2<(outs D:$dst), (ins D16:$src1, D16:$src2), } -let isTwoAddress = 1 in +let Constraints = "$src1 = $dst" in def MUL32: F1<(outs D:$dst), (ins D:$src1, D:$src2), "$dst *= $src2;", [(set D:$dst, (mul D:$src1, D:$src2))]>; diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index 5153ace..06e95de 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -48,17 +48,6 @@ BlackfinRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CalleeSavedRegs; } -const TargetRegisterClass* const *BlackfinRegisterInfo:: -getCalleeSavedRegClasses(const MachineFunction *MF) const { - using namespace BF; - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &PRegClass, - &DRegClass, &DRegClass, &DRegClass, &DRegClass, - &PRegClass, &PRegClass, &PRegClass, - 0 }; - return CalleeSavedRegClasses; -} - BitVector BlackfinRegisterInfo::getReservedRegs(const MachineFunction &MF) const { using namespace BF; @@ -86,25 +75,6 @@ BlackfinRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -const TargetRegisterClass* -BlackfinRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const { - assert(isPhysicalRegister(reg) && "reg must be a physical register"); - - // Pick the smallest register class of the right type that contains - // this physreg. - const TargetRegisterClass* BestRC = 0; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - const TargetRegisterClass* RC = *I; - if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && - (!BestRC || RC->getNumRegs() < BestRC->getNumRegs())) - BestRC = RC; - } - - assert(BestRC && "Couldn't find the register class"); - return BestRC; -} - // hasFP - Return true if the specified function should have a dedicated frame // pointer register. This is true if the function has variable sized allocas or // if frame pointer elimination is disabled. diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h index 03c5450..ead0b4a 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.h +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h @@ -33,9 +33,6 @@ namespace llvm { /// Code Generation virtual methods... 
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; // getSubReg implemented by tablegen @@ -44,9 +41,6 @@ namespace llvm { return &BF::PRegClass; } - const TargetRegisterClass *getPhysicalRegisterRegClass(unsigned reg, - EVT VT) const; - bool hasFP(const MachineFunction &MF) const; // bool hasReservedCallFrame(MachineFunction &MF) const; diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index 55b8aaa..e8d8474 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -264,7 +264,7 @@ namespace { // static const AllocaInst *isDirectAlloca(const Value *V) { const AllocaInst *AI = dyn_cast<AllocaInst>(V); - if (!AI) return false; + if (!AI) return 0; if (AI->isArrayAllocation()) return 0; // FIXME: we can also inline fixed size array allocas! if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock()) @@ -2889,7 +2889,7 @@ void CWriter::visitCallInst(CallInst &I) { bool hasByVal = I.hasByValArgument(); bool isStructRet = I.hasStructRetAttr(); if (isStructRet) { - writeOperandDeref(I.getOperand(1)); + writeOperandDeref(I.getArgOperand(0)); Out << " = "; } @@ -2944,8 +2944,8 @@ void CWriter::visitCallInst(CallInst &I) { } unsigned NumDeclaredParams = FTy->getNumParams(); - - CallSite::arg_iterator AI = I.op_begin()+1, AE = I.op_end(); + CallSite CS(&I); + CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); unsigned ArgNo = 0; if (isStructRet) { // Skip struct return argument. ++AI; @@ -2999,7 +2999,7 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, Out << "0; "; Out << "va_start(*(va_list*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", "; // Output the last argument to the enclosing function. 
if (I.getParent()->getParent()->arg_empty()) @@ -3009,9 +3009,9 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, Out << ')'; return true; case Intrinsic::vaend: - if (!isa<ConstantPointerNull>(I.getOperand(1))) { + if (!isa<ConstantPointerNull>(I.getArgOperand(0))) { Out << "0; va_end(*(va_list*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ')'; } else { Out << "va_end(*(va_list*)0)"; @@ -3020,47 +3020,47 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, case Intrinsic::vacopy: Out << "0; "; Out << "va_copy(*(va_list*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", *(va_list*)"; - writeOperand(I.getOperand(2)); + writeOperand(I.getArgOperand(1)); Out << ')'; return true; case Intrinsic::returnaddress: Out << "__builtin_return_address("; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ')'; return true; case Intrinsic::frameaddress: Out << "__builtin_frame_address("; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ')'; return true; case Intrinsic::powi: Out << "__builtin_powi("; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", "; - writeOperand(I.getOperand(2)); + writeOperand(I.getArgOperand(1)); Out << ')'; return true; case Intrinsic::setjmp: Out << "setjmp(*(jmp_buf*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ')'; return true; case Intrinsic::longjmp: Out << "longjmp(*(jmp_buf*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", "; - writeOperand(I.getOperand(2)); + writeOperand(I.getArgOperand(1)); Out << ')'; return true; case Intrinsic::prefetch: Out << "LLVM_PREFETCH((const void *)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", "; - writeOperand(I.getOperand(2)); + writeOperand(I.getArgOperand(1)); Out << ", "; - writeOperand(I.getOperand(3)); + writeOperand(I.getArgOperand(2)); Out << ")"; return true; case Intrinsic::stacksave: @@ -3077,7 +3077,7 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, printType(Out, I.getType()); Out << ')'; // Multiple GCC builtins multiplex onto this intrinsic. 
- switch (cast<ConstantInt>(I.getOperand(3))->getZExtValue()) { + switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) { default: llvm_unreachable("Invalid llvm.x86.sse.cmp!"); case 0: Out << "__builtin_ia32_cmpeq"; break; case 1: Out << "__builtin_ia32_cmplt"; break; @@ -3098,9 +3098,9 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, Out << 'd'; Out << "("; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ", "; - writeOperand(I.getOperand(2)); + writeOperand(I.getArgOperand(1)); Out << ")"; return true; case Intrinsic::ppc_altivec_lvsl: @@ -3108,7 +3108,7 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, printType(Out, I.getType()); Out << ')'; Out << "__builtin_altivec_lvsl(0, (void*)"; - writeOperand(I.getOperand(1)); + writeOperand(I.getArgOperand(0)); Out << ")"; return true; } @@ -3221,7 +3221,7 @@ void CWriter::visitInlineAsm(CallInst &CI) { DestVal = ResultVals[ValueCount].first; DestValNo = ResultVals[ValueCount].second; } else - DestVal = CI.getOperand(ValueCount-ResultVals.size()+1); + DestVal = CI.getArgOperand(ValueCount-ResultVals.size()); if (I->isEarlyClobber) C = "&"+C; @@ -3255,7 +3255,7 @@ void CWriter::visitInlineAsm(CallInst &CI) { } assert(ValueCount >= ResultVals.size() && "Input can't refer to result"); - Value *SrcVal = CI.getOperand(ValueCount-ResultVals.size()+1); + Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size()); Out << "\"" << C << "\"("; if (!I->isIndirect) diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td index 10dc837..ec2f663 100644 --- a/lib/Target/CellSPU/SPUCallingConv.td +++ b/lib/Target/CellSPU/SPUCallingConv.td @@ -34,76 +34,19 @@ def RetCC_SPU : CallingConv<[ //===----------------------------------------------------------------------===// // CellSPU Argument Calling Conventions -// (note: this isn't used, but presumably should be at some point when other -// targets do.) 
//===----------------------------------------------------------------------===// -/* -def CC_SPU : CallingConv<[ - CCIfType<[i8], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[i16], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[i64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[f64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - CCIfType<[v16i8, v8i16, v4i32, v4f32, v2i64, v2f64], - CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, - R12, R13, R14, R15, R16, R17, R18, R19, R20, - R21, R22, R23, R24, R25, R26, R27, R28, R29, - R30, R31, R32, R33, R34, R35, R36, R37, R38, - R39, R40, R41, R42, R43, R44, R45, R46, R47, - R48, R49, R50, R51, R52, R53, R54, R55, R56, - R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, - +def CCC_SPU : CallingConv<[ + CCIfType<[i8, i16, i32, i64, i128, f32, f64, + v16i8, v8i16, v4i32, v4f32, v2i64, v2f64], + CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, 
R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, @@ -112,4 +55,3 @@ def CC_SPU : CallingConv<[ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>> ]>; -*/ diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameInfo.h index e8ca333..f511acd 100644 --- a/lib/Target/CellSPU/SPUFrameInfo.h +++ b/lib/Target/CellSPU/SPUFrameInfo.h @@ -53,10 +53,6 @@ namespace llvm { static int minStackSize() { return (2 * stackSlotSize()); } - //! Frame size required to spill all registers plus frame info - static int fullSpillSize() { - return (SPURegisterInfo::getNumArgRegs() * stackSlotSize()); - } //! Convert frame index to stack offset static int FItoStackOffset(int frame_index) { return frame_index * stackSlotSize(); diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 9afdb2b..9b8c2dd 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -275,7 +275,6 @@ namespace { SDNode *emitBuildVector(SDNode *bvNode) { EVT vecVT = bvNode->getValueType(0); - EVT eltVT = vecVT.getVectorElementType(); DebugLoc dl = bvNode->getDebugLoc(); // Check to see if this vector can be represented as a CellSPU immediate @@ -606,18 +605,14 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, Base = CurDAG->getTargetConstant(0, N.getValueType()); Index = N; return true; - } else if (Opc == ISD::Register || Opc == ISD::CopyFromReg) { + } else if (Opc == ISD::Register + ||Opc == ISD::CopyFromReg + ||Opc == ISD::UNDEF) { unsigned OpOpc = Op->getOpcode(); if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) { // Direct load/store without getelementptr - SDValue Addr, Offs; - - // Get the register from CopyFromReg - if (Opc == ISD::CopyFromReg) - Addr = N.getOperand(1); - else - Addr = N; // Register + SDValue Offs; Offs = ((OpOpc == ISD::STORE) ? 
Op->getOperand(3) : Op->getOperand(2)); @@ -626,7 +621,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, Offs = CurDAG->getTargetConstant(0, Offs.getValueType()); Base = Offs; - Index = Addr; + Index = N; return true; } } else { diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 081e8d0..ece19b9 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -953,7 +953,8 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSDN->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); + SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + PtrVT, GSDN->getOffset()); const TargetMachine &TM = DAG.getTarget(); SDValue Zero = DAG.getConstant(0, PtrVT); // FIXME there is no actual debug info here @@ -1013,22 +1014,26 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, MachineRegisterInfo &RegInfo = MF.getRegInfo(); SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>(); - const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); - const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); - unsigned ArgOffset = SPUFrameInfo::minStackSize(); unsigned ArgRegIdx = 0; unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); + // Add DAG nodes to load the arguments or copy them out of registers. for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; SDValue ArgVal; + CCValAssign &VA = ArgLocs[ArgNo]; - if (ArgRegIdx < NumArgRegs) { + if (VA.isRegLoc()) { const TargetRegisterClass *ArgRegClass; switch (ObjectVT.getSimpleVT().SimpleTy) { @@ -1067,14 +1072,14 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, } unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass); - RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg); + RegInfo.addLiveIn(VA.getLocReg(), VReg); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++ArgRegIdx; } else { // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type // or we're forced to do vararg - int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true, false); + int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0); ArgOffset += StackSlotSize; @@ -1087,16 +1092,31 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // vararg handling: if (isVarArg) { - // unsigned int ptr_size = PtrVT.getSizeInBits() / 8; + // FIXME: we should be able to query the argument registers from + // tablegen generated code. 
+ static const unsigned ArgRegs[] = { + SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, + SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, + SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, + SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30, + SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37, + SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44, + SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51, + SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58, + SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65, + SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72, + SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 + }; + // size of ArgRegs array + unsigned NumArgRegs = 77; + // We will spill (79-3)+1 registers to the stack SmallVector<SDValue, 79-3+1> MemOps; // Create the frame slot - for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) { FuncInfo->setVarArgsFrameIndex( - MFI->CreateFixedObject(StackSlotSize, ArgOffset, - true, false)); + MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); @@ -1135,6 +1155,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -1144,8 +1165,15 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); unsigned NumOps = Outs.size(); unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); - const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); - const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, + *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); + + const unsigned NumArgRegs = ArgLocs.size(); + // Handy pointer type EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -1165,8 +1193,9 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // And the arguments passed on the stack SmallVector<SDValue, 8> MemOpChains; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; + for (; ArgRegIdx != NumOps; ++ArgRegIdx) { + SDValue Arg = OutVals[ArgRegIdx]; + CCValAssign &VA = ArgLocs[ArgRegIdx]; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. 
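A note on the pattern these CellSPU hunks adopt: they retire the hand-maintained ArgRegs/NumArgRegs walk in favor of the CCState machinery the other in-tree targets already use. Reduced to a minimal sketch (not verbatim SPU code; Chain, dl, PtrOff, RegsToPass, and MemOpChains are the locals visible in the surrounding hunks, and CCC_SPU is the tablegen-generated assignment function introduced in SPUCallingConv.td above), the flow is:

  // Let the generated calling-convention code assign each outgoing value
  // to a register or a stack slot, instead of indexing a hand-written
  // register array.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);  // fills ArgLocs from the CC table

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];   // argument SDValues now arrive in OutVals
    if (VA.isRegLoc())          // register-assigned: pass in VA's register
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    else                        // stack-assigned: store to the slot at PtrOff
      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0,
                                         false, false, 0));
  }

The one place that still needs the raw R3-R79 register list is the varargs spill loop, which is why the static ArgRegs array above survives behind its FIXME.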
@@ -1180,24 +1209,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::i32: case MVT::i64: case MVT::i128: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, - false, false, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::f32: case MVT::f64: - if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); - } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, - false, false, 0)); - ArgOffset += StackSlotSize; - } - break; case MVT::v2i64: case MVT::v2f64: case MVT::v4f32: @@ -1205,7 +1218,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case MVT::v8i16: case MVT::v16i8: if (ArgRegIdx != NumArgRegs) { - RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, false, false, 0)); @@ -1249,7 +1262,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const GlobalValue *GV = G->getGlobal(); EVT CalleeVT = Callee.getValueType(); SDValue Zero = DAG.getConstant(0, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT); if (!ST->usingLargeMem()) { // Turn calls to targets that are defined (i.e., have bodies) into BRSL @@ -1355,6 +1368,7 @@ SDValue SPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1376,7 +1390,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); Flag = Chain.getValue(1); } @@ -1746,15 +1760,20 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned V0Elt = 0; bool monotonic = true; bool rotate = true; + EVT maskVT; // which of the c?d instructions to use if (EltVT == MVT::i8) { V2EltIdx0 = 16; + maskVT = MVT::v16i8; } else if (EltVT == MVT::i16) { V2EltIdx0 = 8; + maskVT = MVT::v8i16; } else if (EltVT == MVT::i32 || EltVT == MVT::f32) { V2EltIdx0 = 4; + maskVT = MVT::v4i32; } else if (EltVT == MVT::i64 || EltVT == MVT::f64) { V2EltIdx0 = 2; + maskVT = MVT::v2i64; } else llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE"); @@ -1786,7 +1805,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { } else { rotate = false; } - } else if (PrevElt == 0) { + } else if (i == 0) { // First time through, need to keep track of previous element PrevElt = SrcElt; } else { @@ -1798,18 +1817,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (EltsFromV2 == 1 && monotonic) { // Compute mask and shuffle - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - // Initialize temporary register to 0 - SDValue InitTempReg = - DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT)); - // Copy register's contents as index in SHUFFLE_MASK: - SDValue ShufMaskOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, dl, 
MVT::v4i32, - DAG.getTargetConstant(V2Elt, MVT::i32), - DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT)); + + // As SHUFFLE_MASK becomes a c?d instruction, feed it an address + // R1 ($sp) is used here only as it is guaranteed to have last bits zero + SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + DAG.getRegister(SPU::R1, PtrVT), + DAG.getConstant(V2Elt, MVT::i32)); + SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, + maskVT, Pointer); + // Use shuffle mask in SHUFB synthetic instruction: return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, ShufMaskOp); @@ -2056,14 +2073,19 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); - ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp); - assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + // use 0 when the lane to insert to is 'undef' + int64_t Idx=0; + if (IdxOp.getOpcode() != ISD::UNDEF) { + ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp); + assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + Idx = (CN->getSExtValue()); + } EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Use $sp ($1) because it's always 16-byte aligned and it's available: SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(CN->getSExtValue(), PtrVT)); + DAG.getConstant(Idx, PtrVT)); SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer); SDValue result = @@ -2862,7 +2884,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const case SPUISD::IndirectAddr: { if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) { ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (CN != 0 && CN->getZExtValue() == 0) { + if (CN != 0 && CN->isNullValue()) { // (SPUindirect (SPUaform <addr>, 0), 0) -> // (SPUaform <addr>, 0) @@ -3056,12 +3078,10 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, void SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { // Default, for the time being, to the base class handler - TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory, - Ops, DAG); + TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG); } /// isLegalAddressImmediate - Return true if the integer value can be used diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 9ebd442..6d3c90b 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -134,7 +134,6 @@ namespace llvm { EVT VT) const; void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -160,6 +159,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -168,6 +168,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; }; } diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 4c53c98..69aa088 100644 --- 
a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -164,11 +164,9 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI, MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && "invalid SPU OR<type>_<vec> or LR instruction!"); - if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { sourceReg = MI.getOperand(1).getReg(); destReg = MI.getOperand(0).getReg(); return true; - } break; } case SPU::ORv16i8: @@ -251,40 +249,18 @@ SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const +void SPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { // We support cross register class moves for our aliases, such as R3 in any // reg class to any other reg class containing R3. This is required because // we instruction select bitconvert i64 -> f64 as a noop for example, so our // types have no specific meaning. - if (DestRC == SPU::R8CRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRr8), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::R16CRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRr16), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::R32CRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRr32), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::R32FPRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRf32), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::R64CRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRr64), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::R64FPRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRf64), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::GPRCRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRr128), DestReg).addReg(SrcReg); - } else if (DestRC == SPU::VECREGRegisterClass) { - BuildMI(MBB, MI, DL, get(SPU::LRv16i8), DestReg).addReg(SrcReg); - } else { - // Attempt to copy unknown/unsupported register class! - return false; - } - - return true; + BuildMI(MBB, I, DL, get(SPU::LRr128), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void @@ -356,88 +332,6 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DL, get(opc), DestReg), FrameIdx); } -//! Return true if the specified load or store can be folded -bool -SPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - // Make sure this is a reg-reg copy. - unsigned Opc = MI->getOpcode(); - - switch (Opc) { - case SPU::ORv16i8: - case SPU::ORv8i16: - case SPU::ORv4i32: - case SPU::ORv2i64: - case SPU::ORr8: - case SPU::ORr16: - case SPU::ORr32: - case SPU::ORr64: - case SPU::ORf32: - case SPU::ORf64: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) - return true; - break; - } - - return false; -} - -/// foldMemoryOperand - SPU, like PPC, can only fold spills into -/// copy instructions, turning them into load/store instructions. 
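Because every SPU register class aliases the same 128-bit physical registers, the copyPhysReg rewrite above can emit a single LRr128 for any copy and drop the per-class opcode dispatch entirely. A hedged before/after sketch of a call site (TII, MBB, MI and the register variables are assumed):

    // Old contract: copyRegToReg returned false for unsupported
    // class pairs, so callers had to check.
    //   if (!TII->copyRegToReg(MBB, MI, DestReg, SrcReg,
    //                          DestRC, SrcRC, DL))
    //     report_fatal_error("impossible reg-to-reg copy");

    // New contract: copyPhysReg must always succeed, returns void,
    // and takes the source kill state instead of register classes.
    TII->copyPhysReg(MBB, MI, DL, DestReg, SrcReg, /*KillSrc=*/true);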
-MachineInstr * -SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const -{ - if (Ops.size() != 1) return 0; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = 0; - - switch (Opc) { - case SPU::ORv16i8: - case SPU::ORv8i16: - case SPU::ORv4i32: - case SPU::ORv2i64: - case SPU::ORr8: - case SPU::ORr16: - case SPU::ORr32: - case SPU::ORr64: - case SPU::ORf32: - case SPU::ORf64: - if (OpNum == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (FrameIndex < SPUFrameInfo::maxFrameOffset()) { - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), - get(SPU::STQDr32)); - - MIB.addReg(InReg, getKillRegState(isKill) | getUndefRegState(isUndef)); - NewMI = addFrameReference(MIB, FrameIndex); - } - } else { // move -> load - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)); - - MIB.addReg(OutReg, RegState::Define | getDeadRegState(isDead) | - getUndefRegState(isUndef)); - Opc = (FrameIndex < SPUFrameInfo::maxFrameOffset()) - ? SPU::STQDr32 : SPU::STQXr32; - NewMI = addFrameReference(MIB, FrameIndex); - break; - } - } - - return NewMI; -} - //! Branch analysis /*! \note This code was kiped from PPC. There may be more branch analysis for @@ -554,9 +448,8 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 2 || Cond.size() == 0) && @@ -566,14 +459,14 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (FBB == 0) { if (Cond.empty()) { // Unconditional branch - MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(SPU::BR)); + MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(SPU::BR)); MIB.addMBB(TBB); DEBUG(errs() << "Inserted one-way uncond branch: "); DEBUG((*MIB).dump()); } else { // Conditional branch - MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm())); + MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); MIB.addReg(Cond[1].getReg()).addMBB(TBB); DEBUG(errs() << "Inserted one-way cond branch: "); @@ -581,8 +474,8 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } return 1; } else { - MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm())); - MachineInstrBuilder MIB2 = BuildMI(&MBB, dl, get(SPU::BR)); + MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); + MachineInstrBuilder MIB2 = BuildMI(&MBB, DL, get(SPU::BR)); // Two-way Conditional Branch. 
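The InsertBranch hunk resolves the removed "FIXME this should probably have a DebugLoc argument" by threading the location in from the caller instead of default-constructing it inside the hook. A sketch of the updated call, with the caller's context assumed:

    // Synthesized BR/conditional branches now keep the source
    // position of the branch they replace.
    unsigned NumInserted =
        TII->InsertBranch(MBB, TBB, FBB, Cond, OldBranch->getDebugLoc());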
MIB.addReg(Cond[1].getReg()).addMBB(TBB); diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h index 6dabd7c..fbb1733 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.h +++ b/lib/Target/CellSPU/SPUInstrInfo.h @@ -23,19 +23,6 @@ namespace llvm { class SPUInstrInfo : public TargetInstrInfoImpl { SPUTargetMachine &TM; const SPURegisterInfo RI; - protected: - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } - public: explicit SPUInstrInfo(SPUTargetMachine &tm); @@ -56,12 +43,10 @@ namespace llvm { unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; //! Store a register to a stack slot, based on its register class. virtual void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -77,11 +62,6 @@ namespace llvm { const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - //! Return true if the specified load or store can be folded - virtual - bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - //! Reverses a branch's condition, returning false on success. virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; @@ -94,8 +74,9 @@ namespace llvm { virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; }; } diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 846c7ed..647da30 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -21,7 +21,7 @@ def SPUshufmask : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq, [SDNPHasChain, SDNPOutFlag]>; def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPUCallSeq, - [SDNPHasChain, SDNPOutFlag]>; + [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; //===----------------------------------------------------------------------===// // Operand constraints: //===----------------------------------------------------------------------===// diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index d8937ec..f7cfa42 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -191,33 +191,6 @@ SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget, { } -// SPU's 128-bit registers used for argument passing: -static const unsigned SPU_ArgRegs[] = { - SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, - SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, - SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, - SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30, - SPU::R31, SPU::R32, SPU::R33, 
SPU::R34, SPU::R35, SPU::R36, SPU::R37, - SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44, - SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51, - SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58, - SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65, - SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72, - SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 -}; - -const unsigned * -SPURegisterInfo::getArgRegs() -{ - return SPU_ArgRegs; -} - -unsigned -SPURegisterInfo::getNumArgRegs() -{ - return sizeof(SPU_ArgRegs) / sizeof(SPU_ArgRegs[0]); -} - /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * @@ -251,36 +224,6 @@ SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const return SPU_CalleeSaveRegs; } -const TargetRegisterClass* const* -SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const -{ - // Cell ABI Calling Convention - static const TargetRegisterClass * const SPU_CalleeSaveRegClasses[] = { - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass, - &SPU::GPRCRegClass, /* environment pointer */ - &SPU::GPRCRegClass, /* stack pointer */ - &SPU::GPRCRegClass, /* link register */ - 0 /* end */ - }; - - return SPU_CalleeSaveRegClasses; -} - /*! R0 (link register), R1 (stack pointer) and R2 (environment pointer -- this is generally unused) are the Cell's reserved registers diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 0a70318..7a6ae6d 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -49,10 +49,6 @@ namespace llvm { //! Return the array of callee-saved registers virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const; - //! Return the register class array of the callee-saved registers - virtual const TargetRegisterClass* const * - getCalleeSavedRegClasses(const MachineFunction *MF) const; - //! Allow for scavenging, so we can get scratch registers when needed. virtual bool requiresRegisterScavenging(const MachineFunction &MF) const { return true; } @@ -90,15 +86,6 @@ namespace llvm { // New methods added: //------------------------------------------------------------------------ - //! Return the array of argument passing registers - /*! - \note The size of this array is returned by getArgRegsSize(). - */ - static const unsigned *getArgRegs(); - - //! 
Return the size of the argument passing register array - static unsigned getNumArgRegs(); - //! Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 45a0c84..145568a 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -99,11 +99,12 @@ namespace { ValueSet DefinedValues; ForwardRefMap ForwardRefs; bool is_inline; + unsigned indent_level; public: static char ID; explicit CppWriter(formatted_raw_ostream &o) : - ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false) {} + ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){} virtual const char *getPassName() const { return "C++ backend"; } @@ -120,6 +121,11 @@ namespace { void error(const std::string& msg); + + formatted_raw_ostream& nl(formatted_raw_ostream &Out, int delta = 0); + inline void in() { indent_level++; } + inline void out() { if (indent_level >0) indent_level--; } + private: void printLinkageType(GlobalValue::LinkageTypes LT); void printVisibilityType(GlobalValue::VisibilityTypes VisTypes); @@ -153,1857 +159,1856 @@ namespace { void printModuleBody(); }; +} // end anonymous namespace. + +formatted_raw_ostream &CppWriter::nl(formatted_raw_ostream &Out, int delta) { + Out << '\n'; + if (delta >= 0 || indent_level >= unsigned(-delta)) + indent_level += delta; + Out.indent(indent_level); + return Out; +} + +static inline void sanitize(std::string &str) { + for (size_t i = 0; i < str.length(); ++i) + if (!isalnum(str[i]) && str[i] != '_') + str[i] = '_'; +} - static unsigned indent_level = 0; - inline formatted_raw_ostream& nl(formatted_raw_ostream& Out, int delta = 0) { - Out << "\n"; - if (delta >= 0 || indent_level >= unsigned(-delta)) - indent_level += delta; - for (unsigned i = 0; i < indent_level; ++i) - Out << " "; - return Out; +static std::string getTypePrefix(const Type *Ty) { + switch (Ty->getTypeID()) { + case Type::VoidTyID: return "void_"; + case Type::IntegerTyID: + return "int" + utostr(cast<IntegerType>(Ty)->getBitWidth()) + "_"; + case Type::FloatTyID: return "float_"; + case Type::DoubleTyID: return "double_"; + case Type::LabelTyID: return "label_"; + case Type::FunctionTyID: return "func_"; + case Type::StructTyID: return "struct_"; + case Type::ArrayTyID: return "array_"; + case Type::PointerTyID: return "ptr_"; + case Type::VectorTyID: return "packed_"; + case Type::OpaqueTyID: return "opaque_"; + default: return "other_"; } + return "unknown_"; +} - inline void in() { indent_level++; } - inline void out() { if (indent_level >0) indent_level--; } +// Looks up the type in the symbol table and returns a pointer to its name or +// a null pointer if it wasn't found. Note that this isn't the same as the +// Mode::getTypeName function which will return an empty string, not a null +// pointer if the name is not found. 
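With indent_level promoted to a CppWriter member, nl()/in()/out() replace the old file-static indentation state, and nl() now uses formatted_raw_ostream::indent() instead of a space-printing loop. A small usage sketch, mirroring how printAttributes drives the helpers later in this diff:

    // Emit a brace-delimited block: in() bumps the indent before the
    // body and out() restores it before the closing brace.
    Out << '{'; in(); nl(Out);
    Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out);
    out(); nl(Out);
    Out << '}'; nl(Out);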
+static const std::string * +findTypeName(const TypeSymbolTable& ST, const Type* Ty) { + TypeSymbolTable::const_iterator TI = ST.begin(); + TypeSymbolTable::const_iterator TE = ST.end(); + for (;TI != TE; ++TI) + if (TI->second == Ty) + return &(TI->first); + return 0; +} - inline void - sanitize(std::string& str) { - for (size_t i = 0; i < str.length(); ++i) - if (!isalnum(str[i]) && str[i] != '_') - str[i] = '_'; - } +void CppWriter::error(const std::string& msg) { + report_fatal_error(msg); +} - inline std::string - getTypePrefix(const Type* Ty ) { - switch (Ty->getTypeID()) { - case Type::VoidTyID: return "void_"; - case Type::IntegerTyID: - return std::string("int") + utostr(cast<IntegerType>(Ty)->getBitWidth()) + - "_"; - case Type::FloatTyID: return "float_"; - case Type::DoubleTyID: return "double_"; - case Type::LabelTyID: return "label_"; - case Type::FunctionTyID: return "func_"; - case Type::StructTyID: return "struct_"; - case Type::ArrayTyID: return "array_"; - case Type::PointerTyID: return "ptr_"; - case Type::VectorTyID: return "packed_"; - case Type::OpaqueTyID: return "opaque_"; - default: return "other_"; - } - return "unknown_"; - } - - // Looks up the type in the symbol table and returns a pointer to its name or - // a null pointer if it wasn't found. Note that this isn't the same as the - // Mode::getTypeName function which will return an empty string, not a null - // pointer if the name is not found. - inline const std::string* - findTypeName(const TypeSymbolTable& ST, const Type* Ty) { - TypeSymbolTable::const_iterator TI = ST.begin(); - TypeSymbolTable::const_iterator TE = ST.end(); - for (;TI != TE; ++TI) - if (TI->second == Ty) - return &(TI->first); - return 0; - } - - void CppWriter::error(const std::string& msg) { - report_fatal_error(msg); - } - - // printCFP - Print a floating point constant .. very carefully :) - // This makes sure that conversion to/from floating yields the same binary - // result so that we don't lose precision. - void CppWriter::printCFP(const ConstantFP *CFP) { - bool ignored; - APFloat APF = APFloat(CFP->getValueAPF()); // copy - if (CFP->getType() == Type::getFloatTy(CFP->getContext())) - APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); - Out << "ConstantFP::get(mod->getContext(), "; - Out << "APFloat("; +// printCFP - Print a floating point constant .. very carefully :) +// This makes sure that conversion to/from floating yields the same binary +// result so that we don't lose precision. 
+void CppWriter::printCFP(const ConstantFP *CFP) { + bool ignored; + APFloat APF = APFloat(CFP->getValueAPF()); // copy + if (CFP->getType() == Type::getFloatTy(CFP->getContext())) + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); + Out << "ConstantFP::get(mod->getContext(), "; + Out << "APFloat("; #if HAVE_PRINTF_A - char Buffer[100]; - sprintf(Buffer, "%A", APF.convertToDouble()); - if ((!strncmp(Buffer, "0x", 2) || - !strncmp(Buffer, "-0x", 3) || - !strncmp(Buffer, "+0x", 3)) && - APF.bitwiseIsEqual(APFloat(atof(Buffer)))) { - if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) - Out << "BitsToDouble(" << Buffer << ")"; - else - Out << "BitsToFloat((float)" << Buffer << ")"; - Out << ")"; - } else { + char Buffer[100]; + sprintf(Buffer, "%A", APF.convertToDouble()); + if ((!strncmp(Buffer, "0x", 2) || + !strncmp(Buffer, "-0x", 3) || + !strncmp(Buffer, "+0x", 3)) && + APF.bitwiseIsEqual(APFloat(atof(Buffer)))) { + if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) + Out << "BitsToDouble(" << Buffer << ")"; + else + Out << "BitsToFloat((float)" << Buffer << ")"; + Out << ")"; + } else { #endif - std::string StrVal = ftostr(CFP->getValueAPF()); - - while (StrVal[0] == ' ') - StrVal.erase(StrVal.begin()); - - // Check to make sure that the stringized number is not some string like - // "Inf" or NaN. Check that the string matches the "[-+]?[0-9]" regex. - if (((StrVal[0] >= '0' && StrVal[0] <= '9') || - ((StrVal[0] == '-' || StrVal[0] == '+') && - (StrVal[1] >= '0' && StrVal[1] <= '9'))) && - (CFP->isExactlyValue(atof(StrVal.c_str())))) { - if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) - Out << StrVal; - else - Out << StrVal << "f"; - } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) - Out << "BitsToDouble(0x" - << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue()) - << "ULL) /* " << StrVal << " */"; + std::string StrVal = ftostr(CFP->getValueAPF()); + + while (StrVal[0] == ' ') + StrVal.erase(StrVal.begin()); + + // Check to make sure that the stringized number is not some string like + // "Inf" or NaN. Check that the string matches the "[-+]?[0-9]" regex. + if (((StrVal[0] >= '0' && StrVal[0] <= '9') || + ((StrVal[0] == '-' || StrVal[0] == '+') && + (StrVal[1] >= '0' && StrVal[1] <= '9'))) && + (CFP->isExactlyValue(atof(StrVal.c_str())))) { + if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) + Out << StrVal; else - Out << "BitsToFloat(0x" - << utohexstr((uint32_t)CFP->getValueAPF(). - bitcastToAPInt().getZExtValue()) - << "U) /* " << StrVal << " */"; - Out << ")"; + Out << StrVal << "f"; + } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) + Out << "BitsToDouble(0x" + << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue()) + << "ULL) /* " << StrVal << " */"; + else + Out << "BitsToFloat(0x" + << utohexstr((uint32_t)CFP->getValueAPF(). + bitcastToAPInt().getZExtValue()) + << "U) /* " << StrVal << " */"; + Out << ")"; #if HAVE_PRINTF_A - } + } #endif - Out << ")"; + Out << ")"; +} + +void CppWriter::printCallingConv(CallingConv::ID cc){ + // Print the calling convention. + switch (cc) { + case CallingConv::C: Out << "CallingConv::C"; break; + case CallingConv::Fast: Out << "CallingConv::Fast"; break; + case CallingConv::Cold: Out << "CallingConv::Cold"; break; + case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break; + default: Out << cc; break; } +} - void CppWriter::printCallingConv(CallingConv::ID cc){ - // Print the calling convention. 
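printCFP emits a plain decimal literal only when the stringized value parses back to the identical bits; anything that fails the round-trip check (NaNs, for instance) falls back to the raw bit pattern. A hedged illustration of the fallback output, using a hypothetical quiet-NaN float:

    // Generated C++ for a value whose decimal form does not round-trip:
    //   ConstantFP::get(mod->getContext(),
    //                   APFloat(BitsToFloat(0x7FC00000U) /* NaN */))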
- switch (cc) { - case CallingConv::C: Out << "CallingConv::C"; break; - case CallingConv::Fast: Out << "CallingConv::Fast"; break; - case CallingConv::Cold: Out << "CallingConv::Cold"; break; - case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break; - default: Out << cc; break; - } +void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) { + switch (LT) { + case GlobalValue::InternalLinkage: + Out << "GlobalValue::InternalLinkage"; break; + case GlobalValue::PrivateLinkage: + Out << "GlobalValue::PrivateLinkage"; break; + case GlobalValue::LinkerPrivateLinkage: + Out << "GlobalValue::LinkerPrivateLinkage"; break; + case GlobalValue::LinkerPrivateWeakLinkage: + Out << "GlobalValue::LinkerPrivateWeakLinkage"; break; + case GlobalValue::AvailableExternallyLinkage: + Out << "GlobalValue::AvailableExternallyLinkage "; break; + case GlobalValue::LinkOnceAnyLinkage: + Out << "GlobalValue::LinkOnceAnyLinkage "; break; + case GlobalValue::LinkOnceODRLinkage: + Out << "GlobalValue::LinkOnceODRLinkage "; break; + case GlobalValue::WeakAnyLinkage: + Out << "GlobalValue::WeakAnyLinkage"; break; + case GlobalValue::WeakODRLinkage: + Out << "GlobalValue::WeakODRLinkage"; break; + case GlobalValue::AppendingLinkage: + Out << "GlobalValue::AppendingLinkage"; break; + case GlobalValue::ExternalLinkage: + Out << "GlobalValue::ExternalLinkage"; break; + case GlobalValue::DLLImportLinkage: + Out << "GlobalValue::DLLImportLinkage"; break; + case GlobalValue::DLLExportLinkage: + Out << "GlobalValue::DLLExportLinkage"; break; + case GlobalValue::ExternalWeakLinkage: + Out << "GlobalValue::ExternalWeakLinkage"; break; + case GlobalValue::CommonLinkage: + Out << "GlobalValue::CommonLinkage"; break; } +} - void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) { - switch (LT) { - case GlobalValue::InternalLinkage: - Out << "GlobalValue::InternalLinkage"; break; - case GlobalValue::PrivateLinkage: - Out << "GlobalValue::PrivateLinkage"; break; - case GlobalValue::LinkerPrivateLinkage: - Out << "GlobalValue::LinkerPrivateLinkage"; break; - case GlobalValue::AvailableExternallyLinkage: - Out << "GlobalValue::AvailableExternallyLinkage "; break; - case GlobalValue::LinkOnceAnyLinkage: - Out << "GlobalValue::LinkOnceAnyLinkage "; break; - case GlobalValue::LinkOnceODRLinkage: - Out << "GlobalValue::LinkOnceODRLinkage "; break; - case GlobalValue::WeakAnyLinkage: - Out << "GlobalValue::WeakAnyLinkage"; break; - case GlobalValue::WeakODRLinkage: - Out << "GlobalValue::WeakODRLinkage"; break; - case GlobalValue::AppendingLinkage: - Out << "GlobalValue::AppendingLinkage"; break; - case GlobalValue::ExternalLinkage: - Out << "GlobalValue::ExternalLinkage"; break; - case GlobalValue::DLLImportLinkage: - Out << "GlobalValue::DLLImportLinkage"; break; - case GlobalValue::DLLExportLinkage: - Out << "GlobalValue::DLLExportLinkage"; break; - case GlobalValue::ExternalWeakLinkage: - Out << "GlobalValue::ExternalWeakLinkage"; break; - case GlobalValue::CommonLinkage: - Out << "GlobalValue::CommonLinkage"; break; - } +void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) { + switch (VisType) { + default: llvm_unreachable("Unknown GVar visibility"); + case GlobalValue::DefaultVisibility: + Out << "GlobalValue::DefaultVisibility"; + break; + case GlobalValue::HiddenVisibility: + Out << "GlobalValue::HiddenVisibility"; + break; + case GlobalValue::ProtectedVisibility: + Out << "GlobalValue::ProtectedVisibility"; + break; } +} - void 
CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) { - switch (VisType) { - default: llvm_unreachable("Unknown GVar visibility"); - case GlobalValue::DefaultVisibility: - Out << "GlobalValue::DefaultVisibility"; - break; - case GlobalValue::HiddenVisibility: - Out << "GlobalValue::HiddenVisibility"; - break; - case GlobalValue::ProtectedVisibility: - Out << "GlobalValue::ProtectedVisibility"; - break; +// printEscapedString - Print each character of the specified string, escaping +// it if it is not printable or if it is an escape char. +void CppWriter::printEscapedString(const std::string &Str) { + for (unsigned i = 0, e = Str.size(); i != e; ++i) { + unsigned char C = Str[i]; + if (isprint(C) && C != '"' && C != '\\') { + Out << C; + } else { + Out << "\\x" + << (char) ((C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')) + << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A')); } } +} - // printEscapedString - Print each character of the specified string, escaping - // it if it is not printable or if it is an escape char. - void CppWriter::printEscapedString(const std::string &Str) { - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - unsigned char C = Str[i]; - if (isprint(C) && C != '"' && C != '\\') { - Out << C; - } else { - Out << "\\x" - << (char) ((C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')) - << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A')); - } +std::string CppWriter::getCppName(const Type* Ty) { + // First, handle the primitive types .. easy + if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { + switch (Ty->getTypeID()) { + case Type::VoidTyID: return "Type::getVoidTy(mod->getContext())"; + case Type::IntegerTyID: { + unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); + return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")"; + } + case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(mod->getContext())"; + case Type::FloatTyID: return "Type::getFloatTy(mod->getContext())"; + case Type::DoubleTyID: return "Type::getDoubleTy(mod->getContext())"; + case Type::LabelTyID: return "Type::getLabelTy(mod->getContext())"; + default: + error("Invalid primitive type"); + break; } + // shouldn't be returned, but make it sensible + return "Type::getVoidTy(mod->getContext())"; } - std::string CppWriter::getCppName(const Type* Ty) { - // First, handle the primitive types .. easy - if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { - switch (Ty->getTypeID()) { - case Type::VoidTyID: return "Type::getVoidTy(mod->getContext())"; - case Type::IntegerTyID: { - unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); - return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")"; - } - case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(mod->getContext())"; - case Type::FloatTyID: return "Type::getFloatTy(mod->getContext())"; - case Type::DoubleTyID: return "Type::getDoubleTy(mod->getContext())"; - case Type::LabelTyID: return "Type::getLabelTy(mod->getContext())"; - default: - error("Invalid primitive type"); - break; - } - // shouldn't be returned, but make it sensible - return "Type::getVoidTy(mod->getContext())"; - } + // Now, see if we've seen the type before and return that + TypeMap::iterator I = TypeNames.find(Ty); + if (I != TypeNames.end()) + return I->second; + + // Okay, let's build a new name for this type. 
Start with a prefix + const char* prefix = 0; + switch (Ty->getTypeID()) { + case Type::FunctionTyID: prefix = "FuncTy_"; break; + case Type::StructTyID: prefix = "StructTy_"; break; + case Type::ArrayTyID: prefix = "ArrayTy_"; break; + case Type::PointerTyID: prefix = "PointerTy_"; break; + case Type::OpaqueTyID: prefix = "OpaqueTy_"; break; + case Type::VectorTyID: prefix = "VectorTy_"; break; + default: prefix = "OtherTy_"; break; // prevent breakage + } - // Now, see if we've seen the type before and return that - TypeMap::iterator I = TypeNames.find(Ty); - if (I != TypeNames.end()) - return I->second; + // See if the type has a name in the symboltable and build accordingly + const std::string* tName = findTypeName(TheModule->getTypeSymbolTable(), Ty); + std::string name; + if (tName) + name = std::string(prefix) + *tName; + else + name = std::string(prefix) + utostr(uniqueNum++); + sanitize(name); + + // Save the name + return TypeNames[Ty] = name; +} - // Okay, let's build a new name for this type. Start with a prefix - const char* prefix = 0; - switch (Ty->getTypeID()) { - case Type::FunctionTyID: prefix = "FuncTy_"; break; - case Type::StructTyID: prefix = "StructTy_"; break; - case Type::ArrayTyID: prefix = "ArrayTy_"; break; - case Type::PointerTyID: prefix = "PointerTy_"; break; - case Type::OpaqueTyID: prefix = "OpaqueTy_"; break; - case Type::VectorTyID: prefix = "VectorTy_"; break; - default: prefix = "OtherTy_"; break; // prevent breakage - } +void CppWriter::printCppName(const Type* Ty) { + printEscapedString(getCppName(Ty)); +} - // See if the type has a name in the symboltable and build accordingly - const std::string* tName = findTypeName(TheModule->getTypeSymbolTable(), Ty); - std::string name; - if (tName) - name = std::string(prefix) + *tName; - else - name = std::string(prefix) + utostr(uniqueNum++); - sanitize(name); - - // Save the name - return TypeNames[Ty] = name; - } - - void CppWriter::printCppName(const Type* Ty) { - printEscapedString(getCppName(Ty)); - } - - std::string CppWriter::getCppName(const Value* val) { - std::string name; - ValueMap::iterator I = ValueNames.find(val); - if (I != ValueNames.end() && I->first == val) - return I->second; - - if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) { - name = std::string("gvar_") + - getTypePrefix(GV->getType()->getElementType()); - } else if (isa<Function>(val)) { - name = std::string("func_"); - } else if (const Constant* C = dyn_cast<Constant>(val)) { - name = std::string("const_") + getTypePrefix(C->getType()); - } else if (const Argument* Arg = dyn_cast<Argument>(val)) { - if (is_inline) { - unsigned argNum = std::distance(Arg->getParent()->arg_begin(), - Function::const_arg_iterator(Arg)) + 1; - name = std::string("arg_") + utostr(argNum); - NameSet::iterator NI = UsedNames.find(name); - if (NI != UsedNames.end()) - name += std::string("_") + utostr(uniqueNum++); - UsedNames.insert(name); - return ValueNames[val] = name; - } else { - name = getTypePrefix(val->getType()); - } +std::string CppWriter::getCppName(const Value* val) { + std::string name; + ValueMap::iterator I = ValueNames.find(val); + if (I != ValueNames.end() && I->first == val) + return I->second; + + if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) { + name = std::string("gvar_") + + getTypePrefix(GV->getType()->getElementType()); + } else if (isa<Function>(val)) { + name = std::string("func_"); + } else if (const Constant* C = dyn_cast<Constant>(val)) { + name = std::string("const_") + 
getTypePrefix(C->getType()); + } else if (const Argument* Arg = dyn_cast<Argument>(val)) { + if (is_inline) { + unsigned argNum = std::distance(Arg->getParent()->arg_begin(), + Function::const_arg_iterator(Arg)) + 1; + name = std::string("arg_") + utostr(argNum); + NameSet::iterator NI = UsedNames.find(name); + if (NI != UsedNames.end()) + name += std::string("_") + utostr(uniqueNum++); + UsedNames.insert(name); + return ValueNames[val] = name; } else { name = getTypePrefix(val->getType()); } - if (val->hasName()) - name += val->getName(); - else - name += utostr(uniqueNum++); - sanitize(name); - NameSet::iterator NI = UsedNames.find(name); - if (NI != UsedNames.end()) - name += std::string("_") + utostr(uniqueNum++); - UsedNames.insert(name); - return ValueNames[val] = name; + } else { + name = getTypePrefix(val->getType()); } + if (val->hasName()) + name += val->getName(); + else + name += utostr(uniqueNum++); + sanitize(name); + NameSet::iterator NI = UsedNames.find(name); + if (NI != UsedNames.end()) + name += std::string("_") + utostr(uniqueNum++); + UsedNames.insert(name); + return ValueNames[val] = name; +} - void CppWriter::printCppName(const Value* val) { - printEscapedString(getCppName(val)); - } +void CppWriter::printCppName(const Value* val) { + printEscapedString(getCppName(val)); +} - void CppWriter::printAttributes(const AttrListPtr &PAL, - const std::string &name) { - Out << "AttrListPtr " << name << "_PAL;"; - nl(Out); - if (!PAL.isEmpty()) { - Out << '{'; in(); nl(Out); - Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out); - Out << "AttributeWithIndex PAWI;"; nl(Out); - for (unsigned i = 0; i < PAL.getNumSlots(); ++i) { - unsigned index = PAL.getSlot(i).Index; - Attributes attrs = PAL.getSlot(i).Attrs; - Out << "PAWI.Index = " << index << "U; PAWI.Attrs = 0 "; +void CppWriter::printAttributes(const AttrListPtr &PAL, + const std::string &name) { + Out << "AttrListPtr " << name << "_PAL;"; + nl(Out); + if (!PAL.isEmpty()) { + Out << '{'; in(); nl(Out); + Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out); + Out << "AttributeWithIndex PAWI;"; nl(Out); + for (unsigned i = 0; i < PAL.getNumSlots(); ++i) { + unsigned index = PAL.getSlot(i).Index; + Attributes attrs = PAL.getSlot(i).Attrs; + Out << "PAWI.Index = " << index << "U; PAWI.Attrs = 0 "; #define HANDLE_ATTR(X) \ - if (attrs & Attribute::X) \ - Out << " | Attribute::" #X; \ - attrs &= ~Attribute::X; - - HANDLE_ATTR(SExt); - HANDLE_ATTR(ZExt); - HANDLE_ATTR(NoReturn); - HANDLE_ATTR(InReg); - HANDLE_ATTR(StructRet); - HANDLE_ATTR(NoUnwind); - HANDLE_ATTR(NoAlias); - HANDLE_ATTR(ByVal); - HANDLE_ATTR(Nest); - HANDLE_ATTR(ReadNone); - HANDLE_ATTR(ReadOnly); - HANDLE_ATTR(InlineHint); - HANDLE_ATTR(NoInline); - HANDLE_ATTR(AlwaysInline); - HANDLE_ATTR(OptimizeForSize); - HANDLE_ATTR(StackProtect); - HANDLE_ATTR(StackProtectReq); - HANDLE_ATTR(NoCapture); + if (attrs & Attribute::X) \ + Out << " | Attribute::" #X; \ + attrs &= ~Attribute::X; + + HANDLE_ATTR(SExt); + HANDLE_ATTR(ZExt); + HANDLE_ATTR(NoReturn); + HANDLE_ATTR(InReg); + HANDLE_ATTR(StructRet); + HANDLE_ATTR(NoUnwind); + HANDLE_ATTR(NoAlias); + HANDLE_ATTR(ByVal); + HANDLE_ATTR(Nest); + HANDLE_ATTR(ReadNone); + HANDLE_ATTR(ReadOnly); + HANDLE_ATTR(InlineHint); + HANDLE_ATTR(NoInline); + HANDLE_ATTR(AlwaysInline); + HANDLE_ATTR(OptimizeForSize); + HANDLE_ATTR(StackProtect); + HANDLE_ATTR(StackProtectReq); + HANDLE_ATTR(NoCapture); #undef HANDLE_ATTR - assert(attrs == 0 && "Unhandled attribute!"); - Out << ";"; - nl(Out); - Out << 
"Attrs.push_back(PAWI);"; - nl(Out); - } - Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());"; + assert(attrs == 0 && "Unhandled attribute!"); + Out << ";"; + nl(Out); + Out << "Attrs.push_back(PAWI);"; nl(Out); - out(); nl(Out); - Out << '}'; nl(Out); } + Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());"; + nl(Out); + out(); nl(Out); + Out << '}'; nl(Out); } +} - bool CppWriter::printTypeInternal(const Type* Ty) { - // We don't print definitions for primitive types - if (Ty->isPrimitiveType() || Ty->isIntegerTy()) - return false; - - // If we already defined this type, we don't need to define it again. - if (DefinedTypes.find(Ty) != DefinedTypes.end()) - return false; - - // Everything below needs the name for the type so get it now. - std::string typeName(getCppName(Ty)); - - // Search the type stack for recursion. If we find it, then generate this - // as an OpaqueType, but make sure not to do this multiple times because - // the type could appear in multiple places on the stack. Once the opaque - // definition is issued, it must not be re-issued. Consequently we have to - // check the UnresolvedTypes list as well. - TypeList::const_iterator TI = std::find(TypeStack.begin(), TypeStack.end(), - Ty); - if (TI != TypeStack.end()) { - TypeMap::const_iterator I = UnresolvedTypes.find(Ty); - if (I == UnresolvedTypes.end()) { - Out << "PATypeHolder " << typeName; - Out << "_fwd = OpaqueType::get(mod->getContext());"; - nl(Out); - UnresolvedTypes[Ty] = typeName; - } - return true; - } +bool CppWriter::printTypeInternal(const Type* Ty) { + // We don't print definitions for primitive types + if (Ty->isPrimitiveType() || Ty->isIntegerTy()) + return false; - // We're going to print a derived type which, by definition, contains other - // types. So, push this one we're printing onto the type stack to assist with - // recursive definitions. - TypeStack.push_back(Ty); + // If we already defined this type, we don't need to define it again. + if (DefinedTypes.find(Ty) != DefinedTypes.end()) + return false; - // Print the type definition - switch (Ty->getTypeID()) { - case Type::FunctionTyID: { - const FunctionType* FT = cast<FunctionType>(Ty); - Out << "std::vector<const Type*>" << typeName << "_args;"; + // Everything below needs the name for the type so get it now. + std::string typeName(getCppName(Ty)); + + // Search the type stack for recursion. If we find it, then generate this + // as an OpaqueType, but make sure not to do this multiple times because + // the type could appear in multiple places on the stack. Once the opaque + // definition is issued, it must not be re-issued. Consequently we have to + // check the UnresolvedTypes list as well. 
+ TypeList::const_iterator TI = std::find(TypeStack.begin(), TypeStack.end(), + Ty); + if (TI != TypeStack.end()) { + TypeMap::const_iterator I = UnresolvedTypes.find(Ty); + if (I == UnresolvedTypes.end()) { + Out << "PATypeHolder " << typeName; + Out << "_fwd = OpaqueType::get(mod->getContext());"; nl(Out); - FunctionType::param_iterator PI = FT->param_begin(); - FunctionType::param_iterator PE = FT->param_end(); - for (; PI != PE; ++PI) { - const Type* argTy = static_cast<const Type*>(*PI); - bool isForward = printTypeInternal(argTy); - std::string argName(getCppName(argTy)); - Out << typeName << "_args.push_back(" << argName; - if (isForward) - Out << "_fwd"; - Out << ");"; - nl(Out); - } - bool isForward = printTypeInternal(FT->getReturnType()); - std::string retTypeName(getCppName(FT->getReturnType())); - Out << "FunctionType* " << typeName << " = FunctionType::get("; - in(); nl(Out) << "/*Result=*/" << retTypeName; + UnresolvedTypes[Ty] = typeName; + } + return true; + } + + // We're going to print a derived type which, by definition, contains other + // types. So, push this one we're printing onto the type stack to assist with + // recursive definitions. + TypeStack.push_back(Ty); + + // Print the type definition + switch (Ty->getTypeID()) { + case Type::FunctionTyID: { + const FunctionType* FT = cast<FunctionType>(Ty); + Out << "std::vector<const Type*>" << typeName << "_args;"; + nl(Out); + FunctionType::param_iterator PI = FT->param_begin(); + FunctionType::param_iterator PE = FT->param_end(); + for (; PI != PE; ++PI) { + const Type* argTy = static_cast<const Type*>(*PI); + bool isForward = printTypeInternal(argTy); + std::string argName(getCppName(argTy)); + Out << typeName << "_args.push_back(" << argName; if (isForward) Out << "_fwd"; - Out << ","; - nl(Out) << "/*Params=*/" << typeName << "_args,"; - nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");"; - out(); - nl(Out); - break; - } - case Type::StructTyID: { - const StructType* ST = cast<StructType>(Ty); - Out << "std::vector<const Type*>" << typeName << "_fields;"; - nl(Out); - StructType::element_iterator EI = ST->element_begin(); - StructType::element_iterator EE = ST->element_end(); - for (; EI != EE; ++EI) { - const Type* fieldTy = static_cast<const Type*>(*EI); - bool isForward = printTypeInternal(fieldTy); - std::string fieldName(getCppName(fieldTy)); - Out << typeName << "_fields.push_back(" << fieldName; - if (isForward) - Out << "_fwd"; - Out << ");"; - nl(Out); - } - Out << "StructType* " << typeName << " = StructType::get(" - << "mod->getContext(), " - << typeName << "_fields, /*isPacked=*/" - << (ST->isPacked() ? "true" : "false") << ");"; - nl(Out); - break; - } - case Type::ArrayTyID: { - const ArrayType* AT = cast<ArrayType>(Ty); - const Type* ET = AT->getElementType(); - bool isForward = printTypeInternal(ET); - std::string elemName(getCppName(ET)); - Out << "ArrayType* " << typeName << " = ArrayType::get(" - << elemName << (isForward ? "_fwd" : "") - << ", " << utostr(AT->getNumElements()) << ");"; - nl(Out); - break; - } - case Type::PointerTyID: { - const PointerType* PT = cast<PointerType>(Ty); - const Type* ET = PT->getElementType(); - bool isForward = printTypeInternal(ET); - std::string elemName(getCppName(ET)); - Out << "PointerType* " << typeName << " = PointerType::get(" - << elemName << (isForward ? 
"_fwd" : "") - << ", " << utostr(PT->getAddressSpace()) << ");"; - nl(Out); - break; - } - case Type::VectorTyID: { - const VectorType* PT = cast<VectorType>(Ty); - const Type* ET = PT->getElementType(); - bool isForward = printTypeInternal(ET); - std::string elemName(getCppName(ET)); - Out << "VectorType* " << typeName << " = VectorType::get(" - << elemName << (isForward ? "_fwd" : "") - << ", " << utostr(PT->getNumElements()) << ");"; - nl(Out); - break; - } - case Type::OpaqueTyID: { - Out << "OpaqueType* " << typeName; - Out << " = OpaqueType::get(mod->getContext());"; + Out << ");"; nl(Out); - break; - } - default: - error("Invalid TypeID"); } - - // If the type had a name, make sure we recreate it. - const std::string* progTypeName = - findTypeName(TheModule->getTypeSymbolTable(),Ty); - if (progTypeName) { - Out << "mod->addTypeName(\"" << *progTypeName << "\", " - << typeName << ");"; + bool isForward = printTypeInternal(FT->getReturnType()); + std::string retTypeName(getCppName(FT->getReturnType())); + Out << "FunctionType* " << typeName << " = FunctionType::get("; + in(); nl(Out) << "/*Result=*/" << retTypeName; + if (isForward) + Out << "_fwd"; + Out << ","; + nl(Out) << "/*Params=*/" << typeName << "_args,"; + nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");"; + out(); + nl(Out); + break; + } + case Type::StructTyID: { + const StructType* ST = cast<StructType>(Ty); + Out << "std::vector<const Type*>" << typeName << "_fields;"; + nl(Out); + StructType::element_iterator EI = ST->element_begin(); + StructType::element_iterator EE = ST->element_end(); + for (; EI != EE; ++EI) { + const Type* fieldTy = static_cast<const Type*>(*EI); + bool isForward = printTypeInternal(fieldTy); + std::string fieldName(getCppName(fieldTy)); + Out << typeName << "_fields.push_back(" << fieldName; + if (isForward) + Out << "_fwd"; + Out << ");"; nl(Out); } + Out << "StructType* " << typeName << " = StructType::get(" + << "mod->getContext(), " + << typeName << "_fields, /*isPacked=*/" + << (ST->isPacked() ? "true" : "false") << ");"; + nl(Out); + break; + } + case Type::ArrayTyID: { + const ArrayType* AT = cast<ArrayType>(Ty); + const Type* ET = AT->getElementType(); + bool isForward = printTypeInternal(ET); + std::string elemName(getCppName(ET)); + Out << "ArrayType* " << typeName << " = ArrayType::get(" + << elemName << (isForward ? "_fwd" : "") + << ", " << utostr(AT->getNumElements()) << ");"; + nl(Out); + break; + } + case Type::PointerTyID: { + const PointerType* PT = cast<PointerType>(Ty); + const Type* ET = PT->getElementType(); + bool isForward = printTypeInternal(ET); + std::string elemName(getCppName(ET)); + Out << "PointerType* " << typeName << " = PointerType::get(" + << elemName << (isForward ? "_fwd" : "") + << ", " << utostr(PT->getAddressSpace()) << ");"; + nl(Out); + break; + } + case Type::VectorTyID: { + const VectorType* PT = cast<VectorType>(Ty); + const Type* ET = PT->getElementType(); + bool isForward = printTypeInternal(ET); + std::string elemName(getCppName(ET)); + Out << "VectorType* " << typeName << " = VectorType::get(" + << elemName << (isForward ? "_fwd" : "") + << ", " << utostr(PT->getNumElements()) << ");"; + nl(Out); + break; + } + case Type::OpaqueTyID: { + Out << "OpaqueType* " << typeName; + Out << " = OpaqueType::get(mod->getContext());"; + nl(Out); + break; + } + default: + error("Invalid TypeID"); + } - // Pop us off the type stack - TypeStack.pop_back(); + // If the type had a name, make sure we recreate it. 
+ const std::string* progTypeName = + findTypeName(TheModule->getTypeSymbolTable(),Ty); + if (progTypeName) { + Out << "mod->addTypeName(\"" << *progTypeName << "\", " + << typeName << ");"; + nl(Out); + } - // Indicate that this type is now defined. - DefinedTypes.insert(Ty); + // Pop us off the type stack + TypeStack.pop_back(); - // Early resolve as many unresolved types as possible. Search the unresolved - // types map for the type we just printed. Now that its definition is complete - // we can resolve any previous references to it. This prevents a cascade of - // unresolved types. - TypeMap::iterator I = UnresolvedTypes.find(Ty); - if (I != UnresolvedTypes.end()) { - Out << "cast<OpaqueType>(" << I->second - << "_fwd.get())->refineAbstractTypeTo(" << I->second << ");"; - nl(Out); - Out << I->second << " = cast<"; - switch (Ty->getTypeID()) { - case Type::FunctionTyID: Out << "FunctionType"; break; - case Type::ArrayTyID: Out << "ArrayType"; break; - case Type::StructTyID: Out << "StructType"; break; - case Type::VectorTyID: Out << "VectorType"; break; - case Type::PointerTyID: Out << "PointerType"; break; - case Type::OpaqueTyID: Out << "OpaqueType"; break; - default: Out << "NoSuchDerivedType"; break; - } - Out << ">(" << I->second << "_fwd.get());"; - nl(Out); nl(Out); - UnresolvedTypes.erase(I); - } + // Indicate that this type is now defined. + DefinedTypes.insert(Ty); - // Finally, separate the type definition from other with a newline. + // Early resolve as many unresolved types as possible. Search the unresolved + // types map for the type we just printed. Now that its definition is complete + // we can resolve any previous references to it. This prevents a cascade of + // unresolved types. + TypeMap::iterator I = UnresolvedTypes.find(Ty); + if (I != UnresolvedTypes.end()) { + Out << "cast<OpaqueType>(" << I->second + << "_fwd.get())->refineAbstractTypeTo(" << I->second << ");"; nl(Out); - - // We weren't a recursive type - return false; + Out << I->second << " = cast<"; + switch (Ty->getTypeID()) { + case Type::FunctionTyID: Out << "FunctionType"; break; + case Type::ArrayTyID: Out << "ArrayType"; break; + case Type::StructTyID: Out << "StructType"; break; + case Type::VectorTyID: Out << "VectorType"; break; + case Type::PointerTyID: Out << "PointerType"; break; + case Type::OpaqueTyID: Out << "OpaqueType"; break; + default: Out << "NoSuchDerivedType"; break; + } + Out << ">(" << I->second << "_fwd.get());"; + nl(Out); nl(Out); + UnresolvedTypes.erase(I); } - // Prints a type definition. Returns true if it could not resolve all the - // types in the definition but had to use a forward reference. 
- void CppWriter::printType(const Type* Ty) { - assert(TypeStack.empty()); - TypeStack.clear(); - printTypeInternal(Ty); - assert(TypeStack.empty()); - } - - void CppWriter::printTypes(const Module* M) { - // Walk the symbol table and print out all its types - const TypeSymbolTable& symtab = M->getTypeSymbolTable(); - for (TypeSymbolTable::const_iterator TI = symtab.begin(), TE = symtab.end(); - TI != TE; ++TI) { - - // For primitive types and types already defined, just add a name - TypeMap::const_iterator TNI = TypeNames.find(TI->second); - if (TI->second->isIntegerTy() || TI->second->isPrimitiveType() || - TNI != TypeNames.end()) { - Out << "mod->addTypeName(\""; - printEscapedString(TI->first); - Out << "\", " << getCppName(TI->second) << ");"; - nl(Out); - // For everything else, define the type - } else { - printType(TI->second); - } - } + // Finally, separate the type definition from other with a newline. + nl(Out); - // Add all of the global variables to the value table... - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) { - if (I->hasInitializer()) - printType(I->getInitializer()->getType()); - printType(I->getType()); + // We weren't a recursive type + return false; +} + +// Prints a type definition. Returns true if it could not resolve all the +// types in the definition but had to use a forward reference. +void CppWriter::printType(const Type* Ty) { + assert(TypeStack.empty()); + TypeStack.clear(); + printTypeInternal(Ty); + assert(TypeStack.empty()); +} + +void CppWriter::printTypes(const Module* M) { + // Walk the symbol table and print out all its types + const TypeSymbolTable& symtab = M->getTypeSymbolTable(); + for (TypeSymbolTable::const_iterator TI = symtab.begin(), TE = symtab.end(); + TI != TE; ++TI) { + + // For primitive types and types already defined, just add a name + TypeMap::const_iterator TNI = TypeNames.find(TI->second); + if (TI->second->isIntegerTy() || TI->second->isPrimitiveType() || + TNI != TypeNames.end()) { + Out << "mod->addTypeName(\""; + printEscapedString(TI->first); + Out << "\", " << getCppName(TI->second) << ");"; + nl(Out); + // For everything else, define the type + } else { + printType(TI->second); } + } - // Add all the functions to the table - for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end(); - FI != FE; ++FI) { - printType(FI->getReturnType()); - printType(FI->getFunctionType()); - // Add all the function arguments - for (Function::const_arg_iterator AI = FI->arg_begin(), - AE = FI->arg_end(); AI != AE; ++AI) { - printType(AI->getType()); - } + // Add all of the global variables to the value table... 
+ for (Module::const_global_iterator I = TheModule->global_begin(),
+ E = TheModule->global_end(); I != E; ++I) {
+ if (I->hasInitializer())
+ printType(I->getInitializer()->getType());
+ printType(I->getType());
+ }
- // Add all of the basic blocks and instructions
- for (Function::const_iterator BB = FI->begin(),
- E = FI->end(); BB != E; ++BB) {
- printType(BB->getType());
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
- ++I) {
- printType(I->getType());
- for (unsigned i = 0; i < I->getNumOperands(); ++i)
- printType(I->getOperand(i)->getType());
- }
+ // Add all the functions to the table
+ for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+ FI != FE; ++FI) {
+ printType(FI->getReturnType());
+ printType(FI->getFunctionType());
+ // Add all the function arguments
+ for (Function::const_arg_iterator AI = FI->arg_begin(),
+ AE = FI->arg_end(); AI != AE; ++AI) {
+ printType(AI->getType());
+ }
+
+ // Add all of the basic blocks and instructions
+ for (Function::const_iterator BB = FI->begin(),
+ E = FI->end(); BB != E; ++BB) {
+ printType(BB->getType());
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+ ++I) {
+ printType(I->getType());
+ for (unsigned i = 0; i < I->getNumOperands(); ++i)
+ printType(I->getOperand(i)->getType());
}
}
}
+}
- // printConstant - Print out a constant pool entry...
- void CppWriter::printConstant(const Constant *CV) {
- // First, if the constant is actually a GlobalValue (variable or function)
- // or its already in the constant list then we've printed it already and we
- // can just return.
+// printConstant - Print out a constant pool entry...
+void CppWriter::printConstant(const Constant *CV) {
+ // First, if the constant is actually a GlobalValue (variable or function)
+ // or it's already in the constant list then we've printed it already and we
+ // can just return.
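
For a concrete flavor of printConstant's output: an i32 constant 5 becomes a line of the following shape in the generated program (const_int32_5 is an illustrative guess at the name getCppName would hand out).

    ConstantInt* const_int32_5 = ConstantInt::get(mod->getContext(),
        APInt(32, StringRef("5"), 10));
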
+ if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end()) + return; - std::string constName(getCppName(CV)); - std::string typeName(getCppName(CV->getType())); + std::string constName(getCppName(CV)); + std::string typeName(getCppName(CV->getType())); - if (isa<GlobalValue>(CV)) { - // Skip variables and functions, we emit them elsewhere - return; - } + if (isa<GlobalValue>(CV)) { + // Skip variables and functions, we emit them elsewhere + return; + } - if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { - std::string constValue = CI->getValue().toString(10, true); - Out << "ConstantInt* " << constName - << " = ConstantInt::get(mod->getContext(), APInt(" - << cast<IntegerType>(CI->getType())->getBitWidth() - << ", StringRef(\"" << constValue << "\"), 10));"; - } else if (isa<ConstantAggregateZero>(CV)) { - Out << "ConstantAggregateZero* " << constName - << " = ConstantAggregateZero::get(" << typeName << ");"; - } else if (isa<ConstantPointerNull>(CV)) { - Out << "ConstantPointerNull* " << constName - << " = ConstantPointerNull::get(" << typeName << ");"; - } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { - Out << "ConstantFP* " << constName << " = "; - printCFP(CFP); - Out << ";"; - } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) { - if (CA->isString() && - CA->getType()->getElementType() == - Type::getInt8Ty(CA->getContext())) { - Out << "Constant* " << constName << - " = ConstantArray::get(mod->getContext(), \""; - std::string tmp = CA->getAsString(); - bool nullTerminate = false; - if (tmp[tmp.length()-1] == 0) { - tmp.erase(tmp.length()-1); - nullTerminate = true; - } - printEscapedString(tmp); - // Determine if we want null termination or not. - if (nullTerminate) - Out << "\", true"; // Indicate that the null terminator should be - // added. 
- else - Out << "\", false";// No null terminator - Out << ");"; - } else { - Out << "std::vector<Constant*> " << constName << "_elems;"; - nl(Out); - unsigned N = CA->getNumOperands(); - for (unsigned i = 0; i < N; ++i) { - printConstant(CA->getOperand(i)); // recurse to print operands - Out << constName << "_elems.push_back(" - << getCppName(CA->getOperand(i)) << ");"; - nl(Out); - } - Out << "Constant* " << constName << " = ConstantArray::get(" - << typeName << ", " << constName << "_elems);"; - } - } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) { - Out << "std::vector<Constant*> " << constName << "_fields;"; - nl(Out); - unsigned N = CS->getNumOperands(); - for (unsigned i = 0; i < N; i++) { - printConstant(CS->getOperand(i)); - Out << constName << "_fields.push_back(" - << getCppName(CS->getOperand(i)) << ");"; - nl(Out); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { + std::string constValue = CI->getValue().toString(10, true); + Out << "ConstantInt* " << constName + << " = ConstantInt::get(mod->getContext(), APInt(" + << cast<IntegerType>(CI->getType())->getBitWidth() + << ", StringRef(\"" << constValue << "\"), 10));"; + } else if (isa<ConstantAggregateZero>(CV)) { + Out << "ConstantAggregateZero* " << constName + << " = ConstantAggregateZero::get(" << typeName << ");"; + } else if (isa<ConstantPointerNull>(CV)) { + Out << "ConstantPointerNull* " << constName + << " = ConstantPointerNull::get(" << typeName << ");"; + } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { + Out << "ConstantFP* " << constName << " = "; + printCFP(CFP); + Out << ";"; + } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) { + if (CA->isString() && + CA->getType()->getElementType() == + Type::getInt8Ty(CA->getContext())) { + Out << "Constant* " << constName << + " = ConstantArray::get(mod->getContext(), \""; + std::string tmp = CA->getAsString(); + bool nullTerminate = false; + if (tmp[tmp.length()-1] == 0) { + tmp.erase(tmp.length()-1); + nullTerminate = true; } - Out << "Constant* " << constName << " = ConstantStruct::get(" - << typeName << ", " << constName << "_fields);"; - } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) { + printEscapedString(tmp); + // Determine if we want null termination or not. + if (nullTerminate) + Out << "\", true"; // Indicate that the null terminator should be + // added. 
+ else + Out << "\", false";// No null terminator + Out << ");"; + } else { Out << "std::vector<Constant*> " << constName << "_elems;"; nl(Out); - unsigned N = CP->getNumOperands(); + unsigned N = CA->getNumOperands(); for (unsigned i = 0; i < N; ++i) { - printConstant(CP->getOperand(i)); + printConstant(CA->getOperand(i)); // recurse to print operands Out << constName << "_elems.push_back(" - << getCppName(CP->getOperand(i)) << ");"; + << getCppName(CA->getOperand(i)) << ");"; nl(Out); } - Out << "Constant* " << constName << " = ConstantVector::get(" + Out << "Constant* " << constName << " = ConstantArray::get(" << typeName << ", " << constName << "_elems);"; - } else if (isa<UndefValue>(CV)) { - Out << "UndefValue* " << constName << " = UndefValue::get(" - << typeName << ");"; - } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { - if (CE->getOpcode() == Instruction::GetElementPtr) { - Out << "std::vector<Constant*> " << constName << "_indices;"; + } + } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) { + Out << "std::vector<Constant*> " << constName << "_fields;"; + nl(Out); + unsigned N = CS->getNumOperands(); + for (unsigned i = 0; i < N; i++) { + printConstant(CS->getOperand(i)); + Out << constName << "_fields.push_back(" + << getCppName(CS->getOperand(i)) << ");"; + nl(Out); + } + Out << "Constant* " << constName << " = ConstantStruct::get(" + << typeName << ", " << constName << "_fields);"; + } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) { + Out << "std::vector<Constant*> " << constName << "_elems;"; + nl(Out); + unsigned N = CP->getNumOperands(); + for (unsigned i = 0; i < N; ++i) { + printConstant(CP->getOperand(i)); + Out << constName << "_elems.push_back(" + << getCppName(CP->getOperand(i)) << ");"; + nl(Out); + } + Out << "Constant* " << constName << " = ConstantVector::get(" + << typeName << ", " << constName << "_elems);"; + } else if (isa<UndefValue>(CV)) { + Out << "UndefValue* " << constName << " = UndefValue::get(" + << typeName << ");"; + } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { + if (CE->getOpcode() == Instruction::GetElementPtr) { + Out << "std::vector<Constant*> " << constName << "_indices;"; + nl(Out); + printConstant(CE->getOperand(0)); + for (unsigned i = 1; i < CE->getNumOperands(); ++i ) { + printConstant(CE->getOperand(i)); + Out << constName << "_indices.push_back(" + << getCppName(CE->getOperand(i)) << ");"; nl(Out); - printConstant(CE->getOperand(0)); - for (unsigned i = 1; i < CE->getNumOperands(); ++i ) { - printConstant(CE->getOperand(i)); - Out << constName << "_indices.push_back(" - << getCppName(CE->getOperand(i)) << ");"; - nl(Out); - } - Out << "Constant* " << constName - << " = ConstantExpr::getGetElementPtr(" - << getCppName(CE->getOperand(0)) << ", " - << "&" << constName << "_indices[0], " - << constName << "_indices.size()" - << ");"; - } else if (CE->isCast()) { - printConstant(CE->getOperand(0)); - Out << "Constant* " << constName << " = ConstantExpr::getCast("; - switch (CE->getOpcode()) { - default: llvm_unreachable("Invalid cast opcode"); - case Instruction::Trunc: Out << "Instruction::Trunc"; break; - case Instruction::ZExt: Out << "Instruction::ZExt"; break; - case Instruction::SExt: Out << "Instruction::SExt"; break; - case Instruction::FPTrunc: Out << "Instruction::FPTrunc"; break; - case Instruction::FPExt: Out << "Instruction::FPExt"; break; - case Instruction::FPToUI: Out << "Instruction::FPToUI"; break; - case Instruction::FPToSI: Out << 
"Instruction::FPToSI"; break; - case Instruction::UIToFP: Out << "Instruction::UIToFP"; break; - case Instruction::SIToFP: Out << "Instruction::SIToFP"; break; - case Instruction::PtrToInt: Out << "Instruction::PtrToInt"; break; - case Instruction::IntToPtr: Out << "Instruction::IntToPtr"; break; - case Instruction::BitCast: Out << "Instruction::BitCast"; break; - } - Out << ", " << getCppName(CE->getOperand(0)) << ", " - << getCppName(CE->getType()) << ");"; - } else { - unsigned N = CE->getNumOperands(); - for (unsigned i = 0; i < N; ++i ) { - printConstant(CE->getOperand(i)); + } + Out << "Constant* " << constName + << " = ConstantExpr::getGetElementPtr(" + << getCppName(CE->getOperand(0)) << ", " + << "&" << constName << "_indices[0], " + << constName << "_indices.size()" + << ");"; + } else if (CE->isCast()) { + printConstant(CE->getOperand(0)); + Out << "Constant* " << constName << " = ConstantExpr::getCast("; + switch (CE->getOpcode()) { + default: llvm_unreachable("Invalid cast opcode"); + case Instruction::Trunc: Out << "Instruction::Trunc"; break; + case Instruction::ZExt: Out << "Instruction::ZExt"; break; + case Instruction::SExt: Out << "Instruction::SExt"; break; + case Instruction::FPTrunc: Out << "Instruction::FPTrunc"; break; + case Instruction::FPExt: Out << "Instruction::FPExt"; break; + case Instruction::FPToUI: Out << "Instruction::FPToUI"; break; + case Instruction::FPToSI: Out << "Instruction::FPToSI"; break; + case Instruction::UIToFP: Out << "Instruction::UIToFP"; break; + case Instruction::SIToFP: Out << "Instruction::SIToFP"; break; + case Instruction::PtrToInt: Out << "Instruction::PtrToInt"; break; + case Instruction::IntToPtr: Out << "Instruction::IntToPtr"; break; + case Instruction::BitCast: Out << "Instruction::BitCast"; break; + } + Out << ", " << getCppName(CE->getOperand(0)) << ", " + << getCppName(CE->getType()) << ");"; + } else { + unsigned N = CE->getNumOperands(); + for (unsigned i = 0; i < N; ++i ) { + printConstant(CE->getOperand(i)); + } + Out << "Constant* " << constName << " = ConstantExpr::"; + switch (CE->getOpcode()) { + case Instruction::Add: Out << "getAdd("; break; + case Instruction::FAdd: Out << "getFAdd("; break; + case Instruction::Sub: Out << "getSub("; break; + case Instruction::FSub: Out << "getFSub("; break; + case Instruction::Mul: Out << "getMul("; break; + case Instruction::FMul: Out << "getFMul("; break; + case Instruction::UDiv: Out << "getUDiv("; break; + case Instruction::SDiv: Out << "getSDiv("; break; + case Instruction::FDiv: Out << "getFDiv("; break; + case Instruction::URem: Out << "getURem("; break; + case Instruction::SRem: Out << "getSRem("; break; + case Instruction::FRem: Out << "getFRem("; break; + case Instruction::And: Out << "getAnd("; break; + case Instruction::Or: Out << "getOr("; break; + case Instruction::Xor: Out << "getXor("; break; + case Instruction::ICmp: + Out << "getICmp(ICmpInst::ICMP_"; + switch (CE->getPredicate()) { + case ICmpInst::ICMP_EQ: Out << "EQ"; break; + case ICmpInst::ICMP_NE: Out << "NE"; break; + case ICmpInst::ICMP_SLT: Out << "SLT"; break; + case ICmpInst::ICMP_ULT: Out << "ULT"; break; + case ICmpInst::ICMP_SGT: Out << "SGT"; break; + case ICmpInst::ICMP_UGT: Out << "UGT"; break; + case ICmpInst::ICMP_SLE: Out << "SLE"; break; + case ICmpInst::ICMP_ULE: Out << "ULE"; break; + case ICmpInst::ICMP_SGE: Out << "SGE"; break; + case ICmpInst::ICMP_UGE: Out << "UGE"; break; + default: error("Invalid ICmp Predicate"); } - Out << "Constant* " << constName << " = ConstantExpr::"; - switch 
(CE->getOpcode()) { - case Instruction::Add: Out << "getAdd("; break; - case Instruction::FAdd: Out << "getFAdd("; break; - case Instruction::Sub: Out << "getSub("; break; - case Instruction::FSub: Out << "getFSub("; break; - case Instruction::Mul: Out << "getMul("; break; - case Instruction::FMul: Out << "getFMul("; break; - case Instruction::UDiv: Out << "getUDiv("; break; - case Instruction::SDiv: Out << "getSDiv("; break; - case Instruction::FDiv: Out << "getFDiv("; break; - case Instruction::URem: Out << "getURem("; break; - case Instruction::SRem: Out << "getSRem("; break; - case Instruction::FRem: Out << "getFRem("; break; - case Instruction::And: Out << "getAnd("; break; - case Instruction::Or: Out << "getOr("; break; - case Instruction::Xor: Out << "getXor("; break; - case Instruction::ICmp: - Out << "getICmp(ICmpInst::ICMP_"; - switch (CE->getPredicate()) { - case ICmpInst::ICMP_EQ: Out << "EQ"; break; - case ICmpInst::ICMP_NE: Out << "NE"; break; - case ICmpInst::ICMP_SLT: Out << "SLT"; break; - case ICmpInst::ICMP_ULT: Out << "ULT"; break; - case ICmpInst::ICMP_SGT: Out << "SGT"; break; - case ICmpInst::ICMP_UGT: Out << "UGT"; break; - case ICmpInst::ICMP_SLE: Out << "SLE"; break; - case ICmpInst::ICMP_ULE: Out << "ULE"; break; - case ICmpInst::ICMP_SGE: Out << "SGE"; break; - case ICmpInst::ICMP_UGE: Out << "UGE"; break; - default: error("Invalid ICmp Predicate"); - } - break; - case Instruction::FCmp: - Out << "getFCmp(FCmpInst::FCMP_"; - switch (CE->getPredicate()) { - case FCmpInst::FCMP_FALSE: Out << "FALSE"; break; - case FCmpInst::FCMP_ORD: Out << "ORD"; break; - case FCmpInst::FCMP_UNO: Out << "UNO"; break; - case FCmpInst::FCMP_OEQ: Out << "OEQ"; break; - case FCmpInst::FCMP_UEQ: Out << "UEQ"; break; - case FCmpInst::FCMP_ONE: Out << "ONE"; break; - case FCmpInst::FCMP_UNE: Out << "UNE"; break; - case FCmpInst::FCMP_OLT: Out << "OLT"; break; - case FCmpInst::FCMP_ULT: Out << "ULT"; break; - case FCmpInst::FCMP_OGT: Out << "OGT"; break; - case FCmpInst::FCMP_UGT: Out << "UGT"; break; - case FCmpInst::FCMP_OLE: Out << "OLE"; break; - case FCmpInst::FCMP_ULE: Out << "ULE"; break; - case FCmpInst::FCMP_OGE: Out << "OGE"; break; - case FCmpInst::FCMP_UGE: Out << "UGE"; break; - case FCmpInst::FCMP_TRUE: Out << "TRUE"; break; - default: error("Invalid FCmp Predicate"); - } - break; - case Instruction::Shl: Out << "getShl("; break; - case Instruction::LShr: Out << "getLShr("; break; - case Instruction::AShr: Out << "getAShr("; break; - case Instruction::Select: Out << "getSelect("; break; - case Instruction::ExtractElement: Out << "getExtractElement("; break; - case Instruction::InsertElement: Out << "getInsertElement("; break; - case Instruction::ShuffleVector: Out << "getShuffleVector("; break; - default: - error("Invalid constant expression"); - break; + break; + case Instruction::FCmp: + Out << "getFCmp(FCmpInst::FCMP_"; + switch (CE->getPredicate()) { + case FCmpInst::FCMP_FALSE: Out << "FALSE"; break; + case FCmpInst::FCMP_ORD: Out << "ORD"; break; + case FCmpInst::FCMP_UNO: Out << "UNO"; break; + case FCmpInst::FCMP_OEQ: Out << "OEQ"; break; + case FCmpInst::FCMP_UEQ: Out << "UEQ"; break; + case FCmpInst::FCMP_ONE: Out << "ONE"; break; + case FCmpInst::FCMP_UNE: Out << "UNE"; break; + case FCmpInst::FCMP_OLT: Out << "OLT"; break; + case FCmpInst::FCMP_ULT: Out << "ULT"; break; + case FCmpInst::FCMP_OGT: Out << "OGT"; break; + case FCmpInst::FCMP_UGT: Out << "UGT"; break; + case FCmpInst::FCMP_OLE: Out << "OLE"; break; + case FCmpInst::FCMP_ULE: Out << "ULE"; break; + 
case FCmpInst::FCMP_OGE: Out << "OGE"; break; + case FCmpInst::FCMP_UGE: Out << "UGE"; break; + case FCmpInst::FCMP_TRUE: Out << "TRUE"; break; + default: error("Invalid FCmp Predicate"); } - Out << getCppName(CE->getOperand(0)); - for (unsigned i = 1; i < CE->getNumOperands(); ++i) - Out << ", " << getCppName(CE->getOperand(i)); - Out << ");"; + break; + case Instruction::Shl: Out << "getShl("; break; + case Instruction::LShr: Out << "getLShr("; break; + case Instruction::AShr: Out << "getAShr("; break; + case Instruction::Select: Out << "getSelect("; break; + case Instruction::ExtractElement: Out << "getExtractElement("; break; + case Instruction::InsertElement: Out << "getInsertElement("; break; + case Instruction::ShuffleVector: Out << "getShuffleVector("; break; + default: + error("Invalid constant expression"); + break; } - } else { - error("Bad Constant"); - Out << "Constant* " << constName << " = 0; "; + Out << getCppName(CE->getOperand(0)); + for (unsigned i = 1; i < CE->getNumOperands(); ++i) + Out << ", " << getCppName(CE->getOperand(i)); + Out << ");"; } - nl(Out); + } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) { + Out << "Constant* " << constName << " = "; + Out << "BlockAddress::get(" << getOpName(BA->getBasicBlock()) << ");"; + } else { + error("Bad Constant"); + Out << "Constant* " << constName << " = 0; "; } + nl(Out); +} - void CppWriter::printConstants(const Module* M) { - // Traverse all the global variables looking for constant initializers - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) - if (I->hasInitializer()) - printConstant(I->getInitializer()); - - // Traverse the LLVM functions looking for constants - for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end(); - FI != FE; ++FI) { - // Add all of the basic blocks and instructions - for (Function::const_iterator BB = FI->begin(), - E = FI->end(); BB != E; ++BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; - ++I) { - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) { - printConstant(C); - } +void CppWriter::printConstants(const Module* M) { + // Traverse all the global variables looking for constant initializers + for (Module::const_global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) + if (I->hasInitializer()) + printConstant(I->getInitializer()); + + // Traverse the LLVM functions looking for constants + for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end(); + FI != FE; ++FI) { + // Add all of the basic blocks and instructions + for (Function::const_iterator BB = FI->begin(), + E = FI->end(); BB != E; ++BB) { + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; + ++I) { + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) { + printConstant(C); } } } } } +} - void CppWriter::printVariableUses(const GlobalVariable *GV) { - nl(Out) << "// Type Definitions"; - nl(Out); - printType(GV->getType()); - if (GV->hasInitializer()) { - Constant *Init = GV->getInitializer(); - printType(Init->getType()); - if (Function *F = dyn_cast<Function>(Init)) { - nl(Out)<< "/ Function Declarations"; nl(Out); - printFunctionHead(F); - } else if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) { - nl(Out) << "// Global Variable Declarations"; nl(Out); - printVariableHead(gv); - - nl(Out) << "// Global Variable 
Definitions"; nl(Out); - printVariableBody(gv); - } else { - nl(Out) << "// Constant Definitions"; nl(Out); - printConstant(Init); - } +void CppWriter::printVariableUses(const GlobalVariable *GV) { + nl(Out) << "// Type Definitions"; + nl(Out); + printType(GV->getType()); + if (GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + printType(Init->getType()); + if (Function *F = dyn_cast<Function>(Init)) { + nl(Out)<< "/ Function Declarations"; nl(Out); + printFunctionHead(F); + } else if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) { + nl(Out) << "// Global Variable Declarations"; nl(Out); + printVariableHead(gv); + + nl(Out) << "// Global Variable Definitions"; nl(Out); + printVariableBody(gv); + } else { + nl(Out) << "// Constant Definitions"; nl(Out); + printConstant(Init); } } +} - void CppWriter::printVariableHead(const GlobalVariable *GV) { - nl(Out) << "GlobalVariable* " << getCppName(GV); - if (is_inline) { - Out << " = mod->getGlobalVariable(mod->getContext(), "; - printEscapedString(GV->getName()); - Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)"; - nl(Out) << "if (!" << getCppName(GV) << ") {"; - in(); nl(Out) << getCppName(GV); - } - Out << " = new GlobalVariable(/*Module=*/*mod, "; - nl(Out) << "/*Type=*/"; - printCppName(GV->getType()->getElementType()); - Out << ","; - nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false"); - Out << ","; - nl(Out) << "/*Linkage=*/"; - printLinkageType(GV->getLinkage()); - Out << ","; - nl(Out) << "/*Initializer=*/0, "; - if (GV->hasInitializer()) { - Out << "// has initializer, specified below"; - } - nl(Out) << "/*Name=*/\""; +void CppWriter::printVariableHead(const GlobalVariable *GV) { + nl(Out) << "GlobalVariable* " << getCppName(GV); + if (is_inline) { + Out << " = mod->getGlobalVariable(mod->getContext(), "; printEscapedString(GV->getName()); + Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)"; + nl(Out) << "if (!" 
<< getCppName(GV) << ") {"; + in(); nl(Out) << getCppName(GV); + } + Out << " = new GlobalVariable(/*Module=*/*mod, "; + nl(Out) << "/*Type=*/"; + printCppName(GV->getType()->getElementType()); + Out << ","; + nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false"); + Out << ","; + nl(Out) << "/*Linkage=*/"; + printLinkageType(GV->getLinkage()); + Out << ","; + nl(Out) << "/*Initializer=*/0, "; + if (GV->hasInitializer()) { + Out << "// has initializer, specified below"; + } + nl(Out) << "/*Name=*/\""; + printEscapedString(GV->getName()); + Out << "\");"; + nl(Out); + + if (GV->hasSection()) { + printCppName(GV); + Out << "->setSection(\""; + printEscapedString(GV->getSection()); Out << "\");"; nl(Out); - - if (GV->hasSection()) { - printCppName(GV); - Out << "->setSection(\""; - printEscapedString(GV->getSection()); - Out << "\");"; - nl(Out); - } - if (GV->getAlignment()) { - printCppName(GV); - Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");"; - nl(Out); - } - if (GV->getVisibility() != GlobalValue::DefaultVisibility) { - printCppName(GV); - Out << "->setVisibility("; - printVisibilityType(GV->getVisibility()); - Out << ");"; - nl(Out); - } - if (GV->isThreadLocal()) { - printCppName(GV); - Out << "->setThreadLocal(true);"; - nl(Out); - } - if (is_inline) { - out(); Out << "}"; nl(Out); - } } - - void CppWriter::printVariableBody(const GlobalVariable *GV) { - if (GV->hasInitializer()) { - printCppName(GV); - Out << "->setInitializer("; - Out << getCppName(GV->getInitializer()) << ");"; - nl(Out); - } + if (GV->getAlignment()) { + printCppName(GV); + Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");"; + nl(Out); } + if (GV->getVisibility() != GlobalValue::DefaultVisibility) { + printCppName(GV); + Out << "->setVisibility("; + printVisibilityType(GV->getVisibility()); + Out << ");"; + nl(Out); + } + if (GV->isThreadLocal()) { + printCppName(GV); + Out << "->setThreadLocal(true);"; + nl(Out); + } + if (is_inline) { + out(); Out << "}"; nl(Out); + } +} - std::string CppWriter::getOpName(Value* V) { - if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end()) - return getCppName(V); - - // See if its alread in the map of forward references, if so just return the - // name we already set up for it - ForwardRefMap::const_iterator I = ForwardRefs.find(V); - if (I != ForwardRefs.end()) - return I->second; - - // This is a new forward reference. Generate a unique name for it - std::string result(std::string("fwdref_") + utostr(uniqueNum++)); - - // Yes, this is a hack. An Argument is the smallest instantiable value that - // we can make as a placeholder for the real value. We'll replace these - // Argument instances later. - Out << "Argument* " << result << " = new Argument(" - << getCppName(V->getType()) << ");"; +void CppWriter::printVariableBody(const GlobalVariable *GV) { + if (GV->hasInitializer()) { + printCppName(GV); + Out << "->setInitializer("; + Out << getCppName(GV->getInitializer()) << ");"; nl(Out); - ForwardRefs[V] = result; - return result; } +} - // printInstruction - This member is called for each Instruction in a function. - void CppWriter::printInstruction(const Instruction *I, - const std::string& bbname) { - std::string iName(getCppName(I)); +std::string CppWriter::getOpName(Value* V) { + if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end()) + return getCppName(V); - // Before we emit this instruction, we need to take care of generating any - // forward references. 
So, we get the names of all the operands in advance
- const unsigned Ops(I->getNumOperands());
- std::string* opNames = new std::string[Ops];
- for (unsigned i = 0; i < Ops; i++) {
- opNames[i] = getOpName(I->getOperand(i));
- }
+ // See if it's already in the map of forward references, if so just return the
+ // name we already set up for it
+ ForwardRefMap::const_iterator I = ForwardRefs.find(V);
+ if (I != ForwardRefs.end())
+ return I->second;
- switch (I->getOpcode()) {
- default:
- error("Invalid instruction");
- break;
+ // This is a new forward reference. Generate a unique name for it
+ std::string result(std::string("fwdref_") + utostr(uniqueNum++));
- case Instruction::Ret: {
- const ReturnInst* ret = cast<ReturnInst>(I);
- Out << "ReturnInst::Create(mod->getContext(), "
- << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
- break;
+ // Yes, this is a hack. An Argument is the smallest instantiable value that
+ // we can make as a placeholder for the real value. We'll replace these
+ // Argument instances later.
+ Out << "Argument* " << result << " = new Argument("
+ << getCppName(V->getType()) << ");";
+ nl(Out);
+ ForwardRefs[V] = result;
+ return result;
+}
+
+// printInstruction - This member is called for each Instruction in a function.
+void CppWriter::printInstruction(const Instruction *I,
+ const std::string& bbname) {
+ std::string iName(getCppName(I));
+
+ // Before we emit this instruction, we need to take care of generating any
+ // forward references. So, we get the names of all the operands in advance
+ const unsigned Ops(I->getNumOperands());
+ std::string* opNames = new std::string[Ops];
+ for (unsigned i = 0; i < Ops; i++)
+ opNames[i] = getOpName(I->getOperand(i));
+
+ switch (I->getOpcode()) {
+ default:
+ error("Invalid instruction");
+ break;
+
+ case Instruction::Ret: {
+ const ReturnInst* ret = cast<ReturnInst>(I);
+ Out << "ReturnInst::Create(mod->getContext(), "
+ << (ret->getReturnValue() ?
opNames[0] + ", " : "") << bbname << ");"; + break; + } + case Instruction::Br: { + const BranchInst* br = cast<BranchInst>(I); + Out << "BranchInst::Create(" ; + if (br->getNumOperands() == 3) { + Out << opNames[2] << ", " + << opNames[1] << ", " + << opNames[0] << ", "; + + } else if (br->getNumOperands() == 1) { + Out << opNames[0] << ", "; + } else { + error("Branch with 2 operands?"); } - case Instruction::Br: { - const BranchInst* br = cast<BranchInst>(I); - Out << "BranchInst::Create(" ; - if (br->getNumOperands() == 3 ) { - Out << opNames[2] << ", " - << opNames[1] << ", " - << opNames[0] << ", "; - - } else if (br->getNumOperands() == 1) { - Out << opNames[0] << ", "; - } else { - error("Branch with 2 operands?"); - } - Out << bbname << ");"; - break; + Out << bbname << ");"; + break; + } + case Instruction::Switch: { + const SwitchInst *SI = cast<SwitchInst>(I); + Out << "SwitchInst* " << iName << " = SwitchInst::Create(" + << opNames[0] << ", " + << opNames[1] << ", " + << SI->getNumCases() << ", " << bbname << ");"; + nl(Out); + for (unsigned i = 2; i != SI->getNumOperands(); i += 2) { + Out << iName << "->addCase(" + << opNames[i] << ", " + << opNames[i+1] << ");"; + nl(Out); } - case Instruction::Switch: { - const SwitchInst *SI = cast<SwitchInst>(I); - Out << "SwitchInst* " << iName << " = SwitchInst::Create(" - << opNames[0] << ", " - << opNames[1] << ", " - << SI->getNumCases() << ", " << bbname << ");"; + break; + } + case Instruction::IndirectBr: { + const IndirectBrInst *IBI = cast<IndirectBrInst>(I); + Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create(" + << opNames[0] << ", " << IBI->getNumDestinations() << ");"; + nl(Out); + for (unsigned i = 1; i != IBI->getNumOperands(); ++i) { + Out << iName << "->addDestination(" << opNames[i] << ");"; nl(Out); - for (unsigned i = 2; i != SI->getNumOperands(); i += 2) { - Out << iName << "->addCase(" - << opNames[i] << ", " - << opNames[i+1] << ");"; - nl(Out); - } - break; } - case Instruction::IndirectBr: { - const IndirectBrInst *IBI = cast<IndirectBrInst>(I); - Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create(" - << opNames[0] << ", " << IBI->getNumDestinations() << ");"; + break; + } + case Instruction::Invoke: { + const InvokeInst* inv = cast<InvokeInst>(I); + Out << "std::vector<Value*> " << iName << "_params;"; + nl(Out); + for (unsigned i = 0; i < inv->getNumArgOperands(); ++i) { + Out << iName << "_params.push_back(" + << getOpName(inv->getArgOperand(i)) << ");"; nl(Out); - for (unsigned i = 1; i != IBI->getNumOperands(); ++i) { - Out << iName << "->addDestination(" << opNames[i] << ");"; - nl(Out); - } - break; } - case Instruction::Invoke: { - const InvokeInst* inv = cast<InvokeInst>(I); - Out << "std::vector<Value*> " << iName << "_params;"; + // FIXME: This shouldn't use magic numbers -3, -2, and -1. 
+ Out << "InvokeInst *" << iName << " = InvokeInst::Create(" + << getOpName(inv->getCalledFunction()) << ", " + << getOpName(inv->getNormalDest()) << ", " + << getOpName(inv->getUnwindDest()) << ", " + << iName << "_params.begin(), " + << iName << "_params.end(), \""; + printEscapedString(inv->getName()); + Out << "\", " << bbname << ");"; + nl(Out) << iName << "->setCallingConv("; + printCallingConv(inv->getCallingConv()); + Out << ");"; + printAttributes(inv->getAttributes(), iName); + Out << iName << "->setAttributes(" << iName << "_PAL);"; + nl(Out); + break; + } + case Instruction::Unwind: { + Out << "new UnwindInst(" + << bbname << ");"; + break; + } + case Instruction::Unreachable: { + Out << "new UnreachableInst(" + << "mod->getContext(), " + << bbname << ");"; + break; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr:{ + Out << "BinaryOperator* " << iName << " = BinaryOperator::Create("; + switch (I->getOpcode()) { + case Instruction::Add: Out << "Instruction::Add"; break; + case Instruction::FAdd: Out << "Instruction::FAdd"; break; + case Instruction::Sub: Out << "Instruction::Sub"; break; + case Instruction::FSub: Out << "Instruction::FSub"; break; + case Instruction::Mul: Out << "Instruction::Mul"; break; + case Instruction::FMul: Out << "Instruction::FMul"; break; + case Instruction::UDiv:Out << "Instruction::UDiv"; break; + case Instruction::SDiv:Out << "Instruction::SDiv"; break; + case Instruction::FDiv:Out << "Instruction::FDiv"; break; + case Instruction::URem:Out << "Instruction::URem"; break; + case Instruction::SRem:Out << "Instruction::SRem"; break; + case Instruction::FRem:Out << "Instruction::FRem"; break; + case Instruction::And: Out << "Instruction::And"; break; + case Instruction::Or: Out << "Instruction::Or"; break; + case Instruction::Xor: Out << "Instruction::Xor"; break; + case Instruction::Shl: Out << "Instruction::Shl"; break; + case Instruction::LShr:Out << "Instruction::LShr"; break; + case Instruction::AShr:Out << "Instruction::AShr"; break; + default: Out << "Instruction::BadOpCode"; break; + } + Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; + printEscapedString(I->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::FCmp: { + Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", "; + switch (cast<FCmpInst>(I)->getPredicate()) { + case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break; + case FCmpInst::FCMP_OEQ : Out << "FCmpInst::FCMP_OEQ"; break; + case FCmpInst::FCMP_OGT : Out << "FCmpInst::FCMP_OGT"; break; + case FCmpInst::FCMP_OGE : Out << "FCmpInst::FCMP_OGE"; break; + case FCmpInst::FCMP_OLT : Out << "FCmpInst::FCMP_OLT"; break; + case FCmpInst::FCMP_OLE : Out << "FCmpInst::FCMP_OLE"; break; + case FCmpInst::FCMP_ONE : Out << "FCmpInst::FCMP_ONE"; break; + case FCmpInst::FCMP_ORD : Out << "FCmpInst::FCMP_ORD"; break; + case FCmpInst::FCMP_UNO : Out << "FCmpInst::FCMP_UNO"; break; + case FCmpInst::FCMP_UEQ : Out << "FCmpInst::FCMP_UEQ"; break; + case FCmpInst::FCMP_UGT : Out << "FCmpInst::FCMP_UGT"; break; + case FCmpInst::FCMP_UGE : Out << "FCmpInst::FCMP_UGE"; break; + case 
FCmpInst::FCMP_ULT : Out << "FCmpInst::FCMP_ULT"; break; + case FCmpInst::FCMP_ULE : Out << "FCmpInst::FCMP_ULE"; break; + case FCmpInst::FCMP_UNE : Out << "FCmpInst::FCMP_UNE"; break; + case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break; + default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break; + } + Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; + printEscapedString(I->getName()); + Out << "\");"; + break; + } + case Instruction::ICmp: { + Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", "; + switch (cast<ICmpInst>(I)->getPredicate()) { + case ICmpInst::ICMP_EQ: Out << "ICmpInst::ICMP_EQ"; break; + case ICmpInst::ICMP_NE: Out << "ICmpInst::ICMP_NE"; break; + case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break; + case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break; + case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break; + case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break; + case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break; + case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break; + case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break; + case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break; + default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break; + } + Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; + printEscapedString(I->getName()); + Out << "\");"; + break; + } + case Instruction::Alloca: { + const AllocaInst* allocaI = cast<AllocaInst>(I); + Out << "AllocaInst* " << iName << " = new AllocaInst(" + << getCppName(allocaI->getAllocatedType()) << ", "; + if (allocaI->isArrayAllocation()) + Out << opNames[0] << ", "; + Out << "\""; + printEscapedString(allocaI->getName()); + Out << "\", " << bbname << ");"; + if (allocaI->getAlignment()) + nl(Out) << iName << "->setAlignment(" + << allocaI->getAlignment() << ");"; + break; + } + case Instruction::Load: { + const LoadInst* load = cast<LoadInst>(I); + Out << "LoadInst* " << iName << " = new LoadInst(" + << opNames[0] << ", \""; + printEscapedString(load->getName()); + Out << "\", " << (load->isVolatile() ? "true" : "false" ) + << ", " << bbname << ");"; + break; + } + case Instruction::Store: { + const StoreInst* store = cast<StoreInst>(I); + Out << " new StoreInst(" + << opNames[0] << ", " + << opNames[1] << ", " + << (store->isVolatile() ? 
"true" : "false") + << ", " << bbname << ");"; + break; + } + case Instruction::GetElementPtr: { + const GetElementPtrInst* gep = cast<GetElementPtrInst>(I); + if (gep->getNumOperands() <= 2) { + Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" + << opNames[0]; + if (gep->getNumOperands() == 2) + Out << ", " << opNames[1]; + } else { + Out << "std::vector<Value*> " << iName << "_indices;"; nl(Out); - for (unsigned i = 0; i < inv->getNumOperands() - 3; ++i) { - Out << iName << "_params.push_back(" + for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { + Out << iName << "_indices.push_back(" << opNames[i] << ");"; nl(Out); } - Out << "InvokeInst *" << iName << " = InvokeInst::Create(" - << opNames[Ops - 3] << ", " - << opNames[Ops - 2] << ", " - << opNames[Ops - 1] << ", " - << iName << "_params.begin(), " << iName << "_params.end(), \""; - printEscapedString(inv->getName()); - Out << "\", " << bbname << ");"; - nl(Out) << iName << "->setCallingConv("; - printCallingConv(inv->getCallingConv()); - Out << ");"; - printAttributes(inv->getAttributes(), iName); - Out << iName << "->setAttributes(" << iName << "_PAL);"; + Out << "Instruction* " << iName << " = GetElementPtrInst::Create(" + << opNames[0] << ", " << iName << "_indices.begin(), " + << iName << "_indices.end()"; + } + Out << ", \""; + printEscapedString(gep->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::PHI: { + const PHINode* phi = cast<PHINode>(I); + + Out << "PHINode* " << iName << " = PHINode::Create(" + << getCppName(phi->getType()) << ", \""; + printEscapedString(phi->getName()); + Out << "\", " << bbname << ");"; + nl(Out) << iName << "->reserveOperandSpace(" + << phi->getNumIncomingValues() + << ");"; + nl(Out); + for (unsigned i = 0; i < phi->getNumOperands(); i+=2) { + Out << iName << "->addIncoming(" + << opNames[i] << ", " << opNames[i+1] << ");"; nl(Out); - break; - } - case Instruction::Unwind: { - Out << "new UnwindInst(" - << bbname << ");"; - break; - } - case Instruction::Unreachable: { - Out << "new UnreachableInst(" - << "mod->getContext(), " - << bbname << ");"; - break; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr:{ - Out << "BinaryOperator* " << iName << " = BinaryOperator::Create("; - switch (I->getOpcode()) { - case Instruction::Add: Out << "Instruction::Add"; break; - case Instruction::FAdd: Out << "Instruction::FAdd"; break; - case Instruction::Sub: Out << "Instruction::Sub"; break; - case Instruction::FSub: Out << "Instruction::FSub"; break; - case Instruction::Mul: Out << "Instruction::Mul"; break; - case Instruction::FMul: Out << "Instruction::FMul"; break; - case Instruction::UDiv:Out << "Instruction::UDiv"; break; - case Instruction::SDiv:Out << "Instruction::SDiv"; break; - case Instruction::FDiv:Out << "Instruction::FDiv"; break; - case Instruction::URem:Out << "Instruction::URem"; break; - case Instruction::SRem:Out << "Instruction::SRem"; break; - case Instruction::FRem:Out << "Instruction::FRem"; break; - case Instruction::And: Out << "Instruction::And"; break; - case Instruction::Or: Out << "Instruction::Or"; break; - case 
Instruction::Xor: Out << "Instruction::Xor"; break; - case Instruction::Shl: Out << "Instruction::Shl"; break; - case Instruction::LShr:Out << "Instruction::LShr"; break; - case Instruction::AShr:Out << "Instruction::AShr"; break; - default: Out << "Instruction::BadOpCode"; break; - } - Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; - printEscapedString(I->getName()); - Out << "\", " << bbname << ");"; - break; } - case Instruction::FCmp: { - Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", "; - switch (cast<FCmpInst>(I)->getPredicate()) { - case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break; - case FCmpInst::FCMP_OEQ : Out << "FCmpInst::FCMP_OEQ"; break; - case FCmpInst::FCMP_OGT : Out << "FCmpInst::FCMP_OGT"; break; - case FCmpInst::FCMP_OGE : Out << "FCmpInst::FCMP_OGE"; break; - case FCmpInst::FCMP_OLT : Out << "FCmpInst::FCMP_OLT"; break; - case FCmpInst::FCMP_OLE : Out << "FCmpInst::FCMP_OLE"; break; - case FCmpInst::FCMP_ONE : Out << "FCmpInst::FCMP_ONE"; break; - case FCmpInst::FCMP_ORD : Out << "FCmpInst::FCMP_ORD"; break; - case FCmpInst::FCMP_UNO : Out << "FCmpInst::FCMP_UNO"; break; - case FCmpInst::FCMP_UEQ : Out << "FCmpInst::FCMP_UEQ"; break; - case FCmpInst::FCMP_UGT : Out << "FCmpInst::FCMP_UGT"; break; - case FCmpInst::FCMP_UGE : Out << "FCmpInst::FCMP_UGE"; break; - case FCmpInst::FCMP_ULT : Out << "FCmpInst::FCMP_ULT"; break; - case FCmpInst::FCMP_ULE : Out << "FCmpInst::FCMP_ULE"; break; - case FCmpInst::FCMP_UNE : Out << "FCmpInst::FCMP_UNE"; break; - case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break; - default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break; - } - Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; - printEscapedString(I->getName()); - Out << "\");"; - break; - } - case Instruction::ICmp: { - Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", "; - switch (cast<ICmpInst>(I)->getPredicate()) { - case ICmpInst::ICMP_EQ: Out << "ICmpInst::ICMP_EQ"; break; - case ICmpInst::ICMP_NE: Out << "ICmpInst::ICMP_NE"; break; - case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break; - case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break; - case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break; - case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break; - case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break; - case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break; - case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break; - case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break; - default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break; - } - Out << ", " << opNames[0] << ", " << opNames[1] << ", \""; - printEscapedString(I->getName()); - Out << "\");"; - break; - } - case Instruction::Alloca: { - const AllocaInst* allocaI = cast<AllocaInst>(I); - Out << "AllocaInst* " << iName << " = new AllocaInst(" - << getCppName(allocaI->getAllocatedType()) << ", "; - if (allocaI->isArrayAllocation()) - Out << opNames[0] << ", "; - Out << "\""; - printEscapedString(allocaI->getName()); - Out << "\", " << bbname << ");"; - if (allocaI->getAlignment()) - nl(Out) << iName << "->setAlignment(" - << allocaI->getAlignment() << ");"; - break; - } - case Instruction::Load:{ - const LoadInst* load = cast<LoadInst>(I); - Out << "LoadInst* " << iName << " = new LoadInst(" - << opNames[0] << ", \""; - printEscapedString(load->getName()); - Out << "\", " << (load->isVolatile() ? 
"true" : "false" ) - << ", " << bbname << ");"; - break; - } - case Instruction::Store: { - const StoreInst* store = cast<StoreInst>(I); - Out << " new StoreInst(" - << opNames[0] << ", " - << opNames[1] << ", " - << (store->isVolatile() ? "true" : "false") - << ", " << bbname << ");"; - break; - } - case Instruction::GetElementPtr: { - const GetElementPtrInst* gep = cast<GetElementPtrInst>(I); - if (gep->getNumOperands() <= 2) { - Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" - << opNames[0]; - if (gep->getNumOperands() == 2) - Out << ", " << opNames[1]; - } else { - Out << "std::vector<Value*> " << iName << "_indices;"; - nl(Out); - for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { - Out << iName << "_indices.push_back(" - << opNames[i] << ");"; - nl(Out); - } - Out << "Instruction* " << iName << " = GetElementPtrInst::Create(" - << opNames[0] << ", " << iName << "_indices.begin(), " - << iName << "_indices.end()"; - } - Out << ", \""; - printEscapedString(gep->getName()); - Out << "\", " << bbname << ");"; - break; - } - case Instruction::PHI: { - const PHINode* phi = cast<PHINode>(I); - - Out << "PHINode* " << iName << " = PHINode::Create(" - << getCppName(phi->getType()) << ", \""; - printEscapedString(phi->getName()); - Out << "\", " << bbname << ");"; - nl(Out) << iName << "->reserveOperandSpace(" - << phi->getNumIncomingValues() - << ");"; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: { + const CastInst* cst = cast<CastInst>(I); + Out << "CastInst* " << iName << " = new "; + switch (I->getOpcode()) { + case Instruction::Trunc: Out << "TruncInst"; break; + case Instruction::ZExt: Out << "ZExtInst"; break; + case Instruction::SExt: Out << "SExtInst"; break; + case Instruction::FPTrunc: Out << "FPTruncInst"; break; + case Instruction::FPExt: Out << "FPExtInst"; break; + case Instruction::FPToUI: Out << "FPToUIInst"; break; + case Instruction::FPToSI: Out << "FPToSIInst"; break; + case Instruction::UIToFP: Out << "UIToFPInst"; break; + case Instruction::SIToFP: Out << "SIToFPInst"; break; + case Instruction::PtrToInt: Out << "PtrToIntInst"; break; + case Instruction::IntToPtr: Out << "IntToPtrInst"; break; + case Instruction::BitCast: Out << "BitCastInst"; break; + default: assert(!"Unreachable"); break; + } + Out << "(" << opNames[0] << ", " + << getCppName(cst->getType()) << ", \""; + printEscapedString(cst->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::Call: { + const CallInst* call = cast<CallInst>(I); + if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) { + Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get(" + << getCppName(ila->getFunctionType()) << ", \"" + << ila->getAsmString() << "\", \"" + << ila->getConstraintString() << "\"," + << (ila->hasSideEffects() ? 
"true" : "false") << ");"; nl(Out); - for (unsigned i = 0; i < phi->getNumOperands(); i+=2) { - Out << iName << "->addIncoming(" - << opNames[i] << ", " << opNames[i+1] << ");"; - nl(Out); - } - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: { - const CastInst* cst = cast<CastInst>(I); - Out << "CastInst* " << iName << " = new "; - switch (I->getOpcode()) { - case Instruction::Trunc: Out << "TruncInst"; break; - case Instruction::ZExt: Out << "ZExtInst"; break; - case Instruction::SExt: Out << "SExtInst"; break; - case Instruction::FPTrunc: Out << "FPTruncInst"; break; - case Instruction::FPExt: Out << "FPExtInst"; break; - case Instruction::FPToUI: Out << "FPToUIInst"; break; - case Instruction::FPToSI: Out << "FPToSIInst"; break; - case Instruction::UIToFP: Out << "UIToFPInst"; break; - case Instruction::SIToFP: Out << "SIToFPInst"; break; - case Instruction::PtrToInt: Out << "PtrToIntInst"; break; - case Instruction::IntToPtr: Out << "IntToPtrInst"; break; - case Instruction::BitCast: Out << "BitCastInst"; break; - default: assert(!"Unreachable"); break; - } - Out << "(" << opNames[0] << ", " - << getCppName(cst->getType()) << ", \""; - printEscapedString(cst->getName()); - Out << "\", " << bbname << ");"; - break; } - case Instruction::Call:{ - const CallInst* call = cast<CallInst>(I); - if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) { - Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get(" - << getCppName(ila->getFunctionType()) << ", \"" - << ila->getAsmString() << "\", \"" - << ila->getConstraintString() << "\"," - << (ila->hasSideEffects() ? "true" : "false") << ");"; - nl(Out); - } - if (call->getNumOperands() > 2) { - Out << "std::vector<Value*> " << iName << "_params;"; + if (call->getNumArgOperands() > 1) { + Out << "std::vector<Value*> " << iName << "_params;"; + nl(Out); + for (unsigned i = 0; i < call->getNumArgOperands(); ++i) { + Out << iName << "_params.push_back(" << opNames[i] << ");"; nl(Out); - for (unsigned i = 1; i < call->getNumOperands(); ++i) { - Out << iName << "_params.push_back(" << opNames[i] << ");"; - nl(Out); - } - Out << "CallInst* " << iName << " = CallInst::Create(" - << opNames[0] << ", " << iName << "_params.begin(), " - << iName << "_params.end(), \""; - } else if (call->getNumOperands() == 2) { - Out << "CallInst* " << iName << " = CallInst::Create(" - << opNames[0] << ", " << opNames[1] << ", \""; - } else { - Out << "CallInst* " << iName << " = CallInst::Create(" << opNames[0] - << ", \""; } - printEscapedString(call->getName()); - Out << "\", " << bbname << ");"; - nl(Out) << iName << "->setCallingConv("; - printCallingConv(call->getCallingConv()); - Out << ");"; - nl(Out) << iName << "->setTailCall(" - << (call->isTailCall() ? 
"true":"false"); - Out << ");"; - printAttributes(call->getAttributes(), iName); - Out << iName << "->setAttributes(" << iName << "_PAL);"; - nl(Out); - break; - } - case Instruction::Select: { - const SelectInst* sel = cast<SelectInst>(I); - Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create("; - Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \""; - printEscapedString(sel->getName()); - Out << "\", " << bbname << ");"; - break; - } - case Instruction::UserOp1: - /// FALL THROUGH - case Instruction::UserOp2: { - /// FIXME: What should be done here? - break; - } - case Instruction::VAArg: { - const VAArgInst* va = cast<VAArgInst>(I); - Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst(" - << opNames[0] << ", " << getCppName(va->getType()) << ", \""; - printEscapedString(va->getName()); - Out << "\", " << bbname << ");"; - break; - } - case Instruction::ExtractElement: { - const ExtractElementInst* eei = cast<ExtractElementInst>(I); - Out << "ExtractElementInst* " << getCppName(eei) - << " = new ExtractElementInst(" << opNames[0] - << ", " << opNames[1] << ", \""; - printEscapedString(eei->getName()); - Out << "\", " << bbname << ");"; - break; - } - case Instruction::InsertElement: { - const InsertElementInst* iei = cast<InsertElementInst>(I); - Out << "InsertElementInst* " << getCppName(iei) - << " = InsertElementInst::Create(" << opNames[0] - << ", " << opNames[1] << ", " << opNames[2] << ", \""; - printEscapedString(iei->getName()); - Out << "\", " << bbname << ");"; - break; - } - case Instruction::ShuffleVector: { - const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I); - Out << "ShuffleVectorInst* " << getCppName(svi) - << " = new ShuffleVectorInst(" << opNames[0] - << ", " << opNames[1] << ", " << opNames[2] << ", \""; - printEscapedString(svi->getName()); - Out << "\", " << bbname << ");"; - break; + Out << "CallInst* " << iName << " = CallInst::Create(" + << opNames[call->getNumArgOperands()] << ", " << iName << "_params.begin(), " + << iName << "_params.end(), \""; + } else if (call->getNumArgOperands() == 1) { + Out << "CallInst* " << iName << " = CallInst::Create(" + << opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \""; + } else { + Out << "CallInst* " << iName << " = CallInst::Create(" + << opNames[call->getNumArgOperands()] << ", \""; } - case Instruction::ExtractValue: { - const ExtractValueInst *evi = cast<ExtractValueInst>(I); - Out << "std::vector<unsigned> " << iName << "_indices;"; + printEscapedString(call->getName()); + Out << "\", " << bbname << ");"; + nl(Out) << iName << "->setCallingConv("; + printCallingConv(call->getCallingConv()); + Out << ");"; + nl(Out) << iName << "->setTailCall(" + << (call->isTailCall() ? "true" : "false"); + Out << ");"; + nl(Out); + printAttributes(call->getAttributes(), iName); + Out << iName << "->setAttributes(" << iName << "_PAL);"; + nl(Out); + break; + } + case Instruction::Select: { + const SelectInst* sel = cast<SelectInst>(I); + Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create("; + Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \""; + printEscapedString(sel->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::UserOp1: + /// FALL THROUGH + case Instruction::UserOp2: { + /// FIXME: What should be done here? 
+ break; + } + case Instruction::VAArg: { + const VAArgInst* va = cast<VAArgInst>(I); + Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst(" + << opNames[0] << ", " << getCppName(va->getType()) << ", \""; + printEscapedString(va->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::ExtractElement: { + const ExtractElementInst* eei = cast<ExtractElementInst>(I); + Out << "ExtractElementInst* " << getCppName(eei) + << " = new ExtractElementInst(" << opNames[0] + << ", " << opNames[1] << ", \""; + printEscapedString(eei->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::InsertElement: { + const InsertElementInst* iei = cast<InsertElementInst>(I); + Out << "InsertElementInst* " << getCppName(iei) + << " = InsertElementInst::Create(" << opNames[0] + << ", " << opNames[1] << ", " << opNames[2] << ", \""; + printEscapedString(iei->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::ShuffleVector: { + const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I); + Out << "ShuffleVectorInst* " << getCppName(svi) + << " = new ShuffleVectorInst(" << opNames[0] + << ", " << opNames[1] << ", " << opNames[2] << ", \""; + printEscapedString(svi->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::ExtractValue: { + const ExtractValueInst *evi = cast<ExtractValueInst>(I); + Out << "std::vector<unsigned> " << iName << "_indices;"; + nl(Out); + for (unsigned i = 0; i < evi->getNumIndices(); ++i) { + Out << iName << "_indices.push_back(" + << evi->idx_begin()[i] << ");"; nl(Out); - for (unsigned i = 0; i < evi->getNumIndices(); ++i) { - Out << iName << "_indices.push_back(" - << evi->idx_begin()[i] << ");"; - nl(Out); - } - Out << "ExtractValueInst* " << getCppName(evi) - << " = ExtractValueInst::Create(" << opNames[0] - << ", " - << iName << "_indices.begin(), " << iName << "_indices.end(), \""; - printEscapedString(evi->getName()); - Out << "\", " << bbname << ");"; - break; } - case Instruction::InsertValue: { - const InsertValueInst *ivi = cast<InsertValueInst>(I); - Out << "std::vector<unsigned> " << iName << "_indices;"; + Out << "ExtractValueInst* " << getCppName(evi) + << " = ExtractValueInst::Create(" << opNames[0] + << ", " + << iName << "_indices.begin(), " << iName << "_indices.end(), \""; + printEscapedString(evi->getName()); + Out << "\", " << bbname << ");"; + break; + } + case Instruction::InsertValue: { + const InsertValueInst *ivi = cast<InsertValueInst>(I); + Out << "std::vector<unsigned> " << iName << "_indices;"; + nl(Out); + for (unsigned i = 0; i < ivi->getNumIndices(); ++i) { + Out << iName << "_indices.push_back(" + << ivi->idx_begin()[i] << ");"; nl(Out); - for (unsigned i = 0; i < ivi->getNumIndices(); ++i) { - Out << iName << "_indices.push_back(" - << ivi->idx_begin()[i] << ");"; - nl(Out); - } - Out << "InsertValueInst* " << getCppName(ivi) - << " = InsertValueInst::Create(" << opNames[0] - << ", " << opNames[1] << ", " - << iName << "_indices.begin(), " << iName << "_indices.end(), \""; - printEscapedString(ivi->getName()); - Out << "\", " << bbname << ");"; - break; } + Out << "InsertValueInst* " << getCppName(ivi) + << " = InsertValueInst::Create(" << opNames[0] + << ", " << opNames[1] << ", " + << iName << "_indices.begin(), " << iName << "_indices.end(), \""; + printEscapedString(ivi->getName()); + Out << "\", " << bbname << ");"; + break; + } } DefinedValues.insert(I); nl(Out); delete [] opNames; } - // Print out the types, constants and declarations needed 
by one function - void CppWriter::printFunctionUses(const Function* F) { - nl(Out) << "// Type Definitions"; nl(Out); - if (!is_inline) { - // Print the function's return type - printType(F->getReturnType()); +// Print out the types, constants and declarations needed by one function +void CppWriter::printFunctionUses(const Function* F) { + nl(Out) << "// Type Definitions"; nl(Out); + if (!is_inline) { + // Print the function's return type + printType(F->getReturnType()); - // Print the function's function type - printType(F->getFunctionType()); + // Print the function's function type + printType(F->getFunctionType()); - // Print the types of each of the function's arguments - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - printType(AI->getType()); - } + // Print the types of each of the function's arguments + for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; ++AI) { + printType(AI->getType()); } + } - // Print type definitions for every type referenced by an instruction and - // make a note of any global values or constants that are referenced - SmallPtrSet<GlobalValue*,64> gvs; - SmallPtrSet<Constant*,64> consts; - for (Function::const_iterator BB = F->begin(), BE = F->end(); - BB != BE; ++BB){ - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - // Print the type of the instruction itself - printType(I->getType()); + // Print type definitions for every type referenced by an instruction and + // make a note of any global values or constants that are referenced + SmallPtrSet<GlobalValue*,64> gvs; + SmallPtrSet<Constant*,64> consts; + for (Function::const_iterator BB = F->begin(), BE = F->end(); + BB != BE; ++BB){ + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); + I != E; ++I) { + // Print the type of the instruction itself + printType(I->getType()); - // Print the type of each of the instruction's operands - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value* operand = I->getOperand(i); - printType(operand->getType()); - - // If the operand references a GVal or Constant, make a note of it - if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) { - gvs.insert(GV); - if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->hasInitializer()) - consts.insert(GVar->getInitializer()); - } else if (Constant* C = dyn_cast<Constant>(operand)) - consts.insert(C); - } + // Print the type of each of the instruction's operands + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value* operand = I->getOperand(i); + printType(operand->getType()); + + // If the operand references a GVal or Constant, make a note of it + if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) { + gvs.insert(GV); + if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->hasInitializer()) + consts.insert(GVar->getInitializer()); + } else if (Constant* C = dyn_cast<Constant>(operand)) + consts.insert(C); } } + } - // Print the function declarations for any functions encountered - nl(Out) << "// Function Declarations"; nl(Out); - for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end(); - I != E; ++I) { - if (Function* Fun = dyn_cast<Function>(*I)) { - if (!is_inline || Fun != F) - printFunctionHead(Fun); - } + // Print the function declarations for any functions encountered + nl(Out) << "// Function Declarations"; nl(Out); + for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end(); + I != E; ++I) { + if (Function* Fun = 
dyn_cast<Function>(*I)) {
+ if (!is_inline || Fun != F)
+ printFunctionHead(Fun);
}
+ }

- // Print the global variable declarations for any variables encountered
- nl(Out) << "// Global Variable Declarations"; nl(Out);
- for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
- I != E; ++I) {
- if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
- printVariableHead(F);
- }
+ // Print the global variable declarations for any variables encountered
+ nl(Out) << "// Global Variable Declarations"; nl(Out);
+ for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+ I != E; ++I) {
+ if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
+ printVariableHead(F);
+ }

- // Print the constants found
- nl(Out) << "// Constant Definitions"; nl(Out);
- for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
- E = consts.end(); I != E; ++I) {
- printConstant(*I);
- }
+ // Print the constants found
+ nl(Out) << "// Constant Definitions"; nl(Out);
+ for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
+ E = consts.end(); I != E; ++I) {
+ printConstant(*I);
+ }

- // Process the global variables definitions now that all the constants have
- // been emitted. These definitions just couple the gvars with their constant
- // initializers.
- nl(Out) << "// Global Variable Definitions"; nl(Out);
- for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
- I != E; ++I) {
- if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
- printVariableBody(GV);
- }
+ // Process the global variables definitions now that all the constants have
+ // been emitted. These definitions just couple the gvars with their constant
+ // initializers.
+ nl(Out) << "// Global Variable Definitions"; nl(Out);
+ for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+ I != E; ++I) {
+ if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
+ printVariableBody(GV);
}
+}

- void CppWriter::printFunctionHead(const Function* F) {
- nl(Out) << "Function* " << getCppName(F);
- if (is_inline) {
- Out << " = mod->getFunction(\"";
- printEscapedString(F->getName());
- Out << "\", " << getCppName(F->getFunctionType()) << ");";
- nl(Out) << "if (!" << getCppName(F) << ") {";
- nl(Out) << getCppName(F);
- }
- Out<< " = Function::Create(";
- nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
- nl(Out) << "/*Linkage=*/";
- printLinkageType(F->getLinkage());
- Out << ",";
- nl(Out) << "/*Name=*/\"";
+void CppWriter::printFunctionHead(const Function* F) {
+ nl(Out) << "Function* " << getCppName(F);
+ if (is_inline) {
+ Out << " = mod->getFunction(\"";
printEscapedString(F->getName());
- Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
- nl(Out,-1);
+ Out << "\", " << getCppName(F->getFunctionType()) << ");";
+ nl(Out) << "if (!" << getCppName(F) << ") {";
+ nl(Out) << getCppName(F);
+ }
+ Out << " = Function::Create(";
+ nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
+ nl(Out) << "/*Linkage=*/";
+ printLinkageType(F->getLinkage());
+ Out << ",";
+ nl(Out) << "/*Name=*/\"";
+ printEscapedString(F->getName());
+ Out << "\", mod); " << (F->isDeclaration()? 
"// (external, no body)" : ""); + nl(Out,-1); + printCppName(F); + Out << "->setCallingConv("; + printCallingConv(F->getCallingConv()); + Out << ");"; + nl(Out); + if (F->hasSection()) { + printCppName(F); + Out << "->setSection(\"" << F->getSection() << "\");"; + nl(Out); + } + if (F->getAlignment()) { + printCppName(F); + Out << "->setAlignment(" << F->getAlignment() << ");"; + nl(Out); + } + if (F->getVisibility() != GlobalValue::DefaultVisibility) { printCppName(F); - Out << "->setCallingConv("; - printCallingConv(F->getCallingConv()); + Out << "->setVisibility("; + printVisibilityType(F->getVisibility()); Out << ");"; nl(Out); - if (F->hasSection()) { - printCppName(F); - Out << "->setSection(\"" << F->getSection() << "\");"; - nl(Out); - } - if (F->getAlignment()) { - printCppName(F); - Out << "->setAlignment(" << F->getAlignment() << ");"; - nl(Out); - } - if (F->getVisibility() != GlobalValue::DefaultVisibility) { - printCppName(F); - Out << "->setVisibility("; - printVisibilityType(F->getVisibility()); - Out << ");"; - nl(Out); - } - if (F->hasGC()) { - printCppName(F); - Out << "->setGC(\"" << F->getGC() << "\");"; - nl(Out); - } - if (is_inline) { - Out << "}"; - nl(Out); - } - printAttributes(F->getAttributes(), getCppName(F)); + } + if (F->hasGC()) { printCppName(F); - Out << "->setAttributes(" << getCppName(F) << "_PAL);"; + Out << "->setGC(\"" << F->getGC() << "\");"; nl(Out); } + if (is_inline) { + Out << "}"; + nl(Out); + } + printAttributes(F->getAttributes(), getCppName(F)); + printCppName(F); + Out << "->setAttributes(" << getCppName(F) << "_PAL);"; + nl(Out); +} - void CppWriter::printFunctionBody(const Function *F) { - if (F->isDeclaration()) - return; // external functions have no bodies. - - // Clear the DefinedValues and ForwardRefs maps because we can't have - // cross-function forward refs - ForwardRefs.clear(); - DefinedValues.clear(); +void CppWriter::printFunctionBody(const Function *F) { + if (F->isDeclaration()) + return; // external functions have no bodies. - // Create all the argument values - if (!is_inline) { - if (!F->arg_empty()) { - Out << "Function::arg_iterator args = " << getCppName(F) - << "->arg_begin();"; - nl(Out); - } - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - Out << "Value* " << getCppName(AI) << " = args++;"; - nl(Out); - if (AI->hasName()) { - Out << getCppName(AI) << "->setName(\"" << AI->getName() << "\");"; - nl(Out); - } - } - } + // Clear the DefinedValues and ForwardRefs maps because we can't have + // cross-function forward refs + ForwardRefs.clear(); + DefinedValues.clear(); - // Create all the basic blocks - nl(Out); - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); - Out << "BasicBlock* " << bbname << - " = BasicBlock::Create(mod->getContext(), \""; - if (BI->hasName()) - printEscapedString(BI->getName()); - Out << "\"," << getCppName(BI->getParent()) << ",0);"; + // Create all the argument values + if (!is_inline) { + if (!F->arg_empty()) { + Out << "Function::arg_iterator args = " << getCppName(F) + << "->arg_begin();"; nl(Out); } - - // Output all of its basic blocks... 
for the function - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); - nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")"; + for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; ++AI) { + Out << "Value* " << getCppName(AI) << " = args++;"; nl(Out); - - // Output all of the instructions in the basic block... - for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); - I != E; ++I) { - printInstruction(I,bbname); + if (AI->hasName()) { + Out << getCppName(AI) << "->setName(\"" << AI->getName() << "\");"; + nl(Out); } } + } - // Loop over the ForwardRefs and resolve them now that all instructions - // are generated. - if (!ForwardRefs.empty()) { - nl(Out) << "// Resolve Forward References"; - nl(Out); - } + // Create all the basic blocks + nl(Out); + for (Function::const_iterator BI = F->begin(), BE = F->end(); + BI != BE; ++BI) { + std::string bbname(getCppName(BI)); + Out << "BasicBlock* " << bbname << + " = BasicBlock::Create(mod->getContext(), \""; + if (BI->hasName()) + printEscapedString(BI->getName()); + Out << "\"," << getCppName(BI->getParent()) << ",0);"; + nl(Out); + } - while (!ForwardRefs.empty()) { - ForwardRefMap::iterator I = ForwardRefs.begin(); - Out << I->second << "->replaceAllUsesWith(" - << getCppName(I->first) << "); delete " << I->second << ";"; - nl(Out); - ForwardRefs.erase(I); + // Output all of its basic blocks... for the function + for (Function::const_iterator BI = F->begin(), BE = F->end(); + BI != BE; ++BI) { + std::string bbname(getCppName(BI)); + nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")"; + nl(Out); + + // Output all of the instructions in the basic block... + for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); + I != E; ++I) { + printInstruction(I,bbname); } } - void CppWriter::printInline(const std::string& fname, - const std::string& func) { - const Function* F = TheModule->getFunction(func); - if (!F) { - error(std::string("Function '") + func + "' not found in input module"); - return; - } - if (F->isDeclaration()) { - error(std::string("Function '") + func + "' is external!"); - return; - } - nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *" - << getCppName(F); - unsigned arg_count = 1; - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - Out << ", Value* arg_" << arg_count; - } - Out << ") {"; + // Loop over the ForwardRefs and resolve them now that all instructions + // are generated. + if (!ForwardRefs.empty()) { + nl(Out) << "// Resolve Forward References"; nl(Out); - is_inline = true; - printFunctionUses(F); - printFunctionBody(F); - is_inline = false; - Out << "return " << getCppName(F->begin()) << ";"; - nl(Out) << "}"; + } + + while (!ForwardRefs.empty()) { + ForwardRefMap::iterator I = ForwardRefs.begin(); + Out << I->second << "->replaceAllUsesWith(" + << getCppName(I->first) << "); delete " << I->second << ";"; nl(Out); + ForwardRefs.erase(I); } +} - void CppWriter::printModuleBody() { - // Print out all the type definitions - nl(Out) << "// Type Definitions"; nl(Out); - printTypes(TheModule); - - // Functions can call each other and global variables can reference them so - // define all the functions first before emitting their function bodies. 
- nl(Out) << "// Function Declarations"; nl(Out); - for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); - I != E; ++I) - printFunctionHead(I); - - // Process the global variables declarations. We can't initialze them until - // after the constants are printed so just print a header for each global - nl(Out) << "// Global Variable Declarations\n"; nl(Out); - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) { - printVariableHead(I); - } +void CppWriter::printInline(const std::string& fname, + const std::string& func) { + const Function* F = TheModule->getFunction(func); + if (!F) { + error(std::string("Function '") + func + "' not found in input module"); + return; + } + if (F->isDeclaration()) { + error(std::string("Function '") + func + "' is external!"); + return; + } + nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *" + << getCppName(F); + unsigned arg_count = 1; + for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; ++AI) { + Out << ", Value* arg_" << arg_count; + } + Out << ") {"; + nl(Out); + is_inline = true; + printFunctionUses(F); + printFunctionBody(F); + is_inline = false; + Out << "return " << getCppName(F->begin()) << ";"; + nl(Out) << "}"; + nl(Out); +} - // Print out all the constants definitions. Constants don't recurse except - // through GlobalValues. All GlobalValues have been declared at this point - // so we can proceed to generate the constants. - nl(Out) << "// Constant Definitions"; nl(Out); - printConstants(TheModule); - - // Process the global variables definitions now that all the constants have - // been emitted. These definitions just couple the gvars with their constant - // initializers. - nl(Out) << "// Global Variable Definitions"; nl(Out); - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) { - printVariableBody(I); - } +void CppWriter::printModuleBody() { + // Print out all the type definitions + nl(Out) << "// Type Definitions"; nl(Out); + printTypes(TheModule); + + // Functions can call each other and global variables can reference them so + // define all the functions first before emitting their function bodies. + nl(Out) << "// Function Declarations"; nl(Out); + for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); + I != E; ++I) + printFunctionHead(I); + + // Process the global variables declarations. We can't initialze them until + // after the constants are printed so just print a header for each global + nl(Out) << "// Global Variable Declarations\n"; nl(Out); + for (Module::const_global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) { + printVariableHead(I); + } - // Finally, we can safely put out all of the function bodies. - nl(Out) << "// Function Definitions"; nl(Out); - for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); - I != E; ++I) { - if (!I->isDeclaration()) { - nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I) - << ")"; - nl(Out) << "{"; - nl(Out,1); - printFunctionBody(I); - nl(Out,-1) << "}"; - nl(Out); - } - } + // Print out all the constants definitions. Constants don't recurse except + // through GlobalValues. All GlobalValues have been declared at this point + // so we can proceed to generate the constants. 
+ nl(Out) << "// Constant Definitions"; nl(Out); + printConstants(TheModule); + + // Process the global variables definitions now that all the constants have + // been emitted. These definitions just couple the gvars with their constant + // initializers. + nl(Out) << "// Global Variable Definitions"; nl(Out); + for (Module::const_global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) { + printVariableBody(I); } - void CppWriter::printProgram(const std::string& fname, - const std::string& mName) { - Out << "#include <llvm/LLVMContext.h>\n"; - Out << "#include <llvm/Module.h>\n"; - Out << "#include <llvm/DerivedTypes.h>\n"; - Out << "#include <llvm/Constants.h>\n"; - Out << "#include <llvm/GlobalVariable.h>\n"; - Out << "#include <llvm/Function.h>\n"; - Out << "#include <llvm/CallingConv.h>\n"; - Out << "#include <llvm/BasicBlock.h>\n"; - Out << "#include <llvm/Instructions.h>\n"; - Out << "#include <llvm/InlineAsm.h>\n"; - Out << "#include <llvm/Support/FormattedStream.h>\n"; - Out << "#include <llvm/Support/MathExtras.h>\n"; - Out << "#include <llvm/Pass.h>\n"; - Out << "#include <llvm/PassManager.h>\n"; - Out << "#include <llvm/ADT/SmallVector.h>\n"; - Out << "#include <llvm/Analysis/Verifier.h>\n"; - Out << "#include <llvm/Assembly/PrintModulePass.h>\n"; - Out << "#include <algorithm>\n"; - Out << "using namespace llvm;\n\n"; - Out << "Module* " << fname << "();\n\n"; - Out << "int main(int argc, char**argv) {\n"; - Out << " Module* Mod = " << fname << "();\n"; - Out << " verifyModule(*Mod, PrintMessageAction);\n"; - Out << " PassManager PM;\n"; - Out << " PM.add(createPrintModulePass(&outs()));\n"; - Out << " PM.run(*Mod);\n"; - Out << " return 0;\n"; - Out << "}\n\n"; - printModule(fname,mName); - } - - void CppWriter::printModule(const std::string& fname, - const std::string& mName) { - nl(Out) << "Module* " << fname << "() {"; - nl(Out,1) << "// Module Construction"; - nl(Out) << "Module* mod = new Module(\""; - printEscapedString(mName); - Out << "\", getGlobalContext());"; - if (!TheModule->getTargetTriple().empty()) { - nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");"; - } - if (!TheModule->getTargetTriple().empty()) { - nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple() - << "\");"; + // Finally, we can safely put out all of the function bodies. 
+ nl(Out) << "// Function Definitions"; nl(Out); + for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); + I != E; ++I) { + if (!I->isDeclaration()) { + nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I) + << ")"; + nl(Out) << "{"; + nl(Out,1); + printFunctionBody(I); + nl(Out,-1) << "}"; + nl(Out); } + } +} - if (!TheModule->getModuleInlineAsm().empty()) { - nl(Out) << "mod->setModuleInlineAsm(\""; - printEscapedString(TheModule->getModuleInlineAsm()); - Out << "\");"; - } - nl(Out); +void CppWriter::printProgram(const std::string& fname, + const std::string& mName) { + Out << "#include <llvm/LLVMContext.h>\n"; + Out << "#include <llvm/Module.h>\n"; + Out << "#include <llvm/DerivedTypes.h>\n"; + Out << "#include <llvm/Constants.h>\n"; + Out << "#include <llvm/GlobalVariable.h>\n"; + Out << "#include <llvm/Function.h>\n"; + Out << "#include <llvm/CallingConv.h>\n"; + Out << "#include <llvm/BasicBlock.h>\n"; + Out << "#include <llvm/Instructions.h>\n"; + Out << "#include <llvm/InlineAsm.h>\n"; + Out << "#include <llvm/Support/FormattedStream.h>\n"; + Out << "#include <llvm/Support/MathExtras.h>\n"; + Out << "#include <llvm/Pass.h>\n"; + Out << "#include <llvm/PassManager.h>\n"; + Out << "#include <llvm/ADT/SmallVector.h>\n"; + Out << "#include <llvm/Analysis/Verifier.h>\n"; + Out << "#include <llvm/Assembly/PrintModulePass.h>\n"; + Out << "#include <algorithm>\n"; + Out << "using namespace llvm;\n\n"; + Out << "Module* " << fname << "();\n\n"; + Out << "int main(int argc, char**argv) {\n"; + Out << " Module* Mod = " << fname << "();\n"; + Out << " verifyModule(*Mod, PrintMessageAction);\n"; + Out << " PassManager PM;\n"; + Out << " PM.add(createPrintModulePass(&outs()));\n"; + Out << " PM.run(*Mod);\n"; + Out << " return 0;\n"; + Out << "}\n\n"; + printModule(fname,mName); +} - // Loop over the dependent libraries and emit them. - Module::lib_iterator LI = TheModule->lib_begin(); - Module::lib_iterator LE = TheModule->lib_end(); - while (LI != LE) { - Out << "mod->addLibrary(\"" << *LI << "\");"; - nl(Out); - ++LI; - } - printModuleBody(); - nl(Out) << "return mod;"; - nl(Out,-1) << "}"; +void CppWriter::printModule(const std::string& fname, + const std::string& mName) { + nl(Out) << "Module* " << fname << "() {"; + nl(Out,1) << "// Module Construction"; + nl(Out) << "Module* mod = new Module(\""; + printEscapedString(mName); + Out << "\", getGlobalContext());"; + if (!TheModule->getTargetTriple().empty()) { + nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");"; + } + if (!TheModule->getTargetTriple().empty()) { + nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple() + << "\");"; + } + + if (!TheModule->getModuleInlineAsm().empty()) { + nl(Out) << "mod->setModuleInlineAsm(\""; + printEscapedString(TheModule->getModuleInlineAsm()); + Out << "\");"; + } + nl(Out); + + // Loop over the dependent libraries and emit them. 
+ Module::lib_iterator LI = TheModule->lib_begin(); + Module::lib_iterator LE = TheModule->lib_end(); + while (LI != LE) { + Out << "mod->addLibrary(\"" << *LI << "\");"; nl(Out); + ++LI; } + printModuleBody(); + nl(Out) << "return mod;"; + nl(Out,-1) << "}"; + nl(Out); +} + +void CppWriter::printContents(const std::string& fname, + const std::string& mName) { + Out << "\nModule* " << fname << "(Module *mod) {\n"; + Out << "\nmod->setModuleIdentifier(\""; + printEscapedString(mName); + Out << "\");\n"; + printModuleBody(); + Out << "\nreturn mod;\n"; + Out << "\n}\n"; +} - void CppWriter::printContents(const std::string& fname, - const std::string& mName) { - Out << "\nModule* " << fname << "(Module *mod) {\n"; - Out << "\nmod->setModuleIdentifier(\""; - printEscapedString(mName); - Out << "\");\n"; - printModuleBody(); - Out << "\nreturn mod;\n"; - Out << "\n}\n"; +void CppWriter::printFunction(const std::string& fname, + const std::string& funcName) { + const Function* F = TheModule->getFunction(funcName); + if (!F) { + error(std::string("Function '") + funcName + "' not found in input module"); + return; } + Out << "\nFunction* " << fname << "(Module *mod) {\n"; + printFunctionUses(F); + printFunctionHead(F); + printFunctionBody(F); + Out << "return " << getCppName(F) << ";\n"; + Out << "}\n"; +} - void CppWriter::printFunction(const std::string& fname, - const std::string& funcName) { - const Function* F = TheModule->getFunction(funcName); - if (!F) { - error(std::string("Function '") + funcName + "' not found in input module"); - return; - } - Out << "\nFunction* " << fname << "(Module *mod) {\n"; - printFunctionUses(F); - printFunctionHead(F); - printFunctionBody(F); - Out << "return " << getCppName(F) << ";\n"; - Out << "}\n"; - } - - void CppWriter::printFunctions() { - const Module::FunctionListType &funcs = TheModule->getFunctionList(); - Module::const_iterator I = funcs.begin(); - Module::const_iterator IE = funcs.end(); - - for (; I != IE; ++I) { - const Function &func = *I; - if (!func.isDeclaration()) { - std::string name("define_"); - name += func.getName(); - printFunction(name, func.getName()); - } +void CppWriter::printFunctions() { + const Module::FunctionListType &funcs = TheModule->getFunctionList(); + Module::const_iterator I = funcs.begin(); + Module::const_iterator IE = funcs.end(); + + for (; I != IE; ++I) { + const Function &func = *I; + if (!func.isDeclaration()) { + std::string name("define_"); + name += func.getName(); + printFunction(name, func.getName()); } } +} - void CppWriter::printVariable(const std::string& fname, - const std::string& varName) { - const GlobalVariable* GV = TheModule->getNamedGlobal(varName); +void CppWriter::printVariable(const std::string& fname, + const std::string& varName) { + const GlobalVariable* GV = TheModule->getNamedGlobal(varName); - if (!GV) { - error(std::string("Variable '") + varName + "' not found in input module"); - return; - } - Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n"; - printVariableUses(GV); - printVariableHead(GV); - printVariableBody(GV); - Out << "return " << getCppName(GV) << ";\n"; - Out << "}\n"; - } - - void CppWriter::printType(const std::string& fname, - const std::string& typeName) { - const Type* Ty = TheModule->getTypeByName(typeName); - if (!Ty) { - error(std::string("Type '") + typeName + "' not found in input module"); - return; - } - Out << "\nType* " << fname << "(Module *mod) {\n"; - printType(Ty); - Out << "return " << getCppName(Ty) << ";\n"; - Out << "}\n"; - } - - bool 
CppWriter::runOnModule(Module &M) { - TheModule = &M; - - // Emit a header - Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n"; - - // Get the name of the function we're supposed to generate - std::string fname = FuncName.getValue(); - - // Get the name of the thing we are to generate - std::string tgtname = NameToGenerate.getValue(); - if (GenerationType == GenModule || - GenerationType == GenContents || - GenerationType == GenProgram || - GenerationType == GenFunctions) { - if (tgtname == "!bad!") { - if (M.getModuleIdentifier() == "-") - tgtname = "<stdin>"; - else - tgtname = M.getModuleIdentifier(); - } - } else if (tgtname == "!bad!") - error("You must use the -for option with -gen-{function,variable,type}"); - - switch (WhatToGenerate(GenerationType)) { - case GenProgram: - if (fname.empty()) - fname = "makeLLVMModule"; - printProgram(fname,tgtname); - break; - case GenModule: - if (fname.empty()) - fname = "makeLLVMModule"; - printModule(fname,tgtname); - break; - case GenContents: - if (fname.empty()) - fname = "makeLLVMModuleContents"; - printContents(fname,tgtname); - break; - case GenFunction: - if (fname.empty()) - fname = "makeLLVMFunction"; - printFunction(fname,tgtname); - break; - case GenFunctions: - printFunctions(); - break; - case GenInline: - if (fname.empty()) - fname = "makeLLVMInline"; - printInline(fname,tgtname); - break; - case GenVariable: - if (fname.empty()) - fname = "makeLLVMVariable"; - printVariable(fname,tgtname); - break; - case GenType: - if (fname.empty()) - fname = "makeLLVMType"; - printType(fname,tgtname); - break; - default: - error("Invalid generation option"); - } + if (!GV) { + error(std::string("Variable '") + varName + "' not found in input module"); + return; + } + Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n"; + printVariableUses(GV); + printVariableHead(GV); + printVariableBody(GV); + Out << "return " << getCppName(GV) << ";\n"; + Out << "}\n"; +} - return false; +void CppWriter::printType(const std::string& fname, + const std::string& typeName) { + const Type* Ty = TheModule->getTypeByName(typeName); + if (!Ty) { + error(std::string("Type '") + typeName + "' not found in input module"); + return; } + Out << "\nType* " << fname << "(Module *mod) {\n"; + printType(Ty); + Out << "return " << getCppName(Ty) << ";\n"; + Out << "}\n"; +} + +bool CppWriter::runOnModule(Module &M) { + TheModule = &M; + + // Emit a header + Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n"; + + // Get the name of the function we're supposed to generate + std::string fname = FuncName.getValue(); + + // Get the name of the thing we are to generate + std::string tgtname = NameToGenerate.getValue(); + if (GenerationType == GenModule || + GenerationType == GenContents || + GenerationType == GenProgram || + GenerationType == GenFunctions) { + if (tgtname == "!bad!") { + if (M.getModuleIdentifier() == "-") + tgtname = "<stdin>"; + else + tgtname = M.getModuleIdentifier(); + } + } else if (tgtname == "!bad!") + error("You must use the -for option with -gen-{function,variable,type}"); + + switch (WhatToGenerate(GenerationType)) { + case GenProgram: + if (fname.empty()) + fname = "makeLLVMModule"; + printProgram(fname,tgtname); + break; + case GenModule: + if (fname.empty()) + fname = "makeLLVMModule"; + printModule(fname,tgtname); + break; + case GenContents: + if (fname.empty()) + fname = "makeLLVMModuleContents"; + printContents(fname,tgtname); + break; + case GenFunction: + if (fname.empty()) + fname = "makeLLVMFunction"; + 
printFunction(fname,tgtname);
+ break;
+ case GenFunctions:
+ printFunctions();
+ break;
+ case GenInline:
+ if (fname.empty())
+ fname = "makeLLVMInline";
+ printInline(fname,tgtname);
+ break;
+ case GenVariable:
+ if (fname.empty())
+ fname = "makeLLVMVariable";
+ printVariable(fname,tgtname);
+ break;
+ case GenType:
+ if (fname.empty())
+ fname = "makeLLVMType";
+ printType(fname,tgtname);
+ break;
+ default:
+ error("Invalid generation option");
+ }
+
+ return false;
}

char CppWriter::ID = 0;
diff --git a/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp
index e42e9b3..b6e4d65 100644
--- a/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp
+++ b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp
@@ -145,8 +145,9 @@ void MBlazeAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
const MachineFrameInfo *MFI = MF->getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned RegNum = MBlazeRegisterInfo::getRegisterNumbering(CSI[i].getReg());
- if (CSI[i].getRegClass() == MBlaze::CPURegsRegisterClass)
+ unsigned Reg = CSI[i].getReg();
+ unsigned RegNum = MBlazeRegisterInfo::getRegisterNumbering(Reg);
+ if (MBlaze::CPURegsRegisterClass->contains(Reg))
CPUBitmask |= (1 << RegNum);
}

diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 23889b1..1730b68 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -234,6 +234,24 @@ MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineRegisterInfo &R = F->getRegInfo();
MachineBasicBlock *loop = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *finish = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, loop);
+ F->insert(It, finish);
+
+ // Update machine-CFG edges by transferring all successors and
+ // remaining instructions from the current block to the new block which
+ // will contain the Phi node for the select.
+ finish->splice(finish->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ finish->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(loop);
+ BB->addSuccessor(finish);
+
+ // Next, add the finish block as a successor of the loop block
+ loop->addSuccessor(finish);
+ loop->addSuccessor(loop);

unsigned IAMT = R.createVirtualRegister(MBlaze::CPURegsRegisterClass);
BuildMI(BB, dl, TII->get(MBlaze::ANDI), IAMT)
@@ -249,26 +267,6 @@ MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(IAMT)
.addMBB(finish);

- F->insert(It, loop);
- F->insert(It, finish);
-
- // Update machine-CFG edges by first adding all successors of the current
- // block to the new block which will contain the Phi node for the select.
- for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
- e = BB->succ_end(); i != e; ++i)
- finish->addSuccessor(*i);
-
- // Next, remove all successors of the current block, and add the true
- // and fallthrough blocks as its successors. 
- while(!BB->succ_empty())
- BB->removeSuccessor(BB->succ_begin());
- BB->addSuccessor(loop);
- BB->addSuccessor(finish);
-
- // Next, add the finish block as a successor of the loop block
- loop->addSuccessor(finish);
- loop->addSuccessor(loop);
-
unsigned DST = R.createVirtualRegister(MBlaze::CPURegsRegisterClass);
unsigned NDST = R.createVirtualRegister(MBlaze::CPURegsRegisterClass);
BuildMI(loop, dl, TII->get(MBlaze::PHI), DST)
@@ -298,12 +296,13 @@ MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
.addReg(NAMT)
.addMBB(loop);

- BuildMI(finish, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
+ BuildMI(*finish, finish->begin(), dl,
+ TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
.addReg(IVAL).addMBB(BB)
.addReg(NDST).addMBB(loop);

// The pseudo instruction is no longer needed so remove it
- F->DeleteMachineInstr(MI);
+ MI->eraseFromParent();

return finish;
}

@@ -338,27 +337,23 @@ MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case MBlazeCC::LE: Opc = MBlaze::BGTID; break;
}

- BuildMI(BB, dl, TII->get(Opc))
- .addReg(MI->getOperand(3).getReg())
- .addMBB(dneBB);
-
F->insert(It, flsBB);
F->insert(It, dneBB);

- // Update machine-CFG edges by first adding all successors of the current
- // block to the new block which will contain the Phi node for the select.
- for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
- e = BB->succ_end(); i != e; ++i)
- dneBB->addSuccessor(*i);
+ // Transfer the remainder of BB and its successor edges to dneBB.
+ dneBB->splice(dneBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ dneBB->transferSuccessorsAndUpdatePHIs(BB);

- // Next, remove all successors of the current block, and add the true
- // and fallthrough blocks as its successors.
- while(!BB->succ_empty())
- BB->removeSuccessor(BB->succ_begin());
BB->addSuccessor(flsBB);
BB->addSuccessor(dneBB);
flsBB->addSuccessor(dneBB);

+ BuildMI(BB, dl, TII->get(Opc))
+ .addReg(MI->getOperand(3).getReg())
+ .addMBB(dneBB);
+
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
@@ -366,11 +361,12 @@ MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// .addReg(MI->getOperand(1).getReg()).addMBB(flsBB)
// .addReg(MI->getOperand(2).getReg()).addMBB(BB);

- BuildMI(dneBB, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
+ BuildMI(*dneBB, dneBB->begin(), dl,
+ TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
.addReg(MI->getOperand(2).getReg()).addMBB(flsBB)
.addReg(MI->getOperand(1).getReg()).addMBB(BB);

- F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ MI->eraseFromParent(); // The pseudo instruction is gone now. 
return dneBB; } } @@ -408,7 +404,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { // FIXME there isn't actually debug info here DebugLoc dl = Op.getDebugLoc(); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, GA); } @@ -439,10 +435,8 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { SDValue MBlazeTargetLowering:: LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { SDValue ResNode; - EVT PtrVT = Op.getValueType(); ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op); const Constant *C = N->getConstVal(); - SDValue Zero = DAG.getConstant(0, PtrVT); DebugLoc dl = Op.getDebugLoc(); SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), @@ -531,6 +525,7 @@ SDValue MBlazeTargetLowering:: LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -562,7 +557,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -590,7 +585,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Create the frame index object for this incoming parameter LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset()); int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, - LastArgStackLoc, true, false); + LastArgStackLoc, true); SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy()); @@ -623,7 +618,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // node so that legalize doesn't hack it. unsigned char OpFlag = MBlazeII::MO_NO_FLAG; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy(), 0, OpFlag); else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(S->getSymbol(), @@ -779,7 +774,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // offset on PEI::calculateFrameObjectOffsets. // Arguments are always 32-bit. 
unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, 0, true, false); + int FI = MFI->CreateFixedObject(ArgSize, 0, true); MBlazeFI->recordLoadArgsFI(FI, -(ArgSize+ (FirstStackArgLoc + VA.getLocMemOffset()))); @@ -810,7 +805,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, unsigned LiveReg = MF.addLiveIn(Reg, RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32); - int FI = MFI->CreateFixedObject(4, 0, true, false); + int FI = MFI->CreateFixedObject(4, 0, true); MBlazeFI->recordStoreVarArgsFI(FI, -(4+(StackLoc*4))); SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, NULL, 0, @@ -841,6 +836,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, SDValue MBlazeTargetLowering:: LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of // the return value to a location @@ -869,7 +865,7 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); // guarantee that all emitted copies are // stuck together, avoiding something bad diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h index 9f9ac89..5ec2563 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.h +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -109,6 +109,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -117,6 +118,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; virtual MachineBasicBlock * diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index 4c4d86b..6ff5825 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -110,15 +110,13 @@ insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(MBlaze::NOP)); } -bool MBlazeInstrInfo:: -copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { +void MBlazeInstrInfo:: +copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { llvm::BuildMI(MBB, I, DL, get(MBlaze::ADD), DestReg) - .addReg(SrcReg).addReg(MBlaze::R0); - return true; + .addReg(SrcReg, getKillRegState(KillSrc)).addReg(MBlaze::R0); } void MBlazeInstrInfo:: @@ -141,54 +139,17 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(0).addFrameIndex(FI); } -MachineInstr *MBlazeInstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - MachineInstr *NewMI = NULL; - - switch 
(MI->getOpcode()) { - case MBlaze::OR: - case MBlaze::ADD: - if ((MI->getOperand(0).isReg()) && - (MI->getOperand(2).isReg()) && - (MI->getOperand(2).getReg() == MBlaze::R0) && - (MI->getOperand(1).isReg())) { - if (Ops[0] == 0) { // COPY -> STORE - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(MBlaze::SW)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addImm(0).addFrameIndex(FI); - } else { // COPY -> LOAD - unsigned DstReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(MBlaze::LW)) - .addReg(DstReg, RegState::Define | getDeadRegState(isDead) | - getUndefRegState(isUndef)) - .addImm(0).addFrameIndex(FI); - } - } - break; - } - - return NewMI; -} - //===----------------------------------------------------------------------===// // Branch Analysis //===----------------------------------------------------------------------===// unsigned MBlazeInstrInfo:: InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Can only insert uncond branches so far. assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!"); - BuildMI(&MBB, DebugLoc(), get(MBlaze::BRI)).addMBB(TBB); + BuildMI(&MBB, DL, get(MBlaze::BRI)).addMBB(TBB); return 1; } @@ -209,12 +170,8 @@ unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); GlobalBaseReg = RegInfo.createVirtualRegister(MBlaze::CPURegsRegisterClass); - bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, MBlaze::R20, - MBlaze::CPURegsRegisterClass, - MBlaze::CPURegsRegisterClass, - DebugLoc()); - assert(Ok && "Couldn't assign to global base register!"); - Ok = Ok; // Silence warning when assertions are turned off. 
+ BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), + GlobalBaseReg).addReg(MBlaze::R20); RegInfo.addLiveIn(MBlaze::R20); MBlazeFI->setGlobalBaseReg(GlobalBaseReg); diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index c9fdc88..f074370 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -198,13 +198,12 @@ public: /// Branch Analysis virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -217,18 +216,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } - /// Insert nop instruction when hazard condition is found virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index f15eea9..8cafa8c 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -148,22 +148,6 @@ getCalleeSavedRegs(const MachineFunction *MF) const { return CalleeSavedRegs; } -/// MBlaze Callee Saved Register Classes -const TargetRegisterClass* const* MBlazeRegisterInfo:: -getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRC[] = { - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - &MBlaze::CPURegsRegClass, &MBlaze::CPURegsRegClass, - 0 - }; - - return CalleeSavedRC; -} - BitVector MBlazeRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index b618bf4..af97b0e 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -54,9 +54,6 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { /// Code Generation virtual methods... 
const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction* MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; bool hasFP(const MachineFunction &MF) const; diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index 3de173c..8f97d25 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -808,7 +808,7 @@ void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) { std::string Name; switch (Inst->getIntrinsicID()) { case Intrinsic::vastart: - Name = getValueName(Inst->getOperand(1)); + Name = getValueName(Inst->getArgOperand(0)); Name.insert(Name.length()-1,"$valist"); // Obtain the argument handle. printSimpleInstruction("ldloca",Name.c_str()); @@ -817,20 +817,20 @@ void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) { "instance void [mscorlib]System.ArgIterator::.ctor" "(valuetype [mscorlib]System.RuntimeArgumentHandle)"); // Save as pointer type "void*" - printValueLoad(Inst->getOperand(1)); + printValueLoad(Inst->getArgOperand(0)); printSimpleInstruction("ldloca",Name.c_str()); printIndirectSave(PointerType::getUnqual( IntegerType::get(Inst->getContext(), 8))); break; case Intrinsic::vaend: // Close argument list handle. - printIndirectLoad(Inst->getOperand(1)); + printIndirectLoad(Inst->getArgOperand(0)); printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()"); break; case Intrinsic::vacopy: // Copy "ArgIterator" valuetype. - printIndirectLoad(Inst->getOperand(1)); - printIndirectLoad(Inst->getOperand(2)); + printIndirectLoad(Inst->getArgOperand(0)); + printIndirectLoad(Inst->getArgOperand(1)); printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator"); break; default: @@ -845,10 +845,11 @@ void MSILWriter::printCallInstruction(const Instruction* Inst) { // Handle intrinsic function. printIntrinsicCall(cast<IntrinsicInst>(Inst)); } else { + const CallInst *CI = cast<CallInst>(Inst); // Load arguments to stack and call function. 
- for (int I = 1, E = Inst->getNumOperands(); I!=E; ++I) - printValueLoad(Inst->getOperand(I)); - printFunctionCall(Inst->getOperand(0),Inst); + for (int I = 0, E = CI->getNumArgOperands(); I!=E; ++I) + printValueLoad(CI->getArgOperand(I)); + printFunctionCall(CI->getCalledFunction(), Inst); } } @@ -1002,8 +1003,8 @@ void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) { std::string Label = "leave$normal_"+utostr(getUniqID()); Out << ".try {\n"; // Load arguments - for (int I = 3, E = Inst->getNumOperands(); I!=E; ++I) - printValueLoad(Inst->getOperand(I)); + for (int I = 0, E = Inst->getNumArgOperands(); I!=E; ++I) + printValueLoad(Inst->getArgOperand(I)); // Print call instruction printFunctionCall(Inst->getOperand(0),Inst); // Save function result and leave "try" block @@ -1280,7 +1281,7 @@ void MSILWriter::printLocalVariables(const Function& F) { case Intrinsic::vaend: case Intrinsic::vacopy: isVaList = true; - VaList = Inst->getOperand(1); + VaList = Inst->getArgOperand(0); break; default: isVaList = false; diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 7b328bb..3395e9f 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -272,7 +272,8 @@ bool MSP430DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, AM.Base.Reg; if (AM.GV) - Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i16, AM.Disp, + Disp = CurDAG->getTargetGlobalAddress(AM.GV, Op->getDebugLoc(), + MVT::i16, AM.Disp, 0/*AM.SymbolFlags*/); else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16, diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 403400e..a1703a3 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -278,6 +278,7 @@ MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -290,7 +291,7 @@ MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee, case CallingConv::Fast: case CallingConv::C: return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall, - Outs, Ins, dl, DAG, InVals); + Outs, OutVals, Ins, dl, DAG, InVals); case CallingConv::MSP430_INTR: report_fatal_error("ISRs cannot be called directly"); return SDValue(); @@ -369,7 +370,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, << "\n"; } // Create the frame index object for this incoming parameter... - int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true, false); + int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true); // Create the SelectionDAG nodes corresponding to a load //from this parameter @@ -387,6 +388,7 @@ SDValue MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location @@ -421,7 +423,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); // Guarantee that all emitted copies are stuck together, // avoiding something bad. 
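
Note on the OutVals migration visible above and in the hunks that follow: across this commit the LowerCall/LowerReturn hooks (and helpers such as LowerCCCCallTo) gain a separate SmallVectorImpl<SDValue> &OutVals parameter, and the value that used to be carried as Outs[i].Val is now read from OutVals[i], while Outs keeps only the per-argument metadata such as Flags. A minimal sketch of the resulting loop shape, assuming the usual CCState/ArgLocs setup from the surrounding code (Flag here is the ordinary glue value, default-initialized purely for illustration):

    SDValue Flag;
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      SDValue Arg = OutVals[i];      // was Outs[i].Val before this commit;
                                     // Outs[i].Flags still holds the metadata
      if (VA.isRegLoc())             // register-assigned argument
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    }

The two vectors stay index-aligned, which is why each target below can switch to OutVals[i] with no other bookkeeping.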
@@ -447,6 +449,7 @@
MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
bool isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
@@ -471,7 +474,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];

- SDValue Arg = Outs[i].Val;
+ SDValue Arg = OutVals[i];

// Promote the value if needed.
switch (VA.getLocInfo()) {
@@ -529,7 +532,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i16);
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i16);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
@@ -642,7 +645,8 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();

// Create the TargetGlobalAddress node, folding in the constant offset.
- SDValue Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ SDValue Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+ getPointerTy(), Offset);
return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
getPointerTy(), Result);
}
@@ -888,7 +892,7 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
// Set up a frame object for the return address.
uint64_t SlotSize = TD->getPointerSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
- true, false);
+ true);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -1070,7 +1074,10 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,

// Update machine-CFG edges by transferring all successors of the current
// block to the block containing instructions after shift.
- RemBB->transferSuccessors(BB);
+ RemBB->splice(RemBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ RemBB->transferSuccessorsAndUpdatePHIs(BB);

// Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
BB->addSuccessor(LoopBB);
@@ -1116,11 +1123,11 @@

// RemBB:
// DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
- BuildMI(RemBB, dl, TII.get(MSP430::PHI), DstReg)
+ BuildMI(*RemBB, RemBB->begin(), dl, TII.get(MSP430::PHI), DstReg)
.addReg(SrcReg).addMBB(BB)
.addReg(ShiftReg2).addMBB(LoopBB);

- F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
return RemBB;
}

@@ -1158,18 +1165,22 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineFunction *F = BB->getParent();
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
- BuildMI(BB, dl, TII.get(MSP430::JCC))
- .addMBB(copy1MBB)
- .addImm(MI->getOperand(3).getImm());
F->insert(I, copy0MBB);
F->insert(I, copy1MBB);

// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi node for the select. 
- copy1MBB->transferSuccessors(BB); + copy1MBB->splice(copy1MBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + copy1MBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(copy1MBB); + BuildMI(BB, dl, TII.get(MSP430::JCC)) + .addMBB(copy1MBB) + .addImm(MI->getOperand(3).getImm()); + // copy0MBB: // %FalseValue = ... // # fallthrough to copy1MBB @@ -1182,11 +1193,11 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = copy1MBB; - BuildMI(BB, dl, TII.get(MSP430::PHI), + BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 01c5071..673c543 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -127,6 +127,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -155,6 +156,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -163,6 +165,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 18226ab..df28d07 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -83,27 +83,20 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Cannot store this register to stack slot!"); } -bool MSP430InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == SrcRC) { - unsigned Opc; - if (DestRC == &MSP430::GR16RegClass) { - Opc = MSP430::MOV16rr; - } else if (DestRC == &MSP430::GR8RegClass) { - Opc = MSP430::MOV8rr; - } else { - return false; - } - - BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg); - return true; - } +void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned Opc; + if (MSP430::GR16RegClass.contains(DestReg, SrcReg)) + Opc = MSP430::MOV16rr; + else if (MSP430::GR8RegClass.contains(DestReg, SrcReg)) + Opc = MSP430::MOV8rr; + else + llvm_unreachable("Impossible reg-to-reg copy"); - return false; + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } 
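
Note on the copyRegToReg to copyPhysReg migration just completed above (and in the MBlaze hunks earlier): the old hook returned bool and could decline a copy, while the new one receives the DebugLoc and a KillSrc flag, picks an opcode from the physical register classes, and must succeed; an unsupported pair is a hard error. A hedged sketch of the template, with Foo standing in for a target (the class and opcode names mirror the MSP430 hunk above, not a promise about any other backend):

    void FooInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   unsigned DestReg, unsigned SrcReg,
                                   bool KillSrc) const {
      unsigned Opc;
      if (Foo::GR16RegClass.contains(DestReg, SrcReg))  // both physregs in class
        Opc = Foo::MOV16rr;
      else if (Foo::GR8RegClass.contains(DestReg, SrcReg))
        Opc = Foo::MOV8rr;
      else
        llvm_unreachable("Impossible reg-to-reg copy"); // no 'return false' path
      BuildMI(MBB, I, DL, get(Opc), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));    // forward the kill flag
    }

Callers that used to branch on the bool result (see the assert(Ok) removal in MBlazeInstrInfo::getGlobalBaseReg above) can instead emit a TargetOpcode::COPY and let the target expand it.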
bool @@ -330,10 +323,8 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc operand - DebugLoc DL; - + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index 842b4cb..ebbda1a 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -49,11 +49,10 @@ public: /// virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; } - bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; bool isMoveInstr(const MachineInstr& MI, unsigned &SrcReg, unsigned &DstReg, @@ -93,7 +92,8 @@ public: unsigned RemoveBranch(MachineBasicBlock &MBB) const; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; }; diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 6b9a2f2..8792b22 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -25,13 +25,16 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>; def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; -def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; def SDT_MSP430Cmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; def SDT_MSP430BrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>; -def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, +def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisI8<2>]>; +def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisI8<2>]>; //===----------------------------------------------------------------------===// // MSP430 Specific Node Definitions. 
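
Note on the custom-inserter rewrites above (MBlaze shift and select_cc, MSP430 EmitShiftInstr and select): all of them move to the same recipe. The tail of the current block after the pseudo is spliced into the join block, transferSuccessorsAndUpdatePHIs moves the successor edges and rewrites PHIs in the old successors, the result PHI is built at the join block's begin(), and the pseudo is dropped with MI->eraseFromParent() rather than F->DeleteMachineInstr(MI). A condensed sketch, with SinkMBB, TrueReg, and Foo::PHI as illustrative stand-ins for the per-target names (finish/dneBB/RemBB/copy1MBB above):

    MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, SinkMBB);
    SinkMBB->splice(SinkMBB->begin(), BB,          // move everything after MI
                    llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkMBB->transferSuccessorsAndUpdatePHIs(BB);  // edges plus PHI operands
    BB->addSuccessor(SinkMBB);                     // re-add the edge we want
    BuildMI(*SinkMBB, SinkMBB->begin(), dl,        // PHI must lead the block
            TII->get(Foo::PHI), MI->getOperand(0).getReg())
        .addReg(TrueReg).addMBB(BB);
    MI->eraseFromParent();                         // pseudo no longer needed

This also explains why each hunk moves the conditional-branch BuildMI after the splice: a branch appended to BB before the splice would sit after MI and be carried into the join block with the rest of the tail.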
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 842b4cb..ebbda1a 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -49,11 +49,10 @@ public:
   ///
   virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
 
-  bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                    unsigned DestReg, unsigned SrcReg,
-                    const TargetRegisterClass *DestRC,
-                    const TargetRegisterClass *SrcRC,
-                    DebugLoc DL) const;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
 
   bool isMoveInstr(const MachineInstr& MI,
                    unsigned &SrcReg, unsigned &DstReg,
@@ -93,7 +92,8 @@ public:
   unsigned RemoveBranch(MachineBasicBlock &MBB) const;
   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                         MachineBasicBlock *FBB,
-                        const SmallVectorImpl<MachineOperand> &Cond) const;
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const;
 };
 
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 6b9a2f2..8792b22 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -25,13 +25,16 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
 def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
 def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
 def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
-def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                             SDTCisPtrTy<0>]>;
 def SDT_MSP430Cmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
 def SDT_MSP430BrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
                                           SDTCisVT<1, i8>]>;
-def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+def SDT_MSP430SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                              SDTCisSameAs<1, 2>,
                                               SDTCisVT<3, i8>]>;
-def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisI8<2>]>;
+def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                           SDTCisI8<2>]>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Specific Node Definitions.
@@ -46,7 +49,7 @@ def MSP430rla : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
 def MSP430rrc : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
 
 def MSP430call : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
-                     [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+                     [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag, SDNPVariadic]>;
 def MSP430callseq_start :
                  SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
                         [SDNPHasChain, SDNPOutFlag]>;
@@ -55,8 +58,10 @@ def MSP430callseq_end :
                  [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
 def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
 def MSP430cmp : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutFlag]>;
-def MSP430brcc : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC, [SDNPHasChain, SDNPInFlag]>;
-def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC, [SDNPInFlag]>;
+def MSP430brcc : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
+                        [SDNPHasChain, SDNPInFlag]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
+                           [SDNPInFlag]>;
 def MSP430shl : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
 def MSP430sra : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
 def MSP430srl : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
@@ -117,14 +122,14 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
 }
 
 let usesCustomInserter = 1 in {
-  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cc),
+  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
                         "# Select8 PSEUDO",
                         [(set GR8:$dst,
-                          (MSP430selectcc GR8:$src1, GR8:$src2, imm:$cc))]>;
-  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cc),
+                          (MSP430selectcc GR8:$src, GR8:$src2, imm:$cc))]>;
+  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR16:$src2, i8imm:$cc),
                         "# Select16 PSEUDO",
                         [(set GR16:$dst,
-                          (MSP430selectcc GR16:$src1, GR16:$src2, imm:$cc))]>;
+                          (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
   let Defs = [SRW] in {
   def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                     "# Shl8 PSEUDO",
@@ -330,60 +335,60 @@ def MOV16mm : I16mm<0x0,
 //===----------------------------------------------------------------------===//
 // Arithmetic Instructions
 
-let isTwoAddress = 1 in {
+let Constraints = "$src = $dst" in {
 
 let Defs = [SRW] in {
 
 let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y
 def ADD8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "add.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def ADD16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 }
 
 def ADD8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "add.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def ADD16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1,
-Constraints = "$base = $base_wb, $src1 = $dst" in {
+Constraints = "$base = $base_wb, $src = $dst" in {
 
 def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
                          (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src1, GR16:$base),
+                         (ins GR8:$src, GR16:$base),
                          "add.b\t{@$base+, $dst}", []>;
 def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                            (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src1, GR16:$base),
+                           (ins GR16:$src, GR16:$base),
                            "add.w\t{@$base+, $dst}", []>;
 }
 
 def ADD8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "add.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (add GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
 def ADD16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def ADD8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "add.b\t{$src, $dst}",
@@ -424,40 +429,40 @@ let Uses = [SRW] in {
 
 let isCommutable = 1 in { // X = ADDC Y, Z == X = ADDC Z, Y
 def ADC8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "addc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (adde GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def ADC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 } // isCommutable
 
 def ADC8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "addc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (adde GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
def ADC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 def ADC8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "addc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def ADC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def ADC8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "addc.b\t{$src, $dst}",
@@ -498,52 +503,52 @@ def ADC16mm : I8mm<0x0,
 
 let isCommutable = 1 in { // X = AND Y, Z == X = AND Z, Y
 def AND8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "and.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def AND16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 }
 
 def AND8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "and.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (and GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
 def AND16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 def AND8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "and.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (and GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def AND16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1,
-Constraints = "$base = $base_wb, $src1 = $dst" in {
+Constraints = "$base = $base_wb, $src = $dst" in {
 
 def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
                          (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src1, GR16:$base),
+                         (ins GR8:$src, GR16:$base),
                          "and.b\t{@$base+, $dst}", []>;
 def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                            (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src1, GR16:$base),
+                           (ins GR16:$src, GR16:$base),
                            "and.w\t{@$base+, $dst}", []>;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def AND8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "and.b\t{$src, $dst}",
@@ -582,46 +587,46 @@ def AND16mm : I16mm<0x0,
 
 let isCommutable = 1 in { // X = OR Y, Z == X = OR Z, Y
 def OR8rr : I8rr<0x0,
-                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                 (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                  "bis.b\t{$src2, $dst}",
-                 [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+                 [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
 def OR16rr : I16rr<0x0,
-                   (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                    "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>;
+                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
 }
 
 def OR8ri : I8ri<0x0,
-                 (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                 (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                  "bis.b\t{$src2, $dst}",
-                 [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+                 [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
 def OR16ri : I16ri<0x0,
-                   (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                    "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>;
+                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
 def OR8rm : I8rm<0x0,
-                 (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                 (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                  "bis.b\t{$src2, $dst}",
-                 [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+                 [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
 def OR16rm : I16rm<0x0,
-                   (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                    "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>;
+                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1,
-Constraints = "$base = $base_wb, $src1 = $dst" in {
+Constraints = "$base = $base_wb, $src = $dst" in {
 
 def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
                         (outs GR8:$dst, GR16:$base_wb),
-                        (ins GR8:$src1, GR16:$base),
+                        (ins GR8:$src, GR16:$base),
                         "bis.b\t{@$base+, $dst}", []>;
 def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                           (outs GR16:$dst, GR16:$base_wb),
-                          (ins GR16:$src1, GR16:$base),
+                          (ins GR16:$src, GR16:$base),
                           "bis.w\t{@$base+, $dst}", []>;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def OR8mr : I8mr<0x0,
                  (outs), (ins memdst:$dst, GR8:$src),
                  "bis.b\t{$src, $dst}",
@@ -654,24 +659,24 @@ def OR16mm : I16mm<0x0,
 
 // bic does not modify condition codes
 def BIC8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "bic.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (and GR8:$src1, (not GR8:$src2)))]>;
+                  [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
 def BIC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src1, (not GR16:$src2)))]>;
+                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
 def BIC8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "bic.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (and GR8:$src1, (not (i8 (load addr:$src2)))))]>;
+                  [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
 def BIC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src1, (not (i16 (load addr:$src2)))))]>;
+                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def BIC8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "bic.b\t{$src, $dst}",
@@ -695,52 +700,52 @@ def BIC16mm : I16mm<0x0,
 
 let isCommutable = 1 in { // X = XOR Y, Z == X = XOR Z, Y
 def XOR8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "xor.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def XOR16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 }
 
 def XOR8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "xor.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
 def XOR16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 def XOR8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "xor.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def XOR16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1,
-Constraints = "$base = $base_wb, $src1 = $dst" in {
+Constraints = "$base = $base_wb, $src = $dst" in {
 
 def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
                          (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src1, GR16:$base),
+                         (ins GR8:$src, GR16:$base),
                          "xor.b\t{@$base+, $dst}", []>;
 def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                            (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src1, GR16:$base),
+                           (ins GR16:$src, GR16:$base),
                            "xor.w\t{@$base+, $dst}", []>;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def XOR8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "xor.b\t{$src, $dst}",
@@ -777,51 +782,51 @@ def XOR16mm : I16mm<0x0,
 
 def SUB8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "sub.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def SUB16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 
 def SUB8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "sub.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
 def SUB16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 
 def SUB8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "sub.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def SUB16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1,
-Constraints = "$base = $base_wb, $src1 = $dst" in {
+Constraints = "$base = $base_wb, $src = $dst" in {
 
 def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
                          (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src1, GR16:$base),
+                         (ins GR8:$src, GR16:$base),
                          "sub.b\t{@$base+, $dst}", []>;
 def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                            (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src1, GR16:$base),
+                           (ins GR16:$src, GR16:$base),
                            "sub.w\t{@$base+, $dst}", []>;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def SUB8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "sub.b\t{$src, $dst}",
@@ -860,39 +865,39 @@ def SUB16mm : I16mm<0x0,
 
 let Uses = [SRW] in {
 def SBC8rr : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                   "subc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sube GR8:$src1, GR8:$src2)),
+                  [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
                    (implicit SRW)]>;
 def SBC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src1, GR16:$src2)),
+                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
                      (implicit SRW)]>;
 
 def SBC8ri : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                   "subc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sube GR8:$src1, imm:$src2)),
+                  [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
                    (implicit SRW)]>;
 def SBC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src1, imm:$src2)),
+                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
                      (implicit SRW)]>;
 
 def SBC8rm : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                   "subc.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2))),
+                  [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
                    (implicit SRW)]>;
 def SBC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2))),
+                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
                      (implicit SRW)]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
 def SBC8mr : I8mr<0x0,
                   (outs), (ins memdst:$dst, GR8:$src),
                   "subc.b\t{$src, $dst}",
@@ -985,59 +990,59 @@ def SWPB16r : II16r<0x0,
                     "swpb\t$dst",
                     [(set GR16:$dst, (bswap GR16:$src))]>;
 
-} // isTwoAddress = 1
+} // Constraints = "$src = $dst"
 
 // Integer comparisons
 let Defs = [SRW] in {
 def CMP8rr : I8rr<0x0,
-                  (outs), (ins GR8:$src1, GR8:$src2),
-                  "cmp.b\t{$src2, $src1}",
-                  [(MSP430cmp GR8:$src1, GR8:$src2), (implicit SRW)]>;
+                  (outs), (ins GR8:$src, GR8:$src2),
+                  "cmp.b\t{$src2, $src}",
+                  [(MSP430cmp GR8:$src, GR8:$src2), (implicit SRW)]>;
 def CMP16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src1, GR16:$src2),
-                    "cmp.w\t{$src2, $src1}",
-                    [(MSP430cmp GR16:$src1, GR16:$src2), (implicit SRW)]>;
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SRW)]>;
 
 def CMP8ri : I8ri<0x0,
-                  (outs), (ins GR8:$src1, i8imm:$src2),
-                  "cmp.b\t{$src2, $src1}",
-                  [(MSP430cmp GR8:$src1, imm:$src2), (implicit SRW)]>;
+                  (outs), (ins GR8:$src, i8imm:$src2),
+                  "cmp.b\t{$src2, $src}",
+                  [(MSP430cmp GR8:$src, imm:$src2), (implicit SRW)]>;
 def CMP16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src1, i16imm:$src2),
-                    "cmp.w\t{$src2, $src1}",
-                    [(MSP430cmp GR16:$src1, imm:$src2), (implicit SRW)]>;
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SRW)]>;
 
 def CMP8mi : I8mi<0x0,
-                  (outs), (ins memsrc:$src1, i8imm:$src2),
-                  "cmp.b\t{$src2, $src1}",
-                  [(MSP430cmp (load addr:$src1),
+                  (outs), (ins memsrc:$src, i8imm:$src2),
+                  "cmp.b\t{$src2, $src}",
+                  [(MSP430cmp (load addr:$src),
                               (i8 imm:$src2)), (implicit SRW)]>;
 def CMP16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src1, i16imm:$src2),
-                    "cmp.w\t{$src2, $src1}",
-                    [(MSP430cmp (load addr:$src1),
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp (load addr:$src),
                                 (i16 imm:$src2)), (implicit SRW)]>;
 
 def CMP8rm : I8rm<0x0,
-                  (outs), (ins GR8:$src1, memsrc:$src2),
-                  "cmp.b\t{$src2, $src1}",
-                  [(MSP430cmp GR8:$src1, (load addr:$src2)),
+                  (outs), (ins GR8:$src, memsrc:$src2),
+                  "cmp.b\t{$src2, $src}",
+                  [(MSP430cmp GR8:$src, (load addr:$src2)),
                    (implicit SRW)]>;
 def CMP16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src1, memsrc:$src2),
-                    "cmp.w\t{$src2, $src1}",
-                    [(MSP430cmp GR16:$src1, (load addr:$src2)),
+                    (outs), (ins GR16:$src, memsrc:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, (load addr:$src2)),
                      (implicit SRW)]>;
 
 def CMP8mr : I8mr<0x0,
-                  (outs), (ins memsrc:$src1, GR8:$src2),
-                  "cmp.b\t{$src2, $src1}",
-                  [(MSP430cmp (load addr:$src1), GR8:$src2),
+                  (outs), (ins memsrc:$src, GR8:$src2),
+                  "cmp.b\t{$src2, $src}",
+                  [(MSP430cmp (load addr:$src), GR8:$src2),
                    (implicit SRW)]>;
 def CMP16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src1, GR16:$src2),
-                    "cmp.w\t{$src2, $src1}",
-                    [(MSP430cmp (load addr:$src1), GR16:$src2),
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp (load addr:$src), GR16:$src2),
                      (implicit SRW)]>;
 
@@ -1045,71 +1050,71 @@ def CMP16mr : I16mr<0x0,
 // Note that the C condition is set differently than when using CMP.
 let isCommutable = 1 in {
 def BIT8rr : I8rr<0x0,
-                  (outs), (ins GR8:$src1, GR8:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su GR8:$src1, GR8:$src2), 0),
+                  (outs), (ins GR8:$src, GR8:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
                    (implicit SRW)]>;
 def BIT16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src1, GR16:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su GR16:$src1, GR16:$src2), 0),
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
                      (implicit SRW)]>;
 }
 def BIT8ri : I8ri<0x0,
-                  (outs), (ins GR8:$src1, i8imm:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su GR8:$src1, imm:$src2), 0),
+                  (outs), (ins GR8:$src, i8imm:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
                    (implicit SRW)]>;
 def BIT16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src1, i16imm:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su GR16:$src1, imm:$src2), 0),
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
                      (implicit SRW)]>;
 
 def BIT8rm : I8rm<0x0,
-                  (outs), (ins GR8:$src1, memdst:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su GR8:$src1, (load addr:$src2)), 0),
+                  (outs), (ins GR8:$src, memdst:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su GR8:$src, (load addr:$src2)), 0),
                    (implicit SRW)]>;
 def BIT16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src1, memdst:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su GR16:$src1, (load addr:$src2)), 0),
+                    (outs), (ins GR16:$src, memdst:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, (load addr:$src2)), 0),
                      (implicit SRW)]>;
 
 def BIT8mr : I8mr<0x0,
-                  (outs), (ins memsrc:$src1, GR8:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su (load addr:$src1), GR8:$src2), 0),
+                  (outs), (ins memsrc:$src, GR8:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
                    (implicit SRW)]>;
 def BIT16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src1, GR16:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su (load addr:$src1), GR16:$src2), 0),
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
                      (implicit SRW)]>;
 
 def BIT8mi : I8mi<0x0,
-                  (outs), (ins memsrc:$src1, i8imm:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su (load addr:$src1), (i8 imm:$src2)), 0),
+                  (outs), (ins memsrc:$src, i8imm:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
                    (implicit SRW)]>;
 def BIT16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src1, i16imm:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su (load addr:$src1), (i16 imm:$src2)), 0),
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
                      (implicit SRW)]>;
 
 def BIT8mm : I8mm<0x0,
-                  (outs), (ins memsrc:$src1, memsrc:$src2),
-                  "bit.b\t{$src2, $src1}",
-                  [(MSP430cmp (and_su (i8 (load addr:$src1)),
+                  (outs), (ins memsrc:$src, memsrc:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (i8 (load addr:$src)),
                                       (load addr:$src2)),
                               0),
                    (implicit SRW)]>;
 def BIT16mm : I16mm<0x0,
-                    (outs), (ins memsrc:$src1, memsrc:$src2),
-                    "bit.w\t{$src2, $src1}",
-                    [(MSP430cmp (and_su (i16 (load addr:$src1)),
+                    (outs), (ins memsrc:$src, memsrc:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (i16 (load addr:$src)),
                                         (load addr:$src2)),
                                 0),
                      (implicit SRW)]>;
@@ -1134,12 +1139,12 @@ def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
 def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
 def : Pat<(i16 (MSP430Wrapper tblockaddress:$dst)), (MOV16ri tblockaddress:$dst)>;
 
-def : Pat<(add GR16:$src1, (MSP430Wrapper tglobaladdr :$src2)),
-          (ADD16ri GR16:$src1, tglobaladdr:$src2)>;
-def : Pat<(add GR16:$src1, (MSP430Wrapper texternalsym:$src2)),
-          (ADD16ri GR16:$src1, texternalsym:$src2)>;
-def : Pat<(add GR16:$src1, (MSP430Wrapper tblockaddress:$src2)),
-          (ADD16ri GR16:$src1, tblockaddress:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tglobaladdr :$src2)),
+          (ADD16ri GR16:$src, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper texternalsym:$src2)),
+          (ADD16ri GR16:$src, texternalsym:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tblockaddress:$src2)),
+          (ADD16ri GR16:$src, tblockaddress:$src2)>;
 
 def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
           (MOV16mi addr:$dst, tglobaladdr:$src)>;
@@ -1155,45 +1160,45 @@ def : Pat<(MSP430call (i16 texternalsym:$dst)),
           (CALLi texternalsym:$dst)>;
 
 // add and sub always produce carry
-def : Pat<(addc GR16:$src1, GR16:$src2),
-          (ADD16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(addc GR16:$src1, (load addr:$src2)),
-          (ADD16rm GR16:$src1, addr:$src2)>;
-def : Pat<(addc GR16:$src1, imm:$src2),
-          (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(addc GR16:$src, GR16:$src2),
+          (ADD16rr GR16:$src, GR16:$src2)>;
+def : Pat<(addc GR16:$src, (load addr:$src2)),
+          (ADD16rm GR16:$src, addr:$src2)>;
+def : Pat<(addc GR16:$src, imm:$src2),
+          (ADD16ri GR16:$src, imm:$src2)>;
 def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
           (ADD16mr addr:$dst, GR16:$src)>;
 def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
           (ADD16mm addr:$dst, addr:$src)>;
 
-def : Pat<(addc GR8:$src1, GR8:$src2),
-          (ADD8rr GR8:$src1, GR8:$src2)>;
-def : Pat<(addc GR8:$src1, (load addr:$src2)),
-          (ADD8rm GR8:$src1, addr:$src2)>;
-def : Pat<(addc GR8:$src1, imm:$src2),
-          (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(addc GR8:$src, GR8:$src2),
+          (ADD8rr GR8:$src, GR8:$src2)>;
+def : Pat<(addc GR8:$src, (load addr:$src2)),
+          (ADD8rm GR8:$src, addr:$src2)>;
+def : Pat<(addc GR8:$src, imm:$src2),
+          (ADD8ri GR8:$src, imm:$src2)>;
 def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
           (ADD8mr addr:$dst, GR8:$src)>;
 def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
          (ADD8mm addr:$dst, addr:$src)>;
 
-def : Pat<(subc GR16:$src1, GR16:$src2),
-          (SUB16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(subc GR16:$src1, (load addr:$src2)),
-          (SUB16rm GR16:$src1, addr:$src2)>;
-def : Pat<(subc GR16:$src1, imm:$src2),
-          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(subc GR16:$src, GR16:$src2),
+          (SUB16rr GR16:$src, GR16:$src2)>;
+def : Pat<(subc GR16:$src, (load addr:$src2)),
+          (SUB16rm GR16:$src, addr:$src2)>;
+def : Pat<(subc GR16:$src, imm:$src2),
+          (SUB16ri GR16:$src, imm:$src2)>;
 def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
           (SUB16mr addr:$dst, GR16:$src)>;
 def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
           (SUB16mm addr:$dst, addr:$src)>;
 
-def : Pat<(subc GR8:$src1, GR8:$src2),
-          (SUB8rr GR8:$src1, GR8:$src2)>;
-def : Pat<(subc GR8:$src1, (load addr:$src2)),
-          (SUB8rm GR8:$src1, addr:$src2)>;
-def : Pat<(subc GR8:$src1, imm:$src2),
-          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(subc GR8:$src, GR8:$src2),
+          (SUB8rr GR8:$src, GR8:$src2)>;
+def : Pat<(subc GR8:$src, (load addr:$src2)),
+          (SUB8rm GR8:$src, addr:$src2)>;
+def : Pat<(subc GR8:$src, imm:$src2),
+          (SUB8ri GR8:$src, imm:$src2)>;
 def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
           (SUB8mr addr:$dst, GR8:$src)>;
 def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
@@ -1201,6 +1206,6 @@ def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
           (SUB8mm addr:$dst, addr:$src)>;
 
 // peephole patterns
 def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
-def : Pat<(MSP430cmp (trunc (and_su GR16:$src1, GR16:$src2)), 0),
-          (BIT8rr (EXTRACT_SUBREG GR16:$src1, subreg_8bit),
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src, GR16:$src2)), 0),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src, subreg_8bit),
                   (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
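An aside on the recurring isTwoAddress to Constraints change in the .td files above: "$src = $dst" states the same operand tie explicitly. A sketch (the helper name is hypothetical) of how that tie is visible from C++ once TableGen has emitted the instruction descriptions:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrDesc.h"
using namespace llvm;

// Hypothetical query: true when use operand 1 must be allocated to the
// same register as def operand 0, which is exactly what the
// Constraints = "$src = $dst" string encodes.
static bool hasTiedFirstOperand(const MachineInstr &MI) {
  return MI.getDesc().getOperandConstraint(1, TOI::TIED_TO) == 0;
}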
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 0cae267..608ca49 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -71,48 +71,6 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 }
 
-const TargetRegisterClass *const *
-MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
-  const Function* F = MF->getFunction();
-  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    0
-  };
-  static const TargetRegisterClass * const CalleeSavedRegClassesFP[] = {
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, 0
-  };
-  static const TargetRegisterClass * const CalleeSavedRegClassesIntr[] = {
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    0
-  };
-  static const TargetRegisterClass * const CalleeSavedRegClassesIntrFP[] = {
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
-    &MSP430::GR16RegClass, 0
-  };
-
-  if (hasFP(*MF))
-    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
-            CalleeSavedRegClassesIntrFP : CalleeSavedRegClassesFP);
-  else
-    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
-            CalleeSavedRegClassesIntr : CalleeSavedRegClasses);
-}
-
 BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
 
@@ -270,8 +228,8 @@ MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
                                                                          const {
   // Create a frame entry for the FPW register that must be saved.
   if (hasFP(MF)) {
-    int ATTRIBUTE_UNUSED FrameIdx =
-      MF.getFrameInfo()->CreateFixedObject(2, -4, true, false);
+    int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
+    (void)FrameIdx;
     assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
            "Slot for FPW register must be last in order to be found!");
   }
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index c8684df..6e58d31 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -36,9 +36,6 @@ public:
   /// Code Generation virtual methods...
   const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
 
-  const TargetRegisterClass* const*
-  getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
-
   BitVector getReservedRegs(const MachineFunction &MF) const;
   const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
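A short illustration of the assert-only-variable idiom the hunk above switches to (a sketch, with MFI assumed in scope): casting to void silences the unused-variable warning in NDEBUG builds, replacing the ATTRIBUTE_UNUSED annotation, while CreateFixedObject also loses its trailing bool in this patch series.

// Sketch, not from the patch:
int FrameIdx = MFI->CreateFixedObject(2, -4, /*Immutable=*/true);
(void)FrameIdx;  // only read inside the assert below
assert(FrameIdx == MFI->getObjectIndexBegin() &&
       "fixed slot must be allocated first");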
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index 4ef017a..2037a91 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -180,7 +180,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   ManglerPrefixTy PrefixTy = Mangler::Default;
   if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
     PrefixTy = Mangler::Private;
-  else if (GV->hasLinkerPrivateLinkage())
+  else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage())
     PrefixTy = Mangler::LinkerPrivate;
 
   // If this global has a name, handle it simply.
diff --git a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
index 4d7fe4c..8ae05b7 100644
--- a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
@@ -133,8 +133,9 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
   const MachineFrameInfo *MFI = MF->getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(CSI[i].getReg());
-    if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
+    if (Mips::CPURegsRegisterClass->contains(Reg))
       CPUBitmask |= (1 << RegNum);
     else
       FPUBitmask |= (1 << RegNum);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index e979c3f..b6ff2c3 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -284,6 +284,18 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   MachineFunction *F = BB->getParent();
   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
 
   // Emit the right instruction according to the type of the operands compared
   if (isFPCmp) {
@@ -296,20 +308,6 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BuildMI(BB, dl, TII->get(Mips::BNE)).addReg(MI->getOperand(1).getReg())
       .addReg(Mips::ZERO).addMBB(sinkMBB);
 
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
-  // Update machine-CFG edges by first adding all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
-      e = BB->succ_end(); i != e; ++i)
-    sinkMBB->addSuccessor(*i);
-  // Next, remove all successors of the current block, and add the true
-  // and fallthrough blocks as its successors.
-  while(!BB->succ_empty())
-    BB->removeSuccessor(BB->succ_begin());
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
   //  copy0MBB:
   //   %FalseValue = ...
   //   # fallthrough to sinkMBB
@@ -322,11 +320,12 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   //  ...
   BB = sinkMBB;
-  BuildMI(BB, dl, TII->get(Mips::PHI), MI->getOperand(0).getReg())
+  BuildMI(*BB, BB->begin(), dl,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
     .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
     .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB);
 
-  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
   return BB;
 }
 }
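The same select-lowering restructuring recurs across the targets in this patch. As a sketch (names assumed, following the code above), the CFG diamond is now built by splitting the block in place rather than by hand-copying successor lists:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/ADT/STLExtras.h"  // llvm::next
using namespace llvm;

// Sketch: split BB after the pseudo MI.  splice() moves the trailing
// instructions instead of copying them, and
// transferSuccessorsAndUpdatePHIs() also rewrites PHIs in successors
// that still name BB, the step the removed hand-written loops missed.
static void splitForSelect(MachineBasicBlock *BB, MachineInstr *MI,
                           MachineBasicBlock *sinkMBB) {
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
}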
@@ -490,21 +489,21 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
 
     // %gp_rel relocation
     if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
-      SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+      SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
                                               MipsII::MO_GPREL);
       SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1);
       SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
       return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
     }
     // %hi/%lo relocation
-    SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
                                             MipsII::MO_ABS_HILO);
     SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GA, 1);
     SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA);
     return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
   } else {
-    SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
                                             MipsII::MO_GOT);
     SDValue ResNode = DAG.getLoad(MVT::i32, dl,
                                   DAG.getEntryNode(), GA, NULL, 0,
@@ -768,6 +767,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                               CallingConv::ID CallConv, bool isVarArg,
                               bool &isTailCall,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                              const SmallVectorImpl<SDValue> &OutVals,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               DebugLoc dl, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const {
@@ -787,7 +787,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // the stack (even if less than 4 are used as arguments)
   if (Subtarget->isABI_O32()) {
     int VTsize = EVT(MVT::i32).getSizeInBits()/8;
-    MFI->CreateFixedObject(VTsize, (VTsize*3), true, false);
+    MFI->CreateFixedObject(VTsize, (VTsize*3), true);
     CCInfo.AnalyzeCallOperands(Outs,
                                isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
   } else
@@ -808,7 +808,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    SDValue Arg = Outs[i].Val;
+    SDValue Arg = OutVals[i];
     CCValAssign &VA = ArgLocs[i];
 
     // Promote the value if needed.
@@ -857,7 +857,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // if O32 ABI is used. For EABI the first address is zero.
     LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
     int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
-                                    LastArgStackLoc, true, false);
+                                    LastArgStackLoc, true);
 
     SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
 
@@ -889,7 +889,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // node so that legalize doesn't hack it.
   unsigned char OpFlag = IsPIC ? MipsII::MO_GOT_CALL : MipsII::MO_NO_FLAG;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    Callee = DAG.getTargetGlobalAddress(G->getGlobal(),
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
                                         getPointerTy(), 0, OpFlag);
   else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(),
@@ -929,7 +929,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // Create the frame index only once. SPOffset here can be anything
     // (this will be fixed on processFunctionBeforeFrameFinalized)
     if (MipsFI->getGPStackOffset() == -1) {
-      FI = MFI->CreateFixedObject(4, 0, true, false);
+      FI = MFI->CreateFixedObject(4, 0, true);
      MipsFI->setGPFI(FI);
     }
     MipsFI->setGPStackOffset(LastArgStackLoc);
@@ -1098,7 +1098,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       // offset on PEI::calculateFrameObjectOffsets.
       // Arguments are always 32-bit.
       unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
-      int FI = MFI->CreateFixedObject(ArgSize, 0, true, false);
+      int FI = MFI->CreateFixedObject(ArgSize, 0, true);
       MipsFI->recordLoadArgsFI(FI, -(ArgSize+
         (FirstStackArgLoc + VA.getLocMemOffset())));
 
@@ -1137,7 +1137,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
 
-      int FI = MFI->CreateFixedObject(4, 0, true, false);
+      int FI = MFI->CreateFixedObject(4, 0, true);
       MipsFI->recordStoreVarArgsFI(FI, -(4+(StackLoc*4)));
       SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
       OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, NULL, 0,
@@ -1169,6 +1169,7 @@ SDValue
 MipsTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                const SmallVectorImpl<SDValue> &OutVals,
                                 DebugLoc dl, SelectionDAG &DAG) const {
 
   // CCValAssign - represent the assignment of
@@ -1198,7 +1199,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
     assert(VA.isRegLoc() && "Can only return in registers!");
 
     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             Outs[i].Val, Flag);
+                             OutVals[i], Flag);
 
     // guarantee that all emitted copies are
     // stuck together, avoiding something bad
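A sketch of the recurring Outs/OutVals split (the loop body is assumed, following the hunks above): Outs keeps the per-argument flags while the actual SDValues now travel in a parallel OutVals array, so lowering code indexes the two side by side.

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical excerpt of an argument-lowering loop under the new API.
static void lowerArgs(const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals) {
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    SDValue Arg = OutVals[i];               // value: was Outs[i].Val
    ISD::ArgFlagsTy Flags = Outs[i].Flags;  // flags stay in Outs
    (void)Arg; (void)Flags;                 // ... target-specific lowering
  }
}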
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index f2de489..460747b 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -120,6 +120,7 @@ namespace llvm {
                 CallingConv::ID CallConv, bool isVarArg,
                 bool &isTailCall,
                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<SDValue> &OutVals,
                 const SmallVectorImpl<ISD::InputArg> &Ins,
                 DebugLoc dl, SelectionDAG &DAG,
                 SmallVectorImpl<SDValue> &InVals) const;
@@ -128,6 +129,7 @@ namespace llvm {
       LowerReturn(SDValue Chain,
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
                   DebugLoc dl, SelectionDAG &DAG) const;
 
     virtual MachineBasicBlock *
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 4005e35..6c09a3e 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -127,61 +127,75 @@ insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
   BuildMI(MBB, MI, DL, get(Mips::NOP));
 }
 
-bool MipsInstrInfo::
-copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-             unsigned DestReg, unsigned SrcReg,
-             const TargetRegisterClass *DestRC,
-             const TargetRegisterClass *SrcRC,
-             DebugLoc DL) const {
+void MipsInstrInfo::
+copyPhysReg(MachineBasicBlock &MBB,
+            MachineBasicBlock::iterator I, DebugLoc DL,
+            unsigned DestReg, unsigned SrcReg,
+            bool KillSrc) const {
+  bool DestCPU = Mips::CPURegsRegClass.contains(DestReg);
+  bool SrcCPU  = Mips::CPURegsRegClass.contains(SrcReg);
+
+  // CPU-CPU is the most common.
+  if (DestCPU && SrcCPU) {
+    BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
 
-  if (DestRC != SrcRC) {
-
-    // Copy to/from FCR31 condition register
-    if ((DestRC == Mips::CPURegsRegisterClass) &&
-        (SrcRC == Mips::CCRRegisterClass))
-      BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg).addReg(SrcReg);
-    else if ((DestRC == Mips::CCRRegisterClass) &&
-             (SrcRC == Mips::CPURegsRegisterClass))
-      BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg).addReg(SrcReg);
-
-    // Moves between coprocessors and cpu
-    else if ((DestRC == Mips::CPURegsRegisterClass) &&
-             (SrcRC == Mips::FGR32RegisterClass))
-      BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg).addReg(SrcReg);
-    else if ((DestRC == Mips::FGR32RegisterClass) &&
-             (SrcRC == Mips::CPURegsRegisterClass))
-      BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg).addReg(SrcReg);
-
-    // Move from/to Hi/Lo registers
-    else if ((DestRC == Mips::HILORegisterClass) &&
-             (SrcRC == Mips::CPURegsRegisterClass)) {
-      unsigned Opc = (DestReg == Mips::HI) ? Mips::MTHI : Mips::MTLO;
-      BuildMI(MBB, I, DL, get(Opc), DestReg);
-    } else if ((SrcRC == Mips::HILORegisterClass) &&
-               (DestRC == Mips::CPURegsRegisterClass)) {
-      unsigned Opc = (SrcReg == Mips::HI) ? Mips::MFHI : Mips::MFLO;
-      BuildMI(MBB, I, DL, get(Opc), DestReg);
-    } else
-      // Can't copy this register
-      return false;
+  // Copy to CPU from other registers.
+  if (DestCPU) {
+    if (Mips::CCRRegClass.contains(SrcReg))
+      BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else if (Mips::FGR32RegClass.contains(SrcReg))
+      BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else if (SrcReg == Mips::HI)
+      BuildMI(MBB, I, DL, get(Mips::MFHI), DestReg);
+    else if (SrcReg == Mips::LO)
+      BuildMI(MBB, I, DL, get(Mips::MFLO), DestReg);
+    else
+      llvm_unreachable("Copy to CPU from invalid register");
+    return;
+  }
 
-    return true;
+  // Copy to other registers from CPU.
+  if (SrcCPU) {
+    if (Mips::CCRRegClass.contains(DestReg))
+      BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else if (Mips::FGR32RegClass.contains(DestReg))
+      BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else if (DestReg == Mips::HI)
+      BuildMI(MBB, I, DL, get(Mips::MTHI))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else if (DestReg == Mips::LO)
+      BuildMI(MBB, I, DL, get(Mips::MTLO))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    else
+      llvm_unreachable("Copy from CPU to invalid register");
+    return;
   }
 
-  if (DestRC == Mips::CPURegsRegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
-      .addReg(SrcReg);
-  else if (DestRC == Mips::FGR32RegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg).addReg(SrcReg);
-  else if (DestRC == Mips::AFGR64RegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg).addReg(SrcReg);
-  else if (DestRC == Mips::CCRRegisterClass)
-    BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg).addReg(SrcReg);
-  else
-    // Can't copy this register
-    return false;
+  if (Mips::FGR32RegClass.contains(DestReg, SrcReg)) {
+    BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
 
-  return true;
+  if (Mips::AFGR64RegClass.contains(DestReg, SrcReg)) {
+    BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (Mips::CCRRegClass.contains(DestReg, SrcReg)) {
+    BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  llvm_unreachable("Cannot copy registers");
 }
 
 void MipsInstrInfo::
@@ -247,80 +261,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     llvm_unreachable("Register class not handled!");
 }
 
-MachineInstr *MipsInstrInfo::
-foldMemoryOperandImpl(MachineFunction &MF,
-                      MachineInstr* MI,
-                      const SmallVectorImpl<unsigned> &Ops, int FI) const
-{
-  if (Ops.size() != 1) return NULL;
-
-  MachineInstr *NewMI = NULL;
-
-  switch (MI->getOpcode()) {
-  case Mips::ADDu:
-    if ((MI->getOperand(0).isReg()) &&
-        (MI->getOperand(1).isReg()) &&
-        (MI->getOperand(1).getReg() == Mips::ZERO) &&
-        (MI->getOperand(2).isReg())) {
-      if (Ops[0] == 0) {    // COPY -> STORE
-        unsigned SrcReg = MI->getOperand(2).getReg();
-        bool isKill = MI->getOperand(2).isKill();
-        bool isUndef = MI->getOperand(2).isUndef();
-        NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::SW))
-          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
-          .addImm(0).addFrameIndex(FI);
-      } else {              // COPY -> LOAD
-        unsigned DstReg = MI->getOperand(0).getReg();
-        bool isDead = MI->getOperand(0).isDead();
-        bool isUndef = MI->getOperand(0).isUndef();
-        NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::LW))
-          .addReg(DstReg, RegState::Define | getDeadRegState(isDead) |
-                  getUndefRegState(isUndef))
-          .addImm(0).addFrameIndex(FI);
-      }
-    }
-    break;
-  case Mips::FMOV_S32:
-  case Mips::FMOV_D32:
-    if ((MI->getOperand(0).isReg()) &&
-        (MI->getOperand(1).isReg())) {
-      const TargetRegisterClass
-        *RC = RI.getRegClass(MI->getOperand(0).getReg());
-      unsigned StoreOpc, LoadOpc;
-      bool IsMips1 = TM.getSubtarget<MipsSubtarget>().isMips1();
-
-      if (RC == Mips::FGR32RegisterClass) {
-        LoadOpc = Mips::LWC1; StoreOpc = Mips::SWC1;
-      } else {
-        assert(RC == Mips::AFGR64RegisterClass);
-        // Mips1 doesn't have ldc/sdc instructions.
-        if (IsMips1) break;
-        LoadOpc = Mips::LDC1; StoreOpc = Mips::SDC1;
-      }
-
-      if (Ops[0] == 0) {    // COPY -> STORE
-        unsigned SrcReg = MI->getOperand(1).getReg();
-        bool isKill = MI->getOperand(1).isKill();
-        bool isUndef = MI->getOperand(2).isUndef();
-        NewMI = BuildMI(MF, MI->getDebugLoc(), get(StoreOpc))
-          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
-          .addImm(0).addFrameIndex(FI) ;
-      } else {              // COPY -> LOAD
-        unsigned DstReg = MI->getOperand(0).getReg();
-        bool isDead = MI->getOperand(0).isDead();
-        bool isUndef = MI->getOperand(0).isUndef();
-        NewMI = BuildMI(MF, MI->getDebugLoc(), get(LoadOpc))
-          .addReg(DstReg, RegState::Define | getDeadRegState(isDead) |
-                  getUndefRegState(isUndef))
-          .addImm(0).addFrameIndex(FI);
-      }
-    }
-    break;
-  }
-
-  return NewMI;
-}
-
 //===----------------------------------------------------------------------===//
 // Branch Analysis
 //===----------------------------------------------------------------------===//
@@ -520,9 +460,8 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 unsigned MipsInstrInfo::
 InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
              MachineBasicBlock *FBB,
-             const SmallVectorImpl<MachineOperand> &Cond) const {
-  // FIXME this should probably have a DebugLoc argument
-  DebugLoc dl;
+             const SmallVectorImpl<MachineOperand> &Cond,
+             DebugLoc DL) const {
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
   assert((Cond.size() == 3 || Cond.size() == 2 || Cond.size() == 0) &&
@@ -531,18 +470,18 @@ InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
 
   if (FBB == 0) { // One way branch.
     if (Cond.empty()) {
       // Unconditional branch?
-      BuildMI(&MBB, dl, get(Mips::J)).addMBB(TBB);
+      BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB);
     } else {
       // Conditional branch.
       unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm());
       const TargetInstrDesc &TID = get(Opc);
 
       if (TID.getNumOperands() == 3)
-        BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+        BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg())
                           .addReg(Cond[2].getReg())
                           .addMBB(TBB);
       else
-        BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+        BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg())
                           .addMBB(TBB);
 
@@ -554,12 +493,12 @@ InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   const TargetInstrDesc &TID = get(Opc);
 
   if (TID.getNumOperands() == 3)
-    BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg())
+    BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg())
       .addMBB(TBB);
   else
-    BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addMBB(TBB);
+    BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addMBB(TBB);
 
-  BuildMI(&MBB, dl, get(Mips::J)).addMBB(FBB);
+  BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB);
 
   return 2;
 }
@@ -621,12 +560,8 @@ unsigned MipsInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
     const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
 
     GlobalBaseReg = RegInfo.createVirtualRegister(Mips::CPURegsRegisterClass);
-    bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, Mips::GP,
-                                Mips::CPURegsRegisterClass,
-                                Mips::CPURegsRegisterClass,
-                                DebugLoc());
-    assert(Ok && "Couldn't assign to global base register!");
-    Ok = Ok; // Silence warning when assertions are turned off.
+    BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
+            GlobalBaseReg).addReg(Mips::GP);
     RegInfo.addLiveIn(Mips::GP);
 
     MipsFI->setGlobalBaseReg(GlobalBaseReg);
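A sketch of the idiom the getGlobalBaseReg change adopts (illustrative only; the helper name and setup are assumptions): before register allocation, the generic COPY pseudo can stand in for a target copy instruction and is expanded via copyPhysReg later, so no target hook or result check is needed at build time.

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical helper: copy a physical register into a fresh virtual
// register using the generic COPY opcode.
static unsigned copyIntoVReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator IP,
                             const TargetInstrInfo &TII,
                             MachineRegisterInfo &MRI,
                             const TargetRegisterClass *RC, unsigned PhysReg) {
  unsigned VReg = MRI.createVirtualRegister(RC);
  BuildMI(MBB, IP, DebugLoc(), TII.get(TargetOpcode::COPY), VReg)
    .addReg(PhysReg);
  return VReg;
}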
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 7919d9a..d6f87f9 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -204,13 +204,12 @@ public:
   virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
   virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                 MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond) const;
-  virtual bool copyRegToReg(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator I,
-                            unsigned DestReg, unsigned SrcReg,
-                            const TargetRegisterClass *DestRC,
-                            const TargetRegisterClass *SrcRC,
-                            DebugLoc DL) const;
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
   virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    unsigned SrcReg, bool isKill, int FrameIndex,
@@ -223,18 +222,6 @@ public:
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
 
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl<unsigned> &Ops,
-                                              int FrameIndex) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl<unsigned> &Ops,
-                                              MachineInstr* LoadMI) const {
-    return 0;
-  }
-
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 2b9e941..5337c9f 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -541,7 +541,7 @@ let Predicates = [HasSwap] in {
 def MIPS_CMOV_ZERO  : PatLeaf<(i32 0)>;
 def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
 
-let Predicates = [HasCondMov], isTwoAddress = 1 in {
+let Predicates = [HasCondMov], Constraints = "$F = $dst" in {
   def MOVN : CondMov<0x0a, "movn", MIPS_CMOV_NZERO>;
   def MOVZ : CondMov<0x0b, "movz", MIPS_CMOV_ZERO>;
 }
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 5e719af..e15f0a5 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -116,34 +116,6 @@ getCalleeSavedRegs(const MachineFunction *MF) const
     return BitMode32CalleeSavedRegs;
 }
 
-/// Mips Callee Saved Register Classes
-const TargetRegisterClass* const*
-MipsRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
-{
-  static const TargetRegisterClass * const SingleFloatOnlyCalleeSavedRC[] = {
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, 0
-  };
-
-  static const TargetRegisterClass * const BitMode32CalleeSavedRC[] = {
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass,
-    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 0
-  };
-
-  if (Subtarget.isSingleFloat())
-    return SingleFloatOnlyCalleeSavedRC;
-  else
-    return BitMode32CalleeSavedRC;
-}
-
 BitVector MipsRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const
 {
@@ -279,7 +251,8 @@ void MipsRegisterInfo::adjustMipsStackFrame(MachineFunction &MF) const
   StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
 
   for (unsigned i = 0, e = CSI.size(); i != e ; ++i) {
-    if (CSI[i].getRegClass() != Mips::CPURegsRegisterClass)
+    unsigned Reg = CSI[i].getReg();
+    if (!Mips::CPURegsRegisterClass->contains(Reg))
       break;
     MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
     TopCPUSavedRegOff = StackOffset;
@@ -311,7 +284,8 @@ void MipsRegisterInfo::adjustMipsStackFrame(MachineFunction &MF) const
   // Adjust FPU Callee Saved Registers Area. This Area must be
   // aligned to the default Stack Alignment requirements.
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+    unsigned Reg = CSI[i].getReg();
+    if (Mips::CPURegsRegisterClass->contains(Reg))
       continue;
     MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
     TopFPUSavedRegOff = StackOffset;
@@ -528,4 +502,3 @@ getDwarfRegNum(unsigned RegNum, bool isEH) const {
 }
 
 #include "MipsGenRegisterInfo.inc"
-
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index bc857b8..b500a65 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -42,9 +42,6 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
   /// Code Generation virtual methods...
   const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
 
-  const TargetRegisterClass* const*
-  getCalleeSavedRegClasses(const MachineFunction* MF = 0) const;
-
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
   bool hasFP(const MachineFunction &MF) const;
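A sketch of the replacement pattern used throughout the register-info files above (the loop and helper are assumed): with the parallel getCalleeSavedRegClasses tables and CalleeSavedInfo::getRegClass gone, class membership of a saved register is answered from the register number itself.

#include "llvm/CodeGen/MachineFrameInfo.h"   // CalleeSavedInfo
#include "llvm/Target/TargetRegisterInfo.h"
#include <vector>
using namespace llvm;

// Hypothetical scan over callee-saved info.  'IntRC' stands in for a
// target's integer register class (e.g. Mips::CPURegsRegClass above).
static unsigned countIntSaves(const std::vector<CalleeSavedInfo> &CSI,
                              const TargetRegisterClass &IntRC) {
  unsigned N = 0;
  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
    if (IntRC.contains(CSI[i].getReg()))  // class derived from the reg
      ++N;
  return N;
}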
+1186,7 @@ LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); for (unsigned i=0, Offset = 0; i<NumOps; i++) { // Get the argument - Arg = Outs[i].Val; + Arg = OutVals[i]; StoreOffset = (Offset + AddressOffset); // Store the argument on frame @@ -1282,6 +1285,7 @@ SDValue PIC16TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // Number of values to return @@ -1298,7 +1302,7 @@ PIC16TargetLowering::LowerReturn(SDValue Chain, SDValue BS = DAG.getConstant(1, MVT::i8); SDValue RetVal; for(unsigned i=0;i<NumRet; ++i) { - RetVal = Outs[i].Val; + RetVal = OutVals[i]; Chain = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain, RetVal, ES, BS, DAG.getConstant (i, MVT::i8)); @@ -1374,6 +1378,7 @@ PIC16TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -1428,7 +1433,7 @@ PIC16TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Considering the GlobalAddressNode case here. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, MVT::i8); + Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i8); Name = G->getGlobal()->getName(); } else {// Considering the ExternalSymbol case here ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Callee); @@ -1461,12 +1466,13 @@ PIC16TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SDValue CallArgs; if (IsDirectCall) { CallArgs = LowerDirectCallArguments(ArgLabel, Chain, OperFlag, - Outs, dl, DAG); + Outs, OutVals, dl, DAG); Chain = getChain(CallArgs); OperFlag = getOutFlag(CallArgs); } else { CallArgs = LowerIndirectCallArguments(Chain, OperFlag, DataAddr_Lo, - DataAddr_Hi, Outs, Ins, dl, DAG); + DataAddr_Hi, Outs, OutVals, Ins, + dl, DAG); Chain = getChain(CallArgs); OperFlag = getOutFlag(CallArgs); } @@ -1791,14 +1797,14 @@ static PIC16CC::CondCodes IntCCToPIC16CC(ISD::CondCode CC) { static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, unsigned &SPCC) { if (isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + cast<ConstantSDNode>(RHS)->isNullValue() && CC == ISD::SETNE && (LHS.getOpcode() == PIC16ISD::SELECT_ICC && LHS.getOperand(3).getOpcode() == PIC16ISD::SUBCC) && isa<ConstantSDNode>(LHS.getOperand(0)) && isa<ConstantSDNode>(LHS.getOperand(1)) && - cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 && - cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) { + cast<ConstantSDNode>(LHS.getOperand(0))->isOne() && + cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) { SDValue CMPCC = LHS.getOperand(3); SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue(); LHS = CMPCC.getOperand(0); @@ -1928,15 +1934,12 @@ PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. 
- for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); @@ -1953,11 +1956,12 @@ PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII.get(PIC16::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII.get(PIC16::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } diff --git a/lib/Target/PIC16/PIC16ISelLowering.h b/lib/Target/PIC16/PIC16ISelLowering.h index eea17f8..0a7506c 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.h +++ b/lib/Target/PIC16/PIC16ISelLowering.h @@ -106,12 +106,14 @@ namespace llvm { SDValue LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerIndirectCallArguments(SDValue Chain, SDValue InFlag, SDValue DataAddr_Lo, SDValue DataAddr_Hi, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG) const; @@ -143,6 +145,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -151,6 +154,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue ExpandStore(SDNode *N, SelectionDAG &DAG) const; diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp index 793dd9f..e784f74 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.cpp +++ b/lib/Target/PIC16/PIC16InstrInfo.cpp @@ -151,25 +151,20 @@ void PIC16InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't load this register from stack slot"); } -bool PIC16InstrInfo::copyRegToReg (MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - - if (DestRC == PIC16::FSR16RegisterClass) { - BuildMI(MBB, I, DL, get(PIC16::copy_fsr), DestReg).addReg(SrcReg); - return true; - } - - if (DestRC == PIC16::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(PIC16::copy_w), DestReg).addReg(SrcReg); - return true; - } +void PIC16InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + 
unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned Opc; + if (PIC16::FSR16RegClass.contains(DestReg, SrcReg)) + Opc = PIC16::copy_fsr; + else if (PIC16::GPRRegClass.contains(DestReg, SrcReg)) + Opc = PIC16::copy_w; + else + llvm_unreachable("Impossible reg-to-reg copy"); - // Not yet supported. - return false; + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } bool PIC16InstrInfo::isMoveInstr(const MachineInstr &MI, @@ -196,15 +191,15 @@ bool PIC16InstrInfo::isMoveInstr(const MachineInstr &MI, unsigned PIC16InstrInfo:: InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); if (FBB == 0) { // One way branch. if (Cond.empty()) { // Unconditional branch? - DebugLoc dl; - BuildMI(&MBB, dl, get(PIC16::br_uncond)).addMBB(TBB); + BuildMI(&MBB, DL, get(PIC16::br_uncond)).addMBB(TBB); } return 1; } diff --git a/lib/Target/PIC16/PIC16InstrInfo.h b/lib/Target/PIC16/PIC16InstrInfo.h index 40a4cb4..a3a77f1 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.h +++ b/lib/Target/PIC16/PIC16InstrInfo.h @@ -57,12 +57,10 @@ public: unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SrcSubIdx, unsigned &DstSubIdx) const; @@ -70,7 +68,8 @@ public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, diff --git a/lib/Target/PIC16/PIC16InstrInfo.td b/lib/Target/PIC16/PIC16InstrInfo.td index 24df251..86d36cb 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.td +++ b/lib/Target/PIC16/PIC16InstrInfo.td @@ -134,7 +134,7 @@ include "PIC16InstrFormats.td" //===----------------------------------------------------------------------===// // W = W Op F : Load the value from F and do Op to W. -let isTwoAddress = 1, mayLoad = 1 in +let Constraints = "$src = $dst", mayLoad = 1 in class BinOpFW<bits<6> OpCode, string OpcStr, SDNode OpNode>: ByteFormat<OpCode, (outs GPR:$dst), (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi), @@ -146,7 +146,7 @@ class BinOpFW<bits<6> OpCode, string OpcStr, SDNode OpNode>: // F = F Op W : Load the value from F, do op with W and store in F. // This insn class is not marked as TwoAddress because the reg is // being used as a source operand only. (Remember a TwoAddress insn -// needs a copyRegToReg.) +// needs a copy.) let mayStore = 1 in class BinOpWF<bits<6> OpCode, string OpcStr, SDNode OpNode>: ByteFormat<OpCode, (outs), @@ -160,7 +160,7 @@ class BinOpWF<bits<6> OpCode, string OpcStr, SDNode OpNode>: )]>; // W = W Op L : Do Op of L with W and place result in W. 
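[Editorial note] A pattern worth calling out here, since it repeats for Mips above and for PowerPC further down: the fallible copyRegToReg hook is replaced by copyPhysReg, which takes an explicit kill flag and is not allowed to fail. A minimal sketch of the new hook's shape, using an invented target named XYZ (the opcode and register-class names are illustrative, not from this patch):

    void XYZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   unsigned DestReg, unsigned SrcReg,
                                   bool KillSrc) const {
      // Select the copy opcode from the classes of the physical registers;
      // contains(DestReg, SrcReg) checks both registers in one call.
      unsigned Opc;
      if (XYZ::GPRRegClass.contains(DestReg, SrcReg))
        Opc = XYZ::MOVrr;
      else if (XYZ::FPRRegClass.contains(DestReg, SrcReg))
        Opc = XYZ::FMOVrr;
      else
        llvm_unreachable("Impossible reg-to-reg copy");

      // No failure path: an unsupported copy is a backend bug. The caller's
      // kill state is forwarded onto the new instruction.
      BuildMI(MBB, I, DL, get(Opc), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }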
-let isTwoAddress = 1 in +let Constraints = "$src = $dst" in class BinOpWL<bits<6> opcode, string OpcStr, SDNode OpNode> : LiteralFormat<opcode, (outs GPR:$dst), (ins GPR:$src, i8imm:$literal), @@ -220,7 +220,7 @@ def set_fsrlo: "movwf ${fsr}L", []>; -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def set_fsrhi: ByteFormat<0, (outs FSR16:$dst), (ins FSR16:$src, GPR:$val), @@ -234,8 +234,8 @@ def set_pclath: [(set PCLATHR:$dst , (MTPCLATH GPR:$val))]>; //---------------------------- -// copyRegToReg -// copyRegToReg insns. These are dummy. They should always be deleted +// copyPhysReg +// copyPhysReg insns. These are dummy. They should always be deleted // by the optimizer and never be present in the final generated code. // if they are, then we have to write correct macros for these insns. //---------------------------- @@ -362,7 +362,7 @@ def addwfc: BinOpWF<0, "addwfc", adde>; // With Carry. } // W -= [F] ; load from F and sub the value from W. -let isTwoAddress = 1, mayLoad = 1 in +let Constraints = "$src = $dst", mayLoad = 1 in class SUBFW<bits<6> OpCode, string OpcStr, SDNode OpNode>: ByteFormat<OpCode, (outs GPR:$dst), (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi), @@ -418,7 +418,7 @@ def orlw : BinOpWL<0, "iorlw", or>; // sublw // W = C - W ; sub W from literal. (Without borrow). -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in class SUBLW<bits<6> opcode, string OpcStr, SDNode OpNode> : LiteralFormat<opcode, (outs GPR:$dst), (ins GPR:$src, i8imm:$literal), @@ -426,7 +426,7 @@ class SUBLW<bits<6> opcode, string OpcStr, SDNode OpNode> : [(set GPR:$dst, (OpNode (i8 imm:$literal), GPR:$src))]>; // subwl // W = W - C ; sub literal from W (Without borrow). -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in class SUBWL<bits<6> opcode, string OpcStr, SDNode OpNode> : LiteralFormat<opcode, (outs GPR:$dst), (ins GPR:$src, i8imm:$literal), diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp index ab81ed1..241170b 100644 --- a/lib/Target/PIC16/PIC16MemSelOpt.cpp +++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp @@ -117,7 +117,7 @@ bool MemSelOpt::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { DebugLoc dl = I->getDebugLoc(); BuildMI(*MBB, I, dl, TII->get(PIC16::pagesel)).addExternalSymbol("$"); Changed = true; - PageChanged = 0; + PageChanged = 0; } } } diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp index c282521..27f1cf5 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp @@ -150,8 +150,8 @@ void PIC16Cloner::markCallGraph(CallGraphNode *CGN, string StringMark) { // For PIC16, automatic variables of a function are emitted as globals. -// Clone the auto variables of a function and put them in ValueMap, -// this ValueMap will be used while +// Clone the auto variables of a function and put them in VMap, +// this VMap will be used while // Cloning the code of function itself. // void PIC16Cloner::CloneAutos(Function *F) { @@ -160,11 +160,11 @@ void PIC16Cloner::CloneAutos(Function *F) { Module *M = F->getParent(); Module::GlobalListType &Globals = M->getGlobalList(); - // Clear the leftovers in ValueMap by any previous cloning. - ValueMap.clear(); + // Clear the leftovers in VMap by any previous cloning. + VMap.clear(); // Find the auto globals for this function and clone them, and put them - // in ValueMap. + // in VMap.
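[Editorial note] The isTwoAddress flag retired in the .td hunks above was a bare per-instruction marker; the Constraints string instead ties a named source operand to a named result operand, which is what the two-address handling actually needs. On the C++ side the tie is visible through the operand-constraint table TableGen emits; a hedged sketch, assuming operand 1 is the tied source as in the patterns above:

    #include "llvm/Target/TargetInstrDesc.h"
    using namespace llvm;

    // True if operand 1 ($src) is tied to def operand 0 ($dst), i.e. what
    // Constraints = "$src = $dst" expands to in the generated tables.
    static bool hasTiedSource(const TargetInstrDesc &TID) {
      return TID.getOperandConstraint(1, TOI::TIED_TO) == 0;
    }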
std::string FnName = F->getName().str(); std::string VarName, ClonedVarName; for (Module::global_iterator I = M->global_begin(), E = M->global_end(); @@ -182,8 +182,8 @@ void PIC16Cloner::CloneAutos(Function *F) { // Add these new globals to module's globals list. Globals.push_back(ClonedGV); - // Update ValueMap. - ValueMap[GV] = ClonedGV; + // Update VMap. + VMap[GV] = ClonedGV; } } } @@ -236,10 +236,10 @@ void PIC16Cloner::cloneSharedFunctions(CallGraphNode *CGN) { } // Clone the given function and return it. -// Note: it uses the ValueMap member of the class, which is already populated +// Note: it uses the VMap member of the class, which is already populated // by cloneAutos by the time we reach here. -// FIXME: Should we just pass ValueMap's ref as a parameter here? rather -// than keeping the ValueMap as a member. +// FIXME: Should we just pass VMap's ref as a parameter here? rather +// than keeping the VMap as a member. Function * PIC16Cloner::cloneFunction(Function *OrgF) { Function *ClonedF; @@ -252,11 +252,11 @@ PIC16Cloner::cloneFunction(Function *OrgF) { } // Clone does not exist. - // First clone the autos, and populate ValueMap. + // First clone the autos, and populate VMap. CloneAutos(OrgF); // Now create the clone. - ClonedF = CloneFunction(OrgF, ValueMap); + ClonedF = CloneFunction(OrgF, VMap); // The new function should be for interrupt line. Therefore should have // the name suffixed with IL and section attribute marked with IL. diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h index 24c1152..e8b5aa4 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h @@ -15,7 +15,7 @@ #ifndef PIC16CLONER_H #define PIC16CLONER_H -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ValueMap.h" using namespace llvm; using std::vector; @@ -72,7 +72,7 @@ namespace llvm { // the corresponding cloned auto variable of the cloned function. // This value map is passed during the function cloning so that all the // uses of auto variables be updated properly. - DenseMap<const Value*, Value*> ValueMap; + ValueMap<const Value*, Value*> VMap; // Map of already cloned functions.
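[Editorial note] The DenseMap-to-ValueMap switch in PIC16Cloner.h above is more than a rename: ValueMap installs callbacks on its key Values, so an entry whose key is later deleted or RAUW'd is dropped or retargeted instead of dangling. A small sketch of the usage pattern (the helper name is invented here):

    #include "llvm/ADT/ValueMap.h"
    #include "llvm/GlobalVariable.h"
    using namespace llvm;

    // Record a cloned auto-global under its original, as CloneAutos does.
    // If Orig is later RAUW'd or erased, the ValueMap entry stays coherent;
    // a plain DenseMap key would silently go stale.
    static void rememberClone(ValueMap<const Value*, Value*> &VMap,
                              const GlobalVariable *Orig,
                              GlobalVariable *Clone) {
      VMap[Orig] = Clone;
    }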
map<Function *, Function *> ClonedFunctionMap; diff --git a/lib/Target/PIC16/PIC16RegisterInfo.cpp b/lib/Target/PIC16/PIC16RegisterInfo.cpp index 30a1d4a..dff98d1 100644 --- a/lib/Target/PIC16/PIC16RegisterInfo.cpp +++ b/lib/Target/PIC16/PIC16RegisterInfo.cpp @@ -35,13 +35,6 @@ getCalleeSavedRegs(const MachineFunction *MF) const { return CalleeSavedRegs; } -// PIC16 Callee Saved Reg Classes -const TargetRegisterClass* const* -PIC16RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 }; - return CalleeSavedRegClasses; -} - BitVector PIC16RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); return Reserved; diff --git a/lib/Target/PIC16/PIC16RegisterInfo.h b/lib/Target/PIC16/PIC16RegisterInfo.h index 6a9a038..5536a61 100644 --- a/lib/Target/PIC16/PIC16RegisterInfo.h +++ b/lib/Target/PIC16/PIC16RegisterInfo.h @@ -41,10 +41,6 @@ class PIC16RegisterInfo : public PIC16GenRegisterInfo { virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF = 0) const; - // PIC16 callee saved register classes - virtual const TargetRegisterClass* const * - getCalleeSavedRegClasses(const MachineFunction *MF) const; - virtual BitVector getReservedRegs(const MachineFunction &MF) const; virtual bool hasFP(const MachineFunction &MF) const; diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 66dfd4b..db11fde 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -78,7 +78,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode, isLoad = TID.mayLoad(); isStore = TID.mayStore(); - unsigned TSFlags = TID.TSFlags; + uint64_t TSFlags = TID.TSFlags; isFirst = TSFlags & PPCII::PPC970_First; isSingle = TSFlags & PPCII::PPC970_Single; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 10b516a..d47d989 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1203,11 +1203,11 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); - const GlobalValue *GV = GSDN->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); - SDValue Zero = DAG.getConstant(0, PtrVT); // FIXME there isn't really any debug info here DebugLoc dl = GSDN->getDebugLoc(); + const GlobalValue *GV = GSDN->getGlobal(); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GSDN->getOffset()); + SDValue Zero = DAG.getConstant(0, PtrVT); const TargetMachine &TM = DAG.getTarget(); @@ -1631,7 +1631,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), - isImmutable, false); + isImmutable); // Create load nodes to retrieve arguments from the stack. 
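[Editorial note] From here on, many hunks shrink calls to MachineFrameInfo::CreateFixedObject by one argument: the trailing bool is gone, leaving size, offset, and immutability. A hedged before/after sketch (the dropped flag's meaning is not shown anywhere in this diff, so it is only echoed in the comment):

    #include "llvm/CodeGen/MachineFrameInfo.h"
    using namespace llvm;

    static int allocateFixedSlot(MachineFrameInfo *MFI, uint64_t Size,
                                 int64_t Offset, bool Immutable) {
      // Before: MFI->CreateFixedObject(Size, Offset, Immutable, false);
      // After the signature change, only three arguments remain:
      return MFI->CreateFixedObject(Size, Offset, Immutable);
    }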
SDValue FIN = DAG.getFrameIndex(FI, PtrVT); @@ -1700,8 +1700,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( FuncInfo->setVarArgsStackOffset( MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, - CCInfo.getNextStackOffset(), - true, false)); + CCInfo.getNextStackOffset(), true)); FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -1911,7 +1910,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( CurArgOffset = CurArgOffset + (4 - ObjSize); } // The value of the object is its address. - int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true, false); + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); if (ObjSize==1 || ObjSize==2) { @@ -1936,7 +1935,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( // the object. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); - int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true, false); + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0, @@ -2062,7 +2061,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( if (needsLoad) { int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset + (ArgSize - ObjSize), - isImmutable, false); + isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0); @@ -2097,7 +2096,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, - Depth, true, false)); + Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs @@ -2137,6 +2136,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, unsigned CC, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, unsigned &nAltivecParamsAtEnd) { // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is @@ -2153,9 +2153,9 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, // 16-byte aligned. nAltivecParamsAtEnd = 0; for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; - EVT ArgVT = Arg.getValueType(); + EVT ArgVT = Outs[i].VT; // Varargs Altivec parameters are padded to a 16 byte boundary. if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 || ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) { @@ -2314,8 +2314,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int NewRetAddrLoc = SPDiff + PPCFrameInfo::getReturnSaveOffset(isPPC64, isDarwinABI); int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize, - NewRetAddrLoc, - true, false); + NewRetAddrLoc, true); EVT VT = isPPC64 ? 
MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, @@ -2328,7 +2327,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int NewFPLoc = SPDiff + PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI); int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, - true, false); + true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, PseudoSourceValue::getFixedStack(NewFPIdx), 0, @@ -2346,7 +2345,7 @@ CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, SmallVector<TailCallArgumentInfo, 8>& TailCallArguments) { int Offset = ArgOffset + SPDiff; uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true,false); + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue FIN = DAG.getFrameIndex(FI, VT); TailCallArgumentInfo Info; @@ -2472,7 +2471,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), Callee.getValueType()); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType()); else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType()); else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) @@ -2705,6 +2705,7 @@ PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -2714,11 +2715,11 @@ PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) { return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, Ins, + isTailCall, Outs, OutVals, Ins, dl, DAG, InVals); } else { return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, - isTailCall, Outs, Ins, + isTailCall, Outs, OutVals, Ins, dl, DAG, InVals); } } @@ -2728,6 +2729,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -2737,7 +2739,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) && "Unknown calling convention!"); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); unsigned PtrByteSize = 4; MachineFunction &MF = DAG.getMachineFunction(); @@ -2769,7 +2770,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - EVT ArgVT = Outs[i].Val.getValueType(); + EVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; @@ -2838,7 +2839,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, 
i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isByVal()) { @@ -2934,6 +2935,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -2961,7 +2963,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // prereserved space for [SP][CR][LR][3 x unused]. unsigned NumBytes = CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv, - Outs, + Outs, OutVals, nAltivecParamsAtEnd); // Calculate by how many bytes the stack has to be adjusted in case of tail @@ -3025,7 +3027,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // PtrOff will be used to store the current argument to the stack if a @@ -3051,7 +3053,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // Everything else is passed left-justified. EVT VT = (Size==1) ? MVT::i8 : MVT::i16; if (GPR_idx != NumGPRs) { - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, PtrVT, dl, Chain, Arg, NULL, 0, VT, false, false, 0); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); @@ -3228,8 +3230,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, ArgOffset = ((ArgOffset+15)/16)*16; ArgOffset += 12*16; for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = Outs[i].Val; - EVT ArgType = Arg.getValueType(); + SDValue Arg = OutVals[i]; + EVT ArgType = Outs[i].VT; if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { if (++j > NumVRs) { @@ -3297,6 +3299,7 @@ SDValue PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { SmallVector<CCValAssign, 16> RVLocs; @@ -3318,7 +3321,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); Flag = Chain.getValue(1); } @@ -3376,8 +3379,7 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { // Find out what the fix offset of the frame pointer save area. int LROffset = PPCFrameInfo::getReturnSaveOffset(isPPC64, isDarwinABI); // Allocate the frame index for frame pointer save area. - RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, - true, false); + RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true); // Save the result. FI->setReturnAddrSaveIndex(RASI); } @@ -3403,8 +3405,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { isDarwinABI); // Allocate the frame index for frame pointer save area. - FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, - true, false); + FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. 
FI->setFramePointerSaveIndex(FPSI); } @@ -4518,7 +4519,10 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); F->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned TmpReg = (!BinOpcode) ? incr : @@ -4583,7 +4587,10 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); F->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = @@ -4716,23 +4723,22 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); unsigned SelectPred = MI->getOperand(4).getImm(); DebugLoc dl = MI->getDebugLoc(); - BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -4745,7 +4751,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII->get(PPC::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII->get(PPC::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); } @@ -4831,7 +4838,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... 
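[Editorial note] Every custom inserter touched in this patch converges on the same CFG surgery: instead of copying and then clearing the successor list by hand, the tail of the current block is spliced into the continuation block and a helper rewrites the PHIs. In outline (a sketch of the shared idiom, not a complete inserter):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/ADT/STLExtras.h"
    using namespace llvm;

    static void splitAroundPseudo(MachineInstr *MI, MachineBasicBlock *BB,
                                  MachineBasicBlock *sinkMBB) {
      // Move everything after the pseudo instruction into sinkMBB...
      sinkMBB->splice(sinkMBB->begin(), BB,
                      llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
      // ...then hand over BB's successor edges, fixing up PHIs in one step.
      sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
      // The new edges out of BB (e.g. to copy0MBB and sinkMBB) are added by
      // the caller afterwards.
    }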
@@ -4899,7 +4909,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = @@ -5025,7 +5038,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, llvm_unreachable("Unexpected instr type to insert"); } - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -5042,19 +5055,19 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, default: break; case PPCISD::SHL: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->getZExtValue() == 0) // 0 << V -> 0. + if (C->isNullValue()) // 0 << V -> 0. return N->getOperand(0); } break; case PPCISD::SRL: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->getZExtValue() == 0) // 0 >>u V -> 0. + if (C->isNullValue()) // 0 >>u V -> 0. return N->getOperand(0); } break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->getZExtValue() == 0 || // 0 >>s V -> 0. + if (C->isNullValue() || // 0 >>s V -> 0. C->isAllOnesValue()) // -1 >>s V -> -1. return N->getOperand(0); } @@ -5380,11 +5393,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. If hasMemory is true -/// it means one of the asm constraint of the inline asm instruction being -/// processed is 'm'. +/// vector. If it is invalid, don't add anything to Ops. void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter, - bool hasMemory, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0,0); @@ -5443,7 +5453,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter, } // Handle standard constraint letters. - TargetLowering::LowerAsmOperandForConstraint(Op, Letter, hasMemory, Ops, DAG); + TargetLowering::LowerAsmOperandForConstraint(Op, Letter, Ops, DAG); } // isLegalAddressingMode - Return true if the addressing mode represented diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 6dcaf1e..700816f 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -318,12 +318,9 @@ namespace llvm { unsigned getByValTypeAlignment(const Type *Ty) const; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops - /// vector. If it is invalid, don't add anything to Ops. If hasMemory is - /// true it means one of the asm constraint of the inline asm instruction - /// being processed is 'm'. + /// vector. If it is invalid, don't add anything to Ops. 
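[Editorial note] The getZExtValue() == 0 comparisons replaced in PerformDAGCombine above are not merely restyled: APInt's getZExtValue() asserts once the constant's value no longer fits in 64 bits, while isNullValue(), isOne(), and isAllOnesValue() are width-independent. Sketch:

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    static bool isZeroShiftOperand(const ConstantSDNode *C) {
      return C->isNullValue();           // fine for i128 and wider constants
      // return C->getZExtValue() == 0;  // can assert for wide nonzero values
    }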
virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -438,6 +435,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -446,6 +444,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue @@ -465,6 +464,7 @@ namespace llvm { LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -472,6 +472,7 @@ namespace llvm { LowerCall_SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 1b7a778..1574aa3 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -316,9 +316,8 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 2 || Cond.size() == 0) && @@ -327,50 +326,46 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, // One-way branch. if (FBB == 0) { if (Cond.empty()) // Unconditional branch - BuildMI(&MBB, dl, get(PPC::B)).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); else // Conditional branch - BuildMI(&MBB, dl, get(PPC::BCC)) + BuildMI(&MBB, DL, get(PPC::BCC)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); return 1; } // Two-way Conditional Branch. - BuildMI(&MBB, dl, get(PPC::BCC)) + BuildMI(&MBB, DL, get(PPC::BCC)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); - BuildMI(&MBB, dl, get(PPC::B)).addMBB(FBB); + BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); return 2; } -bool PPCInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC != SrcRC) { - // Not yet supported! 
- return false; - } - - if (DestRC == PPC::GPRCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg); - } else if (DestRC == PPC::G8RCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg); - } else if (DestRC == PPC::F4RCRegisterClass || - DestRC == PPC::F8RCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::FMR), DestReg).addReg(SrcReg); - } else if (DestRC == PPC::CRRCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::MCRF), DestReg).addReg(SrcReg); - } else if (DestRC == PPC::VRRCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::VOR), DestReg).addReg(SrcReg).addReg(SrcReg); - } else if (DestRC == PPC::CRBITRCRegisterClass) { - BuildMI(MBB, MI, DL, get(PPC::CROR), DestReg).addReg(SrcReg).addReg(SrcReg); - } else { - // Attempt to copy register that is not GPR or FPR - return false; - } - - return true; +void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned Opc; + if (PPC::GPRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::OR; + else if (PPC::G8RCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::OR8; + else if (PPC::F4RCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::FMR; + else if (PPC::CRRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::MCRF; + else if (PPC::VRRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::VOR; + else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::CROR; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + const TargetInstrDesc &TID = get(Opc); + if (TID.getNumOperands() == 3) + BuildMI(MBB, I, DL, TID, DestReg) + .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc)); + else + BuildMI(MBB, I, DL, TID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); } bool @@ -654,121 +649,6 @@ PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, return &*MIB; } -/// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into -/// copy instructions, turning them into load/store instructions. -MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { - if (Ops.size() != 1) return NULL; - - // Make sure this is a reg-reg copy. Note that we can't handle MCRF, because - // it takes more than one instruction to store it. 
- unsigned Opc = MI->getOpcode(); - unsigned OpNum = Ops[0]; - - MachineInstr *NewMI = NULL; - if ((Opc == PPC::OR && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) { - if (OpNum == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STW)) - .addReg(InReg, - getKillRegState(isKill) | - getUndefRegState(isUndef)), - FrameIndex); - } else { // move -> load - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LWZ)) - .addReg(OutReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)), - FrameIndex); - } - } else if ((Opc == PPC::OR8 && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) { - if (OpNum == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STD)) - .addReg(InReg, - getKillRegState(isKill) | - getUndefRegState(isUndef)), - FrameIndex); - } else { // move -> load - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LD)) - .addReg(OutReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)), - FrameIndex); - } - } else if (Opc == PPC::FMR || Opc == PPC::FMRSD) { - // The register may be F4RC or F8RC, and that determines the memory op. - unsigned OrigReg = MI->getOperand(OpNum).getReg(); - // We cannot tell the register class from a physreg alone. - if (TargetRegisterInfo::isPhysicalRegister(OrigReg)) - return NULL; - const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(OrigReg); - const bool is64 = RC == PPC::F8RCRegisterClass; - - if (OpNum == 0) { // move -> store - unsigned InReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), - get(is64 ? PPC::STFD : PPC::STFS)) - .addReg(InReg, - getKillRegState(isKill) | - getUndefRegState(isUndef)), - FrameIndex); - } else { // move -> load - unsigned OutReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), - get(is64 ? PPC::LFD : PPC::LFS)) - .addReg(OutReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef)), - FrameIndex); - } - } - - return NewMI; -} - -bool PPCInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - // Make sure this is a reg-reg copy. Note that we can't handle MCRF, because - // it takes more than one instruction to store it. 
- unsigned Opc = MI->getOpcode(); - - if ((Opc == PPC::OR && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) - return true; - else if ((Opc == PPC::OR8 && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) - return true; - else if (Opc == PPC::FMR || Opc == PPC::FMRSD) - return true; - - return false; -} - - bool PPCInstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 7a9e11b..eadb21e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -109,13 +109,12 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -135,23 +134,6 @@ public: const MDNode *MDPtr, DebugLoc DL) const; - /// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into - /// copy instructions, turning them into load/store instructions. - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } - - virtual bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 0ff852c..4d6132a9 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -269,140 +269,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegs : SVR4_CalleeSavedRegs; } -const TargetRegisterClass* const* -PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - // 32-bit Darwin calling convention. 
- static const TargetRegisterClass * const Darwin32_CalleeSavedRegClasses[] = { - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass, - - &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, - - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - - &PPC::GPRCRegClass, 0 - }; - - // 32-bit SVR4 calling convention. - static const TargetRegisterClass * const SVR4_CalleeSavedRegClasses[] = { - &PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, - - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass, - - &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, - - &PPC::VRSAVERCRegClass, - - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - - 0 - }; - - // 64-bit Darwin calling convention. 
- static const TargetRegisterClass * const Darwin64_CalleeSavedRegClasses[] = { - &PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass, - - &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, - - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - - &PPC::G8RCRegClass, 0 - }; - - // 64-bit SVR4 calling convention. - static const TargetRegisterClass * const SVR4_64_CalleeSavedRegClasses[] = { - &PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, - - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, - &PPC::F8RCRegClass,&PPC::F8RCRegClass, - - &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, - - &PPC::VRSAVERCRegClass, - - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, - - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass, - &PPC::CRBITRCRegClass, - - 0 - }; - - if (Subtarget.isDarwinABI()) - return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegClasses : - Darwin32_CalleeSavedRegClasses; - - return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegClasses - : SVR4_CalleeSavedRegClasses; -} - // needsFP - Return true if the specified function should have a dedicated frame // pointer register. This is true if the function has variable sized allocas or // if frame pointer elimination is disabled. @@ -1060,8 +926,7 @@ PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI); // Allocate the frame index for frame pointer save area. 
- FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, - true, false); + FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. FI->setFramePointerSaveIndex(FPSI); } @@ -1069,8 +934,7 @@ PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Reserve stack space to move the linkage area to in case of a tail call. int TCSPDelta = 0; if (GuaranteedTailCallOpt && (TCSPDelta = FI->getTailCallSPDelta()) < 0) { - MF.getFrameInfo()->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, - true, false); + MF.getFrameInfo()->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); } // Reserve a slot closest to SP or frame pointer if we have a dynalloc or @@ -1127,9 +991,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RC = CSI[i].getRegClass(); - - if (RC == PPC::GPRCRegisterClass) { + if (PPC::GPRCRegisterClass->contains(Reg)) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -1137,7 +999,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinGPR) { MinGPR = Reg; } - } else if (RC == PPC::G8RCRegisterClass) { + } else if (PPC::G8RCRegisterClass->contains(Reg)) { HasG8SaveArea = true; G8Regs.push_back(CSI[i]); @@ -1145,7 +1007,7 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinG8R) { MinG8R = Reg; } - } else if (RC == PPC::F8RCRegisterClass) { + } else if (PPC::F8RCRegisterClass->contains(Reg)) { HasFPSaveArea = true; FPRegs.push_back(CSI[i]); @@ -1154,12 +1016,12 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) MinFPR = Reg; } // FIXME SVR4: Disable CR save area for now. - } else if ( RC == PPC::CRBITRCRegisterClass - || RC == PPC::CRRCRegisterClass) { + } else if (PPC::CRBITRCRegisterClass->contains(Reg) + || PPC::CRRCRegisterClass->contains(Reg)) { // HasCRSaveArea = true; - } else if (RC == PPC::VRSAVERCRegisterClass) { + } else if (PPC::VRSAVERCRegisterClass->contains(Reg)) { HasVRSAVESaveArea = true; - } else if (RC == PPC::VRRCRegisterClass) { + } else if (PPC::VRRCRegisterClass->contains(Reg)) { HasVRSaveArea = true; VRegs.push_back(CSI[i]); @@ -1240,9 +1102,10 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) // which have the CR/CRBIT register class? // Adjust the frame index of the CR spill slot. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - const TargetRegisterClass *RC = CSI[i].getRegClass(); + unsigned Reg = CSI[i].getReg(); - if (RC == PPC::CRBITRCRegisterClass || RC == PPC::CRRCRegisterClass) { + if (PPC::CRBITRCRegisterClass->contains(Reg) || + PPC::CRRCRegisterClass->contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -1257,9 +1120,9 @@ PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) // which have the VRSAVE register class? // Adjust the frame index of the VRSAVE spill slot. 
for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - const TargetRegisterClass *RC = CSI[i].getRegClass(); + unsigned Reg = CSI[i].getReg(); - if (RC == PPC::VRSAVERCRegisterClass) { + if (PPC::VRSAVERCRegisterClass->contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -1762,4 +1625,3 @@ int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { } #include "PPCGenRegisterInfo.inc" - diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 43cf535..f026847 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -42,9 +42,6 @@ public: /// Code Generation virtual methods... const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; /// targetHandlesStackFrameRounding - Returns true if the target is diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 7fa73ed..4d7ee08 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -300,6 +300,14 @@ unsigned long reverse(unsigned v) { return v ^ (t >> 8); } +Neither is this (very standard idiom): + +int f(int n) +{ + return (((n) << 24) | (((n) & 0xff00) << 8) + | (((n) >> 8) & 0xff00) | ((n) >> 24)); +} + //===---------------------------------------------------------------------===// [LOOP RECOGNITION] @@ -898,17 +906,6 @@ The expression should optimize to something like //===---------------------------------------------------------------------===// -From GCC Bug 3756: -int -pn (int n) -{ - return (n >= 0 ? 1 : -1); -} -Should combine to (n >> 31) | 1. Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts | llc". - -//===---------------------------------------------------------------------===// - void a(int variable) { if (variable == 4 || variable == 6) @@ -1439,33 +1436,6 @@ This pattern repeats several times, basically doing: //===---------------------------------------------------------------------===// -186.crafty contains this interesting pattern: - -%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0), - i8* %30) -%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0) -br i1 %phitmp648, label %bb70, label %bb76 - -bb70: ; preds = %OptionMatch.exit91, %bb69 - %78 = call i32 @strlen(i8* %30) nounwind readonly align 1 ; <i32> [#uses=1] - -This is basically: - cststr = "abcdef"; - if (strstr(cststr, P) == cststr) { - x = strlen(P); - ... - -The strstr call would be significantly cheaper written as: - -cststr = "abcdef"; -if (memcmp(P, str, strlen(P))) - x = strlen(P); - -This is memcmp+strlen instead of strstr. This also makes the strlen fully -redundant. - -//===---------------------------------------------------------------------===// - 186.crafty also contains this code: %1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0)) @@ -1863,3 +1833,91 @@ LLVM prefers comparisons with zero over non-zero in general, but in this case it chooses instead to keep the max operation obvious.
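[Editorial note] The "very standard idiom" added to README.txt just above is a 32-bit byte swap, so recognizing it would let each backend emit its single-instruction swap. As a hedged aside (not part of the patch), the builtin spelling of the same operation, which GCC and Clang already lower to one bswap on x86:

    unsigned f(unsigned n) {
      /* Same result as the shift-and-mask expression in the README entry. */
      return __builtin_bswap32(n);
    }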
//===---------------------------------------------------------------------===// + +Take the following testcase on x86-64 (similar testcases exist for all targets +with addc/adde): + +define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, +i64 %c) nounwind { +entry: + %0 = zext i64 %a to i128 ; <i128> [#uses=1] + %1 = zext i64 %b to i128 ; <i128> [#uses=1] + %2 = add i128 %1, %0 ; <i128> [#uses=2] + %3 = zext i64 %c to i128 ; <i128> [#uses=1] + %4 = shl i128 %3, 64 ; <i128> [#uses=1] + %5 = add i128 %4, %2 ; <i128> [#uses=1] + %6 = lshr i128 %5, 64 ; <i128> [#uses=1] + %7 = trunc i128 %6 to i64 ; <i64> [#uses=1] + store i64 %7, i64* %s, align 8 + %8 = trunc i128 %2 to i64 ; <i64> [#uses=1] + store i64 %8, i64* %t, align 8 + ret void +} + +Generated code: + addq %rcx, %rdx + movl $0, %eax + adcq $0, %rax + addq %r8, %rax + movq %rax, (%rdi) + movq %rdx, (%rsi) + ret + +Expected code: + addq %rcx, %rdx + adcq $0, %r8 + movq %r8, (%rdi) + movq %rdx, (%rsi) + ret + +The generated SelectionDAG has an ADD of an ADDE, where both operands of the +ADDE are zero. Replacing one of the operands of the ADDE with the other operand +of the ADD, and replacing the ADD with the ADDE, should give the desired result. + +(That said, we are doing a lot better than gcc on this testcase. :) ) + +//===---------------------------------------------------------------------===// + +Switch lowering generates less than ideal code for the following switch: +define void @a(i32 %x) nounwind { +entry: + switch i32 %x, label %if.end [ + i32 0, label %if.then + i32 1, label %if.then + i32 2, label %if.then + i32 3, label %if.then + i32 5, label %if.then + ] +if.then: + tail call void @foo() nounwind + ret void +if.end: + ret void +} +declare void @foo() + +Generated code on x86-64 (other platforms give similar results): +a: + cmpl $5, %edi + ja .LBB0_2 + movl %edi, %eax + movl $47, %ecx + btq %rax, %rcx + jb .LBB0_3 +.LBB0_2: + ret +.LBB0_3: + jmp foo # TAILCALL + +The movl+movl+btq+jb could be simplified to a cmpl+jne. + +Or, if we wanted to be really clever, we could simplify the whole thing to +something like the following, which eliminates a branch: + xorl $1, %edi + cmpl $4, %edi + ja .LBB0_2 + ret +.LBB0_2: + jmp foo # TAILCALL + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index f47e53a..4099a62 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -38,6 +38,7 @@ SDValue SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to locations. @@ -66,7 +67,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain, assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); // Guarantee that all emitted copies are stuck together with flags. 
Flag = Chain.getValue(1); @@ -133,7 +134,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(Arg); } else { int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset, - true, false); + true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); SDValue Load; if (ObjectVT == MVT::i32) { @@ -146,7 +147,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Offset = 4-std::max(1U, ObjectVT.getSizeInBits()/8); FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr, DAG.getConstant(Offset, MVT::i32)); - Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr, + Load = DAG.getExtLoad(LoadOp, MVT::i32, dl, Chain, FIPtr, NULL, 0, ObjectVT, false, false, 0); Load = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Load); } @@ -169,7 +170,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(Arg); } else { int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset, - true, false); + true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); SDValue Load = DAG.getLoad(MVT::f32, dl, Chain, FIPtr, NULL, 0, false, false, 0); @@ -192,7 +193,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); } else { int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset, - true, false); + true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0, false, false, 0); @@ -205,7 +206,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, LoVal = DAG.getCopyFromReg(Chain, dl, VRegLo, MVT::i32); } else { int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4, - true, false); + true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0, false, false, 0); @@ -239,7 +240,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32); int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset, - true, false); + true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, NULL, 0, @@ -262,6 +263,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -283,7 +285,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Count the size of the outgoing arguments. unsigned ArgsSize = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - switch (Outs[i].Val.getValueType().getSimpleVT().SimpleTy) { + switch (Outs[i].VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unknown value type!"); case MVT::i1: case MVT::i8: @@ -316,7 +318,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. 
switch (VA.getLocInfo()) { @@ -358,8 +360,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, unsigned ArgOffset = 68; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - SDValue Val = Outs[i].Val; - EVT ObjectVT = Val.getValueType(); + SDValue Val = OutVals[i]; + EVT ObjectVT = Outs[i].VT; SDValue ValToStore(0, 0); unsigned ObjSize; switch (ObjectVT.getSimpleVT().SimpleTy) { @@ -478,7 +480,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32); @@ -737,7 +739,7 @@ void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, unsigned &SPCC) { if (isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + cast<ConstantSDNode>(RHS)->isNullValue() && CC == ISD::SETNE && ((LHS.getOpcode() == SPISD::SELECT_ICC && LHS.getOperand(3).getOpcode() == SPISD::CMPICC) || @@ -745,8 +747,8 @@ static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) && isa<ConstantSDNode>(LHS.getOperand(0)) && isa<ConstantSDNode>(LHS.getOperand(1)) && - cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 && - cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) { + cast<ConstantSDNode>(LHS.getOperand(0))->isOne() && + cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) { SDValue CMPCC = LHS.getOperand(3); SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue(); LHS = CMPCC.getOperand(0); @@ -759,7 +761,7 @@ SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); // FIXME there isn't really any debug info here DebugLoc dl = Op.getDebugLoc(); - SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, GA); SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, GA); @@ -1007,21 +1009,20 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. 
- while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); - // Next, add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); - BB->addSuccessor(sinkMBB); // copy0MBB: // %FalseValue = ... @@ -1035,11 +1036,11 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII.get(SP::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 5ebdcac..db39e08 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -86,6 +86,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -94,6 +95,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index 8e49eca..3a4c80a 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -109,38 +109,29 @@ unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI, unsigned SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond)const{ - // FIXME this should probably take a DebugLoc argument - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL)const{ // Can only insert uncond branches so far. assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!"); - BuildMI(&MBB, dl, get(SP::BA)).addMBB(TBB); + BuildMI(&MBB, DL, get(SP::BA)).addMBB(TBB); return 1; } -bool SparcInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC != SrcRC) { - // Not yet supported! - return false; - } - - if (DestRC == SP::IntRegsRegisterClass) - BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0).addReg(SrcReg); - else if (DestRC == SP::FPRegsRegisterClass) - BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg).addReg(SrcReg); - else if (DestRC == SP::DFPRegsRegisterClass) - BuildMI(MBB, I, DL, get(Subtarget.isV9() ? 
SP::FMOVD : SP::FpMOVD),DestReg) - .addReg(SrcReg); +void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (SP::IntRegsRegClass.contains(DestReg, SrcReg)) + BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) + BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) + BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); else - // Can't copy this register - return false; - - return true; + llvm_unreachable("Impossible reg-to-reg copy"); } void SparcInstrInfo:: @@ -183,61 +174,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Can't load this register from stack slot"); } -MachineInstr *SparcInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - bool isFloat = false; - MachineInstr *NewMI = NULL; - switch (MI->getOpcode()) { - case SP::ORrr: - if (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == SP::G0&& - MI->getOperand(0).isReg() && MI->getOperand(2).isReg()) { - if (OpNum == 0) // COPY -> STORE - NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::STri)) - .addFrameIndex(FI) - .addImm(0) - .addReg(MI->getOperand(2).getReg()); - else // COPY -> LOAD - NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::LDri), - MI->getOperand(0).getReg()) - .addFrameIndex(FI) - .addImm(0); - } - break; - case SP::FMOVS: - isFloat = true; - // FALLTHROUGH - case SP::FMOVD: - if (OpNum == 0) { // COPY -> STORE - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), - get(isFloat ? SP::STFri : SP::STDFri)) - .addFrameIndex(FI) - .addImm(0) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); - } else { // COPY -> LOAD - unsigned DstReg = MI->getOperand(0).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), - get(isFloat ? 
SP::LDFri : SP::LDDFri)) - .addReg(DstReg, RegState::Define | - getDeadRegState(isDead) | getUndefRegState(isUndef)) - .addFrameIndex(FI) - .addImm(0); - } - break; - } - - return NewMI; -} - unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>(); diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index a00ba39..1334718 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -68,14 +68,13 @@ public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -89,18 +88,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } - unsigned getGlobalBaseReg(MachineFunction *MF) const; }; diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 9489580..ddadd51 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -665,7 +665,7 @@ let Defs = [FCC] in { //===----------------------------------------------------------------------===// // V9 Conditional Moves. -let Predicates = [HasV9], isTwoAddress = 1 in { +let Predicates = [HasV9], Constraints = "$T = $dst" in { // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual. // FIXME: Add instruction encodings for the JIT some day. def MOVICCrr diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index 08373bb8..427cc7f 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -52,13 +52,6 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } - -const TargetRegisterClass* const* -SparcRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 }; - return CalleeSavedRegClasses; -} - bool SparcRegisterInfo::hasFP(const MachineFunction &MF) const { return false; } diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 24d43e3..9f0cda7 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -32,9 +32,6 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { /// Code Generation virtual methods... 
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* getCalleeSavedRegClasses( - const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; bool hasFP(const MachineFunction &MF) const; diff --git a/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp index 90be222..d7ac8f5 100644 --- a/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp @@ -124,7 +124,7 @@ void SystemZAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", 6) == 0) { if (strncmp(Modifier + 7, "even", 4) == 0) - Reg = TM.getRegisterInfo()->getSubReg(Reg, SystemZ::subreg_even32); + Reg = TM.getRegisterInfo()->getSubReg(Reg, SystemZ::subreg_32bit); else if (strncmp(Modifier + 7, "odd", 3) == 0) Reg = TM.getRegisterInfo()->getSubReg(Reg, SystemZ::subreg_odd32); else diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index bb2952a..ed290ca 100644 --- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -670,7 +670,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { // Copy the remainder (even subreg) result, if it is needed. if (!SDValue(Node, 1).use_empty()) { unsigned SubRegIdx = (is32Bit ? - SystemZ::subreg_even32 : SystemZ::subreg_even); + SystemZ::subreg_32bit : SystemZ::subreg_even); SDNode *Rem = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, NVT, SDValue(Result, 0), @@ -754,7 +754,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { // Copy the remainder (even subreg) result, if it is needed. if (!SDValue(Node, 1).use_empty()) { unsigned SubRegIdx = (is32Bit ? - SystemZ::subreg_even32 : SystemZ::subreg_even); + SystemZ::subreg_32bit : SystemZ::subreg_even); SDNode *Rem = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, NVT, SDValue(Result, 0), diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 76f2901..67f739f 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -254,6 +254,7 @@ SystemZTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -266,7 +267,7 @@ SystemZTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case CallingConv::Fast: case CallingConv::C: return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall, - Outs, Ins, dl, DAG, InVals); + Outs, OutVals, Ins, dl, DAG, InVals); } } @@ -334,7 +335,7 @@ SystemZTargetLowering::LowerCCCArguments(SDValue Chain, // Create the nodes corresponding to a load from this parameter slot. // Create the frame index object for this incoming parameter... 
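The CreateFixedObject rewrites here and in the PPC and Sparc hunks above all drop the same trailing argument: the old signature was (Size, SPOffset, Immutable, isSS) and the spill-slot flag is removed, leaving (Size, SPOffset, Immutable). The parameter names are recalled from the 2.7-era MachineFrameInfo header, so treat them as an assumption; the shape of the change is:

    // before: MFI->CreateFixedObject(Size, Offset, /*Immutable=*/true,
    //                                /*isSS=*/false);
    int FI = MFI->CreateFixedObject(Size, Offset, /*Immutable=*/true);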
int FI = MFI->CreateFixedObject(LocVT.getSizeInBits()/8, - VA.getLocMemOffset(), true, false); + VA.getLocMemOffset(), true); // Create the SelectionDAG nodes corresponding to a load // from this parameter @@ -372,6 +373,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -402,7 +404,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -464,7 +466,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy()); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy()); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy()); @@ -550,6 +552,7 @@ SDValue SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location @@ -575,7 +578,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; - SDValue ResValue = Outs[i].Val; + SDValue ResValue = OutVals[i]; assert(VA.isRegLoc() && "Can only return in registers!"); // If this is an 8/16/32-bit value, it is really should be passed promoted @@ -729,14 +732,14 @@ SDValue SystemZTargetLowering::LowerGlobalAddress(SDValue Op, SDValue Result; if (!IsPic && !ExtraLoadRequired) { - Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); Offset = 0; } else { unsigned char OpFlags = 0; if (ExtraLoadRequired) OpFlags = SystemZII::MO_GOTENT; - Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); } Result = DAG.getNode(SystemZISD::PCRelativeWrapper, dl, @@ -827,16 +830,20 @@ SystemZTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB); SystemZCC::CondCodes CC = (SystemZCC::CondCodes)MI->getOperand(3).getImm(); - BuildMI(BB, dl, TII.getBrCond(CC)).addMBB(copy1MBB); F->insert(I, copy0MBB); F->insert(I, copy1MBB); // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi node for the select. - copy1MBB->transferSuccessors(BB); + copy1MBB->splice(copy1MBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + copy1MBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. 
BB->addSuccessor(copy0MBB); BB->addSuccessor(copy1MBB); + BuildMI(BB, dl, TII.getBrCond(CC)).addMBB(copy1MBB); + // copy0MBB: // %FalseValue = ... // # fallthrough to copy1MBB @@ -849,11 +856,11 @@ SystemZTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = copy1MBB; - BuildMI(BB, dl, TII.get(SystemZ::PHI), + BuildMI(*BB, BB->begin(), dl, TII.get(SystemZ::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 94bd906..51d2df3 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -98,6 +98,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -126,6 +127,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -134,6 +136,7 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; const SystemZSubtarget &Subtarget; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 8c5e905..a658280 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -126,7 +126,7 @@ def FNABS64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src), (implicit PSW)]>; } -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Defs = [PSW] in { let isCommutable = 1 in { // X = ADD Y, Z == X = ADD Z, Y def FADD32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2), @@ -237,7 +237,7 @@ def FDIV64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2), "ddb\t{$dst, $src2}", [(set FP64:$dst, (fdiv FP64:$src1, (load rriaddr12:$src2)))]>; -} // isTwoAddress = 1 +} // Constraints = "$src1 = $dst" def FSQRT32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src), "sqebr\t{$dst, $src}", diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 043686c..c03864f 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -117,59 +117,28 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); } -bool SystemZInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - - // Determine if DstRC and SrcRC have a common superclass. 
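Here, as in the Sparc file above, the class-matching preamble of copyRegToReg (the removed "common superclass" logic below) has no counterpart in the new hook: copyPhysReg() receives two physical registers, picks an opcode by asking which register class contains both, honors the kill flag, and has no failure path. A minimal sketch for a hypothetical target Foo (names invented for illustration):

    void FooInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   unsigned DestReg, unsigned SrcReg,
                                   bool KillSrc) const {
      unsigned Opc;
      if (Foo::GR32RegClass.contains(DestReg, SrcReg))
        Opc = Foo::MOV32rr;        // integer <- integer
      else if (Foo::FP32RegClass.contains(DestReg, SrcReg))
        Opc = Foo::FMOV32rr;       // fp <- fp
      else
        llvm_unreachable("Impossible reg-to-reg copy");
      BuildMI(MBB, I, DL, get(Opc), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    }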
- const TargetRegisterClass *CommonRC = DestRC; - if (DestRC == SrcRC) - /* Same regclass for source and dest */; - else if (CommonRC->hasSuperClass(SrcRC)) - CommonRC = SrcRC; - else if (!CommonRC->hasSubClass(SrcRC)) - CommonRC = 0; - - if (CommonRC) { - if (CommonRC == &SystemZ::GR64RegClass || - CommonRC == &SystemZ::ADDR64RegClass) { - BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg); - } else if (CommonRC == &SystemZ::GR32RegClass || - CommonRC == &SystemZ::ADDR32RegClass) { - BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg); - } else if (CommonRC == &SystemZ::GR64PRegClass) { - BuildMI(MBB, I, DL, get(SystemZ::MOV64rrP), DestReg).addReg(SrcReg); - } else if (CommonRC == &SystemZ::GR128RegClass) { - BuildMI(MBB, I, DL, get(SystemZ::MOV128rr), DestReg).addReg(SrcReg); - } else if (CommonRC == &SystemZ::FP32RegClass) { - BuildMI(MBB, I, DL, get(SystemZ::FMOV32rr), DestReg).addReg(SrcReg); - } else if (CommonRC == &SystemZ::FP64RegClass) { - BuildMI(MBB, I, DL, get(SystemZ::FMOV64rr), DestReg).addReg(SrcReg); - } else { - return false; - } - - return true; - } - - if ((SrcRC == &SystemZ::GR64RegClass && - DestRC == &SystemZ::ADDR64RegClass) || - (DestRC == &SystemZ::GR64RegClass && - SrcRC == &SystemZ::ADDR64RegClass)) { - BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg); - return true; - } else if ((SrcRC == &SystemZ::GR32RegClass && - DestRC == &SystemZ::ADDR32RegClass) || - (DestRC == &SystemZ::GR32RegClass && - SrcRC == &SystemZ::ADDR32RegClass)) { - BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg); - return true; - } - - return false; +void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned Opc; + if (SystemZ::GR64RegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::MOV64rr; + else if (SystemZ::GR32RegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::MOV32rr; + else if (SystemZ::GR64PRegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::MOV64rrP; + else if (SystemZ::GR128RegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::MOV128rr; + else if (SystemZ::FP32RegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::FMOV32rr; + else if (SystemZ::FP64RegClass.contains(DestReg, SrcReg)) + Opc = SystemZ::FMOV64rr; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } bool @@ -286,8 +255,7 @@ SystemZInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, unsigned LowReg = 0, HighReg = 0, StartOffset = -1U, EndOffset = 0; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RegClass = CSI[i].getRegClass(); - if (RegClass != &SystemZ::FP64RegClass) { + if (!SystemZ::FP64RegClass.contains(Reg)) { unsigned Offset = RegSpillOffsets[Reg]; CalleeFrameSize += 8; if (StartOffset > Offset) { @@ -332,11 +300,10 @@ SystemZInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Save FPRs for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RegClass = CSI[i].getRegClass(); - if (RegClass == &SystemZ::FP64RegClass) { + if (SystemZ::FP64RegClass.contains(Reg)) { MBB.addLiveIn(Reg); - storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RegClass, - &RI); + storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), + &SystemZ::FP64RegClass, &RI); } } @@ -361,9 +328,9 @@ 
SystemZInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // Restore FP registers for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RegClass = CSI[i].getRegClass(); - if (RegClass == &SystemZ::FP64RegClass) - loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass, &RI); + if (SystemZ::FP64RegClass.contains(Reg)) + loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + &SystemZ::FP64RegClass, &RI); } // Restore GP registers @@ -523,9 +490,8 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME: this should probably have a DebugLoc operand - DebugLoc DL; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index a753f14..0559619 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -60,11 +60,10 @@ public: /// virtual const SystemZRegisterInfo &getRegisterInfo() const { return RI; } - bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; bool isMoveInstr(const MachineInstr& MI, unsigned &SrcReg, unsigned &DstReg, @@ -102,7 +101,8 @@ public: bool AllowModify) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; SystemZCC::CondCodes getOppositeCondition(SystemZCC::CondCodes CC) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 22bde4e..8df07c0 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -478,7 +478,8 @@ def MOV64rmm : RSYI<0x04EB, "lmg\t{$from, $to, $dst}", []>; -let isReMaterializable = 1, isAsCheapAsAMove = 1, isTwoAddress = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1, + Constraints = "$src = $dst" in { def MOV64Pr0_even : Pseudo<(outs GR64P:$dst), (ins GR64P:$src), "lhi\t${dst:subreg_even}, 0", []>; @@ -537,7 +538,7 @@ def NEG64rr32 : RREI<0xB913, (outs GR64:$dst), (ins GR32:$src), (implicit PSW)]>; } -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Defs = [PSW] in { @@ -924,12 +925,12 @@ def UDIVREM64m : RXYI<0xE387, (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2 "dlg\t{$dst, $src2}", []>; } // mayLoad -} // isTwoAddress = 1 +} // Constraints = "$src1 = $dst" //===----------------------------------------------------------------------===// // Shifts -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def SRL32rri : RSI<0x88, (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt), "srl\t{$src, $amt}", @@ -939,7 +940,7 @@ def SRL64rri : RSYI<0xEB0C, "srlg\t{$dst, $src, $amt}", [(set GR64:$dst, (srl GR64:$src, 
riaddr:$amt))]>; -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def SHL32rri : RSI<0x89, (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt), "sll\t{$src, $amt}", @@ -950,7 +951,7 @@ def SHL64rri : RSYI<0xEB0D, [(set GR64:$dst, (shl GR64:$src, riaddr:$amt))]>; let Defs = [PSW] in { -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def SRA32rri : RSI<0x8A, (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt), "sra\t{$src, $amt}", @@ -1129,13 +1130,13 @@ def : Pat<(mulhs GR32:$src1, GR32:$src2), (EXTRACT_SUBREG (MUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), GR32:$src1, subreg_odd32), GR32:$src2), - subreg_even32)>; + subreg_32bit)>; def : Pat<(mulhu GR32:$src1, GR32:$src2), (EXTRACT_SUBREG (UMUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), GR32:$src1, subreg_odd32), GR32:$src2), - subreg_even32)>; + subreg_32bit)>; def : Pat<(mulhu GR64:$src1, GR64:$src2), (EXTRACT_SUBREG (UMUL128rrP (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src1, subreg_odd), diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 638fd17..ae96b0b 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -47,22 +47,6 @@ SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CalleeSavedRegs; } -const TargetRegisterClass* const* -SystemZRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &SystemZ::GR64RegClass, &SystemZ::GR64RegClass, - &SystemZ::GR64RegClass, &SystemZ::GR64RegClass, - &SystemZ::GR64RegClass, &SystemZ::GR64RegClass, - &SystemZ::GR64RegClass, &SystemZ::GR64RegClass, - &SystemZ::GR64RegClass, &SystemZ::GR64RegClass, - &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, - &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, - &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, - &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, 0 - }; - return CalleeSavedRegClasses; -} - BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); if (hasFP(MF)) diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 42aa5dd..670025f 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -32,9 +32,6 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { /// Code Generation virtual methods... 
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* getCalleeSavedRegClasses( - const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; bool hasReservedCallFrame(MachineFunction &MF) const { return true; } diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index b561744..33be8dd 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -55,7 +55,6 @@ class FPRL<bits<4> num, string n, list<Register> subregs> let Namespace = "SystemZ" in { def subreg_32bit : SubRegIndex; -def subreg_even32 : SubRegIndex; def subreg_odd32 : SubRegIndex; def subreg_even : SubRegIndex; def subreg_odd : SubRegIndex; @@ -99,7 +98,7 @@ def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>; } // Register pairs -let SubRegIndices = [subreg_even32, subreg_odd32] in { +let SubRegIndices = [subreg_32bit, subreg_odd32] in { def R0P : GPR64< 0, "r0", [R0W, R1W], [R0D, R1D]>, DwarfRegNum<[0]>; def R2P : GPR64< 2, "r2", [R2W, R3W], [R2D, R3D]>, DwarfRegNum<[2]>; def R4P : GPR64< 4, "r4", [R4W, R5W], [R4D, R5D]>, DwarfRegNum<[4]>; @@ -111,8 +110,7 @@ def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>; } let SubRegIndices = [subreg_even, subreg_odd], - CompositeIndices = [(subreg_even32 subreg_even, subreg_32bit), - (subreg_odd32 subreg_odd, subreg_32bit)] in { + CompositeIndices = [(subreg_odd32 subreg_odd, subreg_32bit)] in { def R0Q : GPR128< 0, "r0", [R0D, R1D], [R0P]>, DwarfRegNum<[0]>; def R2Q : GPR128< 2, "r2", [R2D, R3D], [R2P]>, DwarfRegNum<[2]>; def R4Q : GPR128< 4, "r4", [R4D, R5D], [R4P]>, DwarfRegNum<[4]>; @@ -355,7 +353,7 @@ def ADDR64 : RegisterClass<"SystemZ", [i64], 64, def GR64P : RegisterClass<"SystemZ", [v2i32], 64, [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P]> { - let SubRegClasses = [(GR32 subreg_even32, subreg_odd32)]; + let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32)]; let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -391,7 +389,7 @@ def GR64P : RegisterClass<"SystemZ", [v2i32], 64, def GR128 : RegisterClass<"SystemZ", [v2i64], 128, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q]> { - let SubRegClasses = [(GR32 subreg_even32, subreg_odd32), + let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32), (GR64 subreg_even, subreg_odd)]; let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 094a57e..c099a7e 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -28,6 +28,10 @@ const TargetRegisterClass * TargetOperandInfo::getRegClass(const TargetRegisterInfo *TRI) const { if (isLookupPtrRegClass()) return TRI->getPointerRegClass(RegClass); + // Instructions like INSERT_SUBREG do not have fixed register classes. + if (RegClass < 0) + return 0; + // Otherwise just look it up normally. 
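// (Callers must now tolerate a null result from getRegClass(); a typical
//  guard, assuming TID is the instruction's TargetInstrDesc -- illustrative
//  only:
//    const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI);
//    if (!RC) { /* no fixed class, e.g. INSERT_SUBREG's index operand */ }
// )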
return TRI->getRegClass(RegClass); } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index b9372d0..dd7b532 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -101,7 +101,7 @@ static bool IsNullTerminatedString(const Constant *C) { ConstantInt *Null = dyn_cast<ConstantInt>(CVA->getOperand(ATy->getNumElements()-1)); - if (Null == 0 || Null->getZExtValue() != 0) + if (Null == 0 || !Null->isZero()) return false; // Not null terminated. // Verify that the null doesn't occur anywhere else in the string. diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp index dcc5f61..49bfad5 100644 --- a/lib/Target/TargetRegisterInfo.cpp +++ b/lib/Target/TargetRegisterInfo.cpp @@ -39,20 +39,20 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR, TargetRegisterInfo::~TargetRegisterInfo() {} -/// getPhysicalRegisterRegClass - Returns the Register Class of a physical -/// register of the given type. If type is EVT::Other, then just return any -/// register class the register belongs to. +/// getMinimalPhysRegClass - Returns the Register Class of a physical +/// register of the given type, picking the most sub register class of +/// the right type that contains this physreg. const TargetRegisterClass * -TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const { +TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const { assert(isPhysicalRegister(reg) && "reg must be a physical register"); - // Pick the most super register class of the right type that contains + // Pick the most sub register class of the right type that contains // this physreg. const TargetRegisterClass* BestRC = 0; for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){ const TargetRegisterClass* RC = *I; if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && - (!BestRC || BestRC->hasSuperClass(RC))) + (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp index a58f58e..26797ab 100644 --- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp +++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp @@ -33,13 +33,11 @@ class X86AsmLexer : public TargetAsmLexer { } const AsmToken &lexDefinite() { - if(tentativeIsValid) { + if (tentativeIsValid) { tentativeIsValid = false; return tentativeToken; } - else { - return getLexer()->Lex(); - } + return getLexer()->Lex(); } AsmToken LexTokenATT(); @@ -72,38 +70,65 @@ public: static unsigned MatchRegisterName(StringRef Name); AsmToken X86AsmLexer::LexTokenATT() { - const AsmToken lexedToken = lexDefinite(); + AsmToken lexedToken = lexDefinite(); switch (lexedToken.getKind()) { default: - return AsmToken(lexedToken); + return lexedToken; case AsmToken::Error: SetError(Lexer->getErrLoc(), Lexer->getErr()); - return AsmToken(lexedToken); - case AsmToken::Percent: - { + return lexedToken; + + case AsmToken::Percent: { const AsmToken &nextToken = lexTentative(); - if (nextToken.getKind() == AsmToken::Identifier) { - unsigned regID = MatchRegisterName(nextToken.getString()); + if (nextToken.getKind() != AsmToken::Identifier) + return lexedToken; + - if (regID) { - lexDefinite(); + if (unsigned regID = MatchRegisterName(nextToken.getString())) { + lexDefinite(); + // FIXME: This is completely wrong when there is a space or other + // punctuation between the % and the register name. 
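// (Concretely: regStr below is synthesized by extending the '%' token's
//  buffer in place -- StringRef(start of "%", len("%") + len(name)) -- which
//  only names the intended "%reg" text when the two tokens are adjacent in
//  the source buffer; any intervening whitespace lands inside the slice and
//  pushes the tail of the register name out of it.)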
+ StringRef regStr(lexedToken.getString().data(), + lexedToken.getString().size() + + nextToken.getString().size()); + + return AsmToken(AsmToken::Register, regStr, + static_cast<int64_t>(regID)); + } + + // Match register name failed. If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (nextToken.getString().size() == 3 && + nextToken.getString().startswith("db")) { + int RegNo = -1; + switch (nextToken.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != -1) { + lexDefinite(); + + // FIXME: This is completely wrong when there is a space or other + // punctuation between the % and the register name. StringRef regStr(lexedToken.getString().data(), lexedToken.getString().size() + nextToken.getString().size()); - - return AsmToken(AsmToken::Register, - regStr, - static_cast<int64_t>(regID)); - } - else { - return AsmToken(lexedToken); + return AsmToken(AsmToken::Register, regStr, + static_cast<int64_t>(RegNo)); } } - else { - return AsmToken(lexedToken); - } + + + return lexedToken; } } } @@ -113,26 +138,22 @@ AsmToken X86AsmLexer::LexTokenIntel() { switch(lexedToken.getKind()) { default: - return AsmToken(lexedToken); + return lexedToken; case AsmToken::Error: SetError(Lexer->getErrLoc(), Lexer->getErr()); - return AsmToken(lexedToken); - case AsmToken::Identifier: - { + return lexedToken; + case AsmToken::Identifier: { std::string upperCase = lexedToken.getString().str(); std::string lowerCase = LowercaseString(upperCase); StringRef lowerRef(lowerCase); unsigned regID = MatchRegisterName(lowerRef); - if (regID) { + if (regID) return AsmToken(AsmToken::Register, lexedToken.getString(), static_cast<int64_t>(regID)); - } - else { - return AsmToken(lexedToken); - } + return lexedToken; } } } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 40a6a7b..a856e9c 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -412,6 +412,28 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, return false; } + // If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (RegNo == 0 && Tok.getString().size() == 3 && + Tok.getString().startswith("db")) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != 0) { + EndLoc = Tok.getLoc(); + Parser.Lex(); // Eat it. + return false; + } + } + if (RegNo == 0) return Error(Tok.getLoc(), "invalid register name"); @@ -597,6 +619,16 @@ ParseInstruction(const StringRef &Name, SMLoc NameLoc, return Error(NameLoc, "pushfq cannot be encoded in 32-bit mode"); } + // The "Jump if rCX Zero" form jcxz is not allowed in 64-bit mode and + // the form jrcxz is not allowed in 32-bit mode. + if (Is64Bit) { + if (Name == "jcxz") + return Error(NameLoc, "jcxz cannot be encoded in 64-bit mode"); + } else { + if (Name == "jrcxz") + return Error(NameLoc, "jrcxz cannot be encoded in 32-bit mode"); + } + // FIXME: Hack to recognize "sal..." and "rep..." for now. 
We need a way to // represent alternative syntaxes in the .td file, without requiring // instruction duplication. @@ -617,6 +649,23 @@ ParseInstruction(const StringRef &Name, SMLoc NameLoc, .Case("setnz", "setne") .Case("jz", "je") .Case("jnz", "jne") + .Case("jc", "jb") + // FIXME: in 32-bit mode jcxz requires an AdSize prefix. In 64-bit mode + // jecxz requires an AdSize prefix but jecxz does not have a prefix in + // 32-bit mode. + .Case("jecxz", "jcxz") + .Case("jrcxz", "jcxz") + .Case("jna", "jbe") + .Case("jnae", "jb") + .Case("jnb", "jae") + .Case("jnbe", "ja") + .Case("jnc", "jae") + .Case("jng", "jle") + .Case("jnge", "jl") + .Case("jnl", "jge") + .Case("jnle", "jg") + .Case("jpe", "jp") + .Case("jpo", "jnp") .Case("cmovcl", "cmovbl") .Case("cmovcl", "cmovbl") .Case("cmovnal", "cmovbel") @@ -631,36 +680,64 @@ ParseInstruction(const StringRef &Name, SMLoc NameLoc, .Case("cmovnlel", "cmovgl") .Case("cmovnzl", "cmovnel") .Case("cmovzl", "cmovel") + .Case("fwait", "wait") + .Case("movzx", "movzb") .Default(Name); // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. const MCExpr *ExtraImmOp = 0; - if (PatchedName.startswith("cmp") && + if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { + bool IsVCMP = PatchedName.startswith("vcmp"); + unsigned SSECCIdx = IsVCMP ? 4 : 3; unsigned SSEComparisonCode = StringSwitch<unsigned>( - PatchedName.slice(3, PatchedName.size() - 2)) - .Case("eq", 0) - .Case("lt", 1) - .Case("le", 2) - .Case("unord", 3) - .Case("neq", 4) - .Case("nlt", 5) - .Case("nle", 6) - .Case("ord", 7) + PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) + .Case("eq", 0) + .Case("lt", 1) + .Case("le", 2) + .Case("unord", 3) + .Case("neq", 4) + .Case("nlt", 5) + .Case("nle", 6) + .Case("ord", 7) + .Case("eq_uq", 8) + .Case("nge", 9) + .Case("ngt", 0x0A) + .Case("false", 0x0B) + .Case("neq_oq", 0x0C) + .Case("ge", 0x0D) + .Case("gt", 0x0E) + .Case("true", 0x0F) + .Case("eq_os", 0x10) + .Case("lt_oq", 0x11) + .Case("le_oq", 0x12) + .Case("unord_s", 0x13) + .Case("neq_us", 0x14) + .Case("nlt_uq", 0x15) + .Case("nle_uq", 0x16) + .Case("ord_s", 0x17) + .Case("eq_us", 0x18) + .Case("nge_uq", 0x19) + .Case("ngt_uq", 0x1A) + .Case("false_os", 0x1B) + .Case("neq_os", 0x1C) + .Case("ge_oq", 0x1D) + .Case("gt_oq", 0x1E) + .Case("true_us", 0x1F) .Default(~0U); if (SSEComparisonCode != ~0U) { ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, getParser().getContext()); if (PatchedName.endswith("ss")) { - PatchedName = "cmpss"; + PatchedName = IsVCMP ? "vcmpss" : "cmpss"; } else if (PatchedName.endswith("sd")) { - PatchedName = "cmpsd"; + PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; } else if (PatchedName.endswith("ps")) { - PatchedName = "cmpps"; + PatchedName = IsVCMP ? "vcmpps" : "cmpps"; } else { assert(PatchedName.endswith("pd") && "Unexpected mnemonic!"); - PatchedName = "cmppd"; + PatchedName = IsVCMP ? 
"vcmppd" : "cmppd"; } } } diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp index 0b64cb4..f2cdb5b 100644 --- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp @@ -85,11 +85,18 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } -void X86ATTInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { const MCOperand &BaseReg = MI->getOperand(Op); const MCOperand &IndexReg = MI->getOperand(Op+2); const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } if (DispSpec.isImm()) { int64_t DispVal = DispSpec.getImm(); @@ -115,13 +122,3 @@ void X86ATTInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op, O << ')'; } } - -void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { - // If this has a segment register, print it. - if (MI->getOperand(Op+4).getReg()) { - printOperand(MI, Op+4, O); - O << ':'; - } - printLeaMemReference(MI, Op, O); -} diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h index 8d5d508..3be4bae 100644 --- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h @@ -34,7 +34,6 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printLeaMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); @@ -69,14 +68,8 @@ public: void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printlea32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printLeaMemReference(MI, OpNo, O); - } - void printlea64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printLeaMemReference(MI, OpNo, O); - } - void printlea64_32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printLeaMemReference(MI, OpNo, O); + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); } }; diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp index 183213d..73bc603 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp @@ -200,6 +200,11 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, case X86II::MO_GOT: O << "@GOT"; break; case X86II::MO_GOTOFF: O << "@GOTOFF"; break; case X86II::MO_PLT: O << "@PLT"; break; + case X86II::MO_TLVP: O << "@TLVP"; break; + case X86II::MO_TLVP_PIC_BASE: + O << "@TLVP" << '-'; + PrintPICBaseSymbol(O); + break; } } @@ -383,6 +388,8 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) { printSymbolOperand(MO, O); + if (Subtarget->isPICStyleRIPRel()) + O << "(%rip)"; return false; } if (MO.isReg()) { diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp index 7e0a9bb..a632047 100644 --- 
a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp @@ -81,12 +81,19 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } -void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { const MCOperand &BaseReg = MI->getOperand(Op); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); const MCOperand &IndexReg = MI->getOperand(Op+2); const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } O << '['; @@ -104,7 +111,7 @@ void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op, NeedPlus = true; } - + if (!DispSpec.isImm()) { if (NeedPlus) O << " + "; assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); @@ -126,13 +133,3 @@ void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op, O << ']'; } - -void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { - // If this has a segment register, print it. - if (MI->getOperand(Op+4).getReg()) { - printOperand(MI, Op+4, O); - O << ':'; - } - printLeaMemReference(MI, Op, O); -} diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h index a0beeb2..4d68074 100644 --- a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h @@ -36,7 +36,6 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); - void printLeaMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -81,17 +80,9 @@ public: O << "XMMWORD PTR "; printMemReference(MI, OpNo, O); } - void printlea32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; - printLeaMemReference(MI, OpNo, O); - } - void printlea64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; - printLeaMemReference(MI, OpNo, O); - } - void printlea64_32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; - printLeaMemReference(MI, OpNo, O); + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); } }; diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp index 4edeca9..09f150b 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp +++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp @@ -152,6 +152,17 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, case X86II::MO_DARWIN_STUB: break; + case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; + case X86II::MO_TLVP_PIC_BASE: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + // Subtract the pic base. 
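// (MO_TLVP_PIC_BASE covers the non-RIP-relative case: without rip-relative
//  addressing, the thread-local descriptor has to be addressed relative to
//  the function's PIC base label, so the operand lowers to the difference
//  (sym@TLVP) - (picbase), e.g. "_x@TLVP-L0$pb" in the emitted assembly
//  (symbol and label names illustrative), built via the CreateSub below.)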
+ Expr + = MCBinaryExpr::CreateSub(Expr, + MCSymbolRefExpr::Create(GetPICBaseSymbol(), + Ctx), + Ctx); + + break; case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; @@ -266,10 +277,21 @@ static void SimplifyShortMoveForm(MCInst &Inst, unsigned Opcode) { return; // Check whether this is an absolute address. - if (Inst.getOperand(AddrBase + 0).getReg() != 0 || - Inst.getOperand(AddrBase + 2).getReg() != 0 || - Inst.getOperand(AddrBase + 4).getReg() != 0 || - Inst.getOperand(AddrBase + 1).getImm() != 1) + // FIXME: We know TLVP symbol refs aren't, but there should be a better way + // to do this here. + bool Absolute = true; + if (Inst.getOperand(AddrOp).isExpr()) { + const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr(); + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE)) + if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP) + Absolute = false; + } + + if (Absolute && + (Inst.getOperand(AddrBase + 0).getReg() != 0 || + Inst.getOperand(AddrBase + 2).getReg() != 0 || + Inst.getOperand(AddrBase + 4).getReg() != 0 || + Inst.getOperand(AddrBase + 1).getImm() != 1)) return; // If so, rewrite the instruction. @@ -327,6 +349,15 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { switch (OutMI.getOpcode()) { case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand. lower_lea64_32mem(&OutMI, 1); + // FALL THROUGH. + case X86::LEA64r: + case X86::LEA16r: + case X86::LEA32r: + // LEA should have a segment register, but it must be empty. + assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands && + "Unexpected # of LEA operands"); + assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && + "LEA has segment specified!"); break; case X86::MOVZX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; case X86::MOVZX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; @@ -364,10 +395,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr break; - // TAILJMPr, TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have + // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have // register inputs modeled as normal uses instead of implicit uses. As such, // truncate off all but the first operand (the callee). FIXME: Change isel. - case X86::TAILJMPr: case X86::TAILJMPr64: case X86::CALL64r: case X86::CALL64pcrel32: { @@ -380,11 +410,20 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. + case X86::TAILJMPr: case X86::TAILJMPd: case X86::TAILJMPd64: { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: assert(0 && "Invalid opcode"); + case X86::TAILJMPr: Opcode = X86::JMP32r; break; + case X86::TAILJMPd: + case X86::TAILJMPd64: Opcode = X86::JMP_1; break; + } + MCOperand Saved = OutMI.getOperand(0); OutMI = MCInst(); - OutMI.setOpcode(X86::TAILJMP_1); + OutMI.setOpcode(Opcode); OutMI.addOperand(Saved); break; } @@ -483,8 +522,12 @@ void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, O << V.getName(); O << " <- "; // Frame address. Currently handles register +- offset only. 
- assert(MI->getOperand(0).isReg() && MI->getOperand(3).isImm()); - O << '['; printOperand(MI, 0, O); O << '+'; printOperand(MI, 3, O); + O << '['; + if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) + printOperand(MI, 0, O); + else + O << "undef"; + O << '+'; printOperand(MI, 3, O); O << ']'; O << "+"; printOperand(MI, NOps-2, O); @@ -495,8 +538,9 @@ X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { MachineLocation Location; assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(3).isImm()); - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); + + if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); return Location; } @@ -513,6 +557,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: + // Lower these as normal, but add some comments. + OutStreamer.AddComment("TAILCALL"); + break; + case X86::MOVPC32r: { MCInst TmpInst; // This is a pseudo op for a two instruction sequence with a label, which @@ -578,7 +629,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); - OutStreamer.EmitInstruction(TmpInst); } diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt index 9f91060..97589c0 100644 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ b/lib/Target/X86/Disassembler/CMakeLists.txt @@ -4,8 +4,8 @@ add_llvm_library(LLVMX86Disassembler X86Disassembler.cpp X86DisassemblerDecoder.c ) -# workaround for hanging compilation on MSVC9 -if( MSVC_VERSION EQUAL 1500 ) +# workaround for hanging compilation on MSVC9 and 10 +if( MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) set_property( SOURCE X86Disassembler.cpp PROPERTY COMPILE_FLAGS "/Od" diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 8a5a630..09f1584 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -252,13 +252,8 @@ static bool translateRMRegister(MCInst &mcInst, /// @param mcInst - The MCInst to append to. /// @param insn - The instruction to extract Mod, R/M, and SIB fields /// from. -/// @param sr - Whether or not to emit the segment register. The -/// LEA instruction does not expect a segment-register -/// operand. /// @return - 0 on success; nonzero otherwise -static bool translateRMMemory(MCInst &mcInst, - InternalInstruction &insn, - bool sr) { +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { // Addresses in an MCInst are represented as five operands: // 1. 
basereg (register) The R/M base, or (if there is a SIB) the // SIB base @@ -385,10 +380,7 @@ static bool translateRMMemory(MCInst &mcInst, mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); mcInst.addOperand(displacement); - - if (sr) - mcInst.addOperand(segmentReg); - + mcInst.addOperand(segmentReg); return false; } @@ -439,9 +431,8 @@ static bool translateRM(MCInst &mcInst, case TYPE_M1616: case TYPE_M1632: case TYPE_M1664: - return translateRMMemory(mcInst, insn, true); case TYPE_LEA: - return translateRMMemory(mcInst, insn, false); + return translateRMMemory(mcInst, insn); } } diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index e5f84e8..b6aba93 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -36,62 +36,6 @@ The pattern isel got this one right. //===---------------------------------------------------------------------===// -SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction -like this: - - X += y - -and the register allocator decides to spill X, it is cheaper to emit this as: - -Y += [xslot] -store Y -> [xslot] - -than as: - -tmp = [xslot] -tmp += y -store tmp -> [xslot] - -..and this uses one fewer register (so this should be done at load folding -time, not at spiller time). *Note* however that this can only be done -if Y is dead. Here's a testcase: - -@.str_3 = external global [15 x i8] -declare void @printf(i32, ...) -define void @main() { -build_tree.exit: - br label %no_exit.i7 - -no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit - %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], - [ %tmp.34.i18, %no_exit.i7 ] - %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], - [ %tmp.28.i16, %no_exit.i7 ] - %tmp.28.i16 = fadd double %tmp.0.0.0.i10, 0.000000e+00 - %tmp.34.i18 = fadd double %tmp.0.1.0.i9, 0.000000e+00 - br i1 false, label %Compute_Tree.exit23, label %no_exit.i7 - -Compute_Tree.exit23: ; preds = %no_exit.i7 - tail call void (i32, ...)* @printf( i32 0 ) - store double %tmp.34.i18, double* null - ret void -} - -We currently emit: - -.BBmain_1: - xorpd %XMM1, %XMM1 - addsd %XMM0, %XMM1 -*** movsd %XMM2, QWORD PTR [%ESP + 8] -*** addsd %XMM2, %XMM1 -*** movsd QWORD PTR [%ESP + 8], %XMM2 - jmp .BBmain_1 # no_exit.i7 - -This is a bugpoint reduced testcase, which is why the testcase doesn't make -much sense (e.g. its an infinite loop). :) - -//===---------------------------------------------------------------------===// - SSE should implement 'select_cc' using 'emulated conditional moves' that use pcmp/pand/pandn/por to do a selection instead of a conditional branch: @@ -122,12 +66,6 @@ LBB_X_2: //===---------------------------------------------------------------------===// -It's not clear whether we should use pxor or xorps / xorpd to clear XMM -registers. The choice may depend on subtarget information. We should do some -more experiments on different x86 machines. - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. @@ -151,45 +89,6 @@ Perhaps use pxor / xorp* to clear a XMM register first? //===---------------------------------------------------------------------===// -How to decide when to use the "floating point version" of logical ops? 
Here are -some code fragments: - - movaps LCPI5_5, %xmm2 - divps %xmm1, %xmm2 - mulps %xmm2, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm2 - andps LCPI5_1, %xmm3 - por %xmm2, %xmm3 - movdqa %xmm3, (%edi) - - movaps LCPI5_5, %xmm1 - divps %xmm0, %xmm1 - mulps %xmm1, %xmm3 - mulps 8656(%ecx), %xmm3 - addps 8672(%ecx), %xmm3 - andps LCPI5_6, %xmm1 - andps LCPI5_1, %xmm3 - orps %xmm1, %xmm3 - movaps %xmm3, 112(%esp) - movaps %xmm3, (%ebx) - -Due to some minor source change, the later case ended up using orps and movaps -instead of por and movdqa. Does it matter? - -//===---------------------------------------------------------------------===// - -X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible -to choose between movaps, movapd, and movdqa based on types of source and -destination? - -How about andps, andpd, and pand? Do we really care about the type of the packed -elements? If not, why not always use the "ps" variants which are likely to be -shorter. - -//===---------------------------------------------------------------------===// - External test Nurbs exposed some problems. Look for __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc emits: @@ -278,41 +177,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills. //===---------------------------------------------------------------------===// -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 - -LLVM is producing bad code. - -LBB_main_4: # cond_true44 - addps %xmm1, %xmm2 - subps %xmm3, %xmm2 - movaps (%ecx), %xmm4 - movaps %xmm2, %xmm1 - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -There are two problems. 1) No need to two loop induction variables. We can -compare against 262144 * 16. 2) Known register coalescer issue. We should -be able eliminate one of the movaps: - - addps %xmm2, %xmm1 <=== Commute! - subps %xmm3, %xmm1 - movaps (%ecx), %xmm4 - movaps %xmm1, %xmm1 <=== Eliminate! - addps %xmm4, %xmm1 - addl $16, %ecx - incl %edx - cmpl $262144, %edx - movaps %xmm3, %xmm2 - movaps %xmm4, %xmm3 - jne LBB_main_4 # cond_true44 - -//===---------------------------------------------------------------------===// - Consider: __m128 test(float a) { @@ -382,22 +246,6 @@ elements are fixed zeros. //===---------------------------------------------------------------------===// -__m128d test1( __m128d A, __m128d B) { - return _mm_shuffle_pd(A, B, 0x3); -} - -compiles to - -shufpd $3, %xmm1, %xmm0 - -Perhaps it's better to use unpckhpd instead? - -unpckhpd %xmm1, %xmm0 - -Don't know if unpckhpd is faster. But it is shorter. - -//===---------------------------------------------------------------------===// - This code generates ugly code, probably due to costs being off or something: define void @test(float* %P, <4 x float>* %P2 ) { @@ -549,6 +397,7 @@ entry: %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly ret i64 %tmp20 } +declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly This currently compiles to: @@ -987,3 +836,34 @@ This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and doing a shuffle from v[1] to v[0] then a float store. 
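The XMM-domain alternative described above can be written directly with SSE
intrinsics; a minimal sketch (the function name is illustrative, not from the
testcase):

#include <xmmintrin.h>

// Move lane 1 of v down to lane 0 without leaving the XMM unit, then do a
// scalar float store -- no GPR round trip and no extra stack traffic.
void store_lane1(__m128 v, float *p) {
  __m128 t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
  _mm_store_ss(p, t);
}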
//===---------------------------------------------------------------------===// + +On SSE4 machines, we compile this code: + +define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, + <2 x float> *%P) nounwind { + %Z = fadd <2 x float> %Q, %R + + store <2 x float> %Z, <2 x float> *%P + ret <2 x float> %Z +} + +into: + +_test2: ## @test2 +## BB#0: + insertps $0, %xmm2, %xmm2 + insertps $16, %xmm3, %xmm2 + insertps $0, %xmm0, %xmm3 + insertps $16, %xmm1, %xmm3 + addps %xmm2, %xmm3 + movq %xmm3, (%rdi) + movaps %xmm3, %xmm0 + pshufd $1, %xmm3, %xmm1 + ## kill: XMM1<def> XMM1<kill> + ret + +The insertps's of $0 are pointless complex copies. + +//===---------------------------------------------------------------------===// + + diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index e8f7c5d..78c4dc0 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -1,27 +1,5 @@ //===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// -Implement different PIC models? Right now we only support Mac OS X with small -PIC code model. - -//===---------------------------------------------------------------------===// - -For this: - -extern void xx(void); -void bar(void) { - xx(); -} - -gcc compiles to: - -.globl _bar -_bar: - jmp _xx - -We need to do the tailcall optimization as well. - -//===---------------------------------------------------------------------===// - AMD64 Optimization Manual 8.2 has some nice information about optimizing integer multiplication by a constant. How much of it applies to Intel's X86-64 implementation? There are definite trade-offs to consider: latency vs. register @@ -96,123 +74,14 @@ gcc: movq %rax, (%rdx) ret -//===---------------------------------------------------------------------===// - -Vararg function prologue can be further optimized. Currently all XMM registers -are stored into register save area. Most of them can be eliminated since the -upper bound of the number of XMM registers used are passed in %al. gcc produces -something like the following: - - movzbl %al, %edx - leaq 0(,%rdx,4), %rax - leaq 4+L2(%rip), %rdx - leaq 239(%rsp), %rax - jmp *%rdx - movaps %xmm7, -15(%rax) - movaps %xmm6, -31(%rax) - movaps %xmm5, -47(%rax) - movaps %xmm4, -63(%rax) - movaps %xmm3, -79(%rax) - movaps %xmm2, -95(%rax) - movaps %xmm1, -111(%rax) - movaps %xmm0, -127(%rax) -L2: - -It jumps over the movaps that do not need to be stored. Hard to see this being -significant as it added 5 instruciton (including a indirect branch) to avoid -executing 0 to 8 stores in the function prologue. - -Perhaps we can optimize for the common case where no XMM registers are used for -parameter passing. i.e. is %al == 0 jump over all stores. Or in the case of a -leaf function where we can determine that no XMM input parameter is need, avoid -emitting the stores at all. - -//===---------------------------------------------------------------------===// +And the codegen is even worse for the following +(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103): + void fill1(char *s, int a) + { + __builtin_memset(s, a, 15); + } -AMD64 has a complex calling convention for aggregate passing by value: - -1. If the size of an object is larger than two eightbytes, or in C++, is a non- - POD structure or union type, or contains unaligned fields, it has class - MEMORY. -2. Both eightbytes get initialized to class NO_CLASS. -3. Each field of an object is classified recursively so that always two fields - are considered. 
The resulting class is calculated according to the classes - of the fields in the eightbyte: - (a) If both classes are equal, this is the resulting class. - (b) If one of the classes is NO_CLASS, the resulting class is the other - class. - (c) If one of the classes is MEMORY, the result is the MEMORY class. - (d) If one of the classes is INTEGER, the result is the INTEGER. - (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as - class. - (f) Otherwise class SSE is used. -4. Then a post merger cleanup is done: - (a) If one of the classes is MEMORY, the whole argument is passed in memory. - (b) If SSEUP is not preceeded by SSE, it is converted to SSE. - -Currently llvm frontend does not handle this correctly. - -Problem 1: - typedef struct { int i; double d; } QuadWordS; -It is currently passed in two i64 integer registers. However, gcc compiled -callee expects the second element 'd' to be passed in XMM0. - -Problem 2: - typedef struct { int32_t i; float j; double d; } QuadWordS; -The size of the first two fields == i64 so they will be combined and passed in -a integer register RDI. The third field is still passed in XMM0. - -Problem 3: - typedef struct { int64_t i; int8_t j; int64_t d; } S; - void test(S s) -The size of this aggregate is greater than two i64 so it should be passed in -memory. Currently llvm breaks this down and passed it in three integer -registers. - -Problem 4: -Taking problem 3 one step ahead where a function expects a aggregate value -in memory followed by more parameter(s) passed in register(s). - void test(S s, int b) - -LLVM IR does not allow parameter passing by aggregates, therefore it must break -the aggregates value (in problem 3 and 4) into a number of scalar values: - void %test(long %s.i, byte %s.j, long %s.d); - -However, if the backend were to lower this code literally it would pass the 3 -values in integer registers. To force it be passed in memory, the frontend -should change the function signiture to: - void %test(long %undef1, long %undef2, long %undef3, long %undef4, - long %undef5, long %undef6, - long %s.i, byte %s.j, long %s.d); -And the callee would look something like this: - call void %test( undef, undef, undef, undef, undef, undef, - %tmp.s.i, %tmp.s.j, %tmp.s.d ); -The first 6 undef parameters would exhaust the 6 integer registers used for -parameter passing. The following three integer values would then be forced into -memory. - -For problem 4, the parameter 'd' would be moved to the front of the parameter -list so it will be passed in register: - void %test(int %d, - long %undef1, long %undef2, long %undef3, long %undef4, - long %undef5, long %undef6, - long %s.i, byte %s.j, long %s.d); - -//===---------------------------------------------------------------------===// - -Right now the asm printer assumes GlobalAddress are accessed via RIP relative -addressing. Therefore, it is not possible to generate this: - movabsq $__ZTV10polynomialIdE+16, %rax - -That is ok for now since we currently only support small model. So the above -is selected as - leaq __ZTV10polynomialIdE+16(%rip), %rax - -This is probably slightly slower but is much shorter than movabsq. However, if -we were to support medium or larger code models, we need to use the movabs -instruction. We should probably introduce something like AbsoluteAddress to -distinguish it from GlobalAddress so the asm printer and JIT code emitter can -do the right thing. +For this version, we duplicate the computation of the constant to store. 
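The constant in question is the splatted fill byte; a variable-value memset of
15 bytes can be lowered to an 8-, 4-, 2- and 1-byte store of truncations of one
wide value, sketched here as an illustrative helper (not the actual lowering
code):

#include <stdint.h>

// Splat the fill byte across a 64-bit word; 8+4+2+1 stores of truncations
// of this value cover the 15 bytes. Computing it twice is the duplication
// noted above.
static inline uint64_t SplatByte(uint8_t a) {
  return (uint64_t)a * 0x0101010101010101ULL;
}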
//===---------------------------------------------------------------------===// @@ -298,3 +167,107 @@ be able to recognize the zero extend. This could also presumably be implemented if we have whole-function selectiondags. //===---------------------------------------------------------------------===// + +Take the following C code +(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): + +struct u1 +{ + float x; + float y; +}; + +float foo(struct u1 u) +{ + return u.x + u.y; +} + +Optimizes to the following IR: +define float @foo(double %u.0) nounwind readnone { +entry: + %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] + %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] + %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] + %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] + %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] + %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] + %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] + ret float %0 +} + +And current llvm-gcc/clang output: + movd %xmm0, %rax + movd %eax, %xmm1 + shrq $32, %rax + movd %eax, %xmm0 + addss %xmm1, %xmm0 + ret + +We really shouldn't move the floats to RAX, only to immediately move them +straight back to the XMM registers. + +There really isn't any good way to handle this purely in IR optimizers; it +could possibly be handled by changing the output of the frontend, though. It +would also be feasible to add an x86-specific DAGCombine to optimize the +bitcast+trunc+(lshr+)bitcast combination. + +//===---------------------------------------------------------------------===// + +Take the following code +(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): +extern unsigned long table[]; +unsigned long foo(unsigned char *p) { + unsigned long tag = *p; + return table[tag >> 4] + table[tag & 0xf]; +} + +Current code generated: + movzbl (%rdi), %eax + movq %rax, %rcx + andq $240, %rcx + shrq %rcx + andq $15, %rax + movq table(,%rax,8), %rax + addq table(%rcx), %rax + ret + +Issues: +1. First movq should be movl; saves a byte. +2. Both andq's should be andl; saves another two bytes. I think this was + implemented at one point, but subsequently regressed. +3. shrq should be shrl; saves another byte. +4. The first andq can be completely eliminated by using a slightly more + expensive addressing mode. + +//===---------------------------------------------------------------------===// + +Consider the following (contrived testcase, but contains common factors): + +#include <stdarg.h> +int test(int x, ...) { + int sum, i; + va_list l; + va_start(l, x); + for (i = 0; i < x; i++) + sum += va_arg(l, int); + va_end(l); + return sum; +} + +Testcase given in C because fixing it will likely involve changing the IR +generated for it. The primary issue with the result is that it doesn't do any +of the optimizations which are possible if we know the address of a va_list +in the current function is never taken: +1. We shouldn't spill the XMM registers because we only call va_arg with "int". +2. It would be nice if we could scalarrepl the va_list. +3. Probably overkill, but it'd be cool if we could peel off the first five +iterations of the loop. + +Other optimizations involving functions which use va_arg on floats which don't +have the address of a va_list taken: +1. Conversely to the above, we shouldn't spill general registers if we only + call va_arg on "double". +2. If we know nothing more than 64 bits wide is read from the XMM registers, + we can change the spilling code to reduce the amount of stack used by half.
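For context on points 1 and 2, the SysV x86-64 va_list is a concrete
four-field record (field names per the psABI; the typedef name here is
illustrative):

// va_arg(l, int) only ever touches gp_offset and reg_save_area, which is
// why the XMM half of the register save area is dead whenever every va_arg
// in the function names an integer type.
typedef struct {
  unsigned int gp_offset;    // next GP register slot, 0..48
  unsigned int fp_offset;    // next XMM register slot, 48..176
  void *overflow_arg_area;   // arguments that were passed on the stack
  void *reg_save_area;       // 48 bytes of GPRs, then 128 bytes of XMMs
} va_list_sketch[1];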
+ +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index d4545a6..efc0cd8 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1103,57 +1103,6 @@ be folded into: shl [mem], 1 //===---------------------------------------------------------------------===// -This testcase misses a read/modify/write opportunity (from PR1425): - -void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){ - int i; - for(i=0; i<width; i++) - b1[i] += (1*(b0[i] + b2[i])+0)>>0; -} - -We compile it down to: - -LBB1_2: # bb - movl (%esi,%edi,4), %ebx - addl (%ecx,%edi,4), %ebx - addl (%edx,%edi,4), %ebx - movl %ebx, (%ecx,%edi,4) - incl %edi - cmpl %eax, %edi - jne LBB1_2 # bb - -the inner loop should add to the memory location (%ecx,%edi,4), saving -a mov. Something like: - - movl (%esi,%edi,4), %ebx - addl (%edx,%edi,4), %ebx - addl %ebx, (%ecx,%edi,4) - -Here is another interesting example: - -void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){ - int i; - for(i=0; i<width; i++) - b1[i] -= (1*(b0[i] + b2[i])+0)>>0; -} - -We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]: - -LBB9_2: # bb - movl (%ecx,%edi,4), %ebx - subl (%esi,%edi,4), %ebx - subl (%edx,%edi,4), %ebx - movl %ebx, (%ecx,%edi,4) - incl %edi - cmpl %eax, %edi - jne LBB9_2 # bb - -Additionally, LSR should rewrite the exit condition of these loops to use -a stride-4 IV, would would allow all the scales in the loop to go away. -This would result in smaller code and more efficient microops. - -//===---------------------------------------------------------------------===// - In SSE mode, we turn abs and neg into a load from the constant pool plus a xor or and instruction, for example: @@ -1301,15 +1250,8 @@ FirstOnet: xorl %eax, %eax ret -There are a few possible improvements here: -1. We should be able to eliminate the dead load into %ecx -2. We could change the "movl 8(%esp), %eax" into - "movzwl 10(%esp), %eax"; this lets us change the cmpl - into a testl, which is shorter, and eliminate the shift. - -We could also in theory eliminate the branch by using a conditional -for the address of the load, but that seems unlikely to be worthwhile -in general. +We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this +lets us change the cmpl into a testl, which is shorter, and eliminate the shift. //===---------------------------------------------------------------------===// @@ -1331,22 +1273,23 @@ bb7: ; preds = %entry to: -_foo: +foo: # @foo +# BB#0: # %entry + movl 4(%esp), %ecx cmpb $0, 16(%esp) - movl 12(%esp), %ecx + je .LBB0_2 +# BB#1: # %bb movl 8(%esp), %eax - movl 4(%esp), %edx - je LBB1_2 # bb7 -LBB1_1: # bb - addl %edx, %eax + addl %ecx, %eax ret -LBB1_2: # bb7 - movl %edx, %eax - subl %ecx, %eax +.LBB0_2: # %bb7 + movl 12(%esp), %edx + movl %ecx, %eax + subl %edx, %eax ret -The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2 -if it commuted the addl in LBB1_1. +There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a +couple more movls by putting 4(%esp) into %eax instead of %ecx. //===---------------------------------------------------------------------===// @@ -1396,8 +1339,7 @@ Also check why xmm7 is not used at all in the function. //===---------------------------------------------------------------------===// -Legalize loses track of the fact that bools are always zero extended when in -memory. 
This causes us to compile abort_gzip (from 164.gzip) from: +Take the following: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin8" @@ -1416,16 +1358,15 @@ bb4.i: ; preds = %entry } declare void @exit(i32) noreturn nounwind -into: - -_abort_gzip: +This compiles into: +_abort_gzip: ## @abort_gzip +## BB#0: ## %entry subl $12, %esp movb _in_exit.4870.b, %al - notb %al - testb $1, %al - jne LBB1_2 ## bb4.i -LBB1_1: ## bb.i - ... + cmpb $1, %al + jne LBB0_2 + +We somehow miss folding the movb into the cmpb. //===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 22e89a5..677781d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -35,6 +35,10 @@ class formatted_raw_ostream; FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); +/// createGlobalBaseRegPass - This pass initializes a global base +/// register for PIC on x86-32. +FunctionPass* createGlobalBaseRegPass(); + /// createX86FloatingPointStackifierPass - This function returns a pass which /// converts floating point register references and pseudo instructions into /// floating point stack references and physical instructions. diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 151087f..2cf65c1 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -23,13 +23,13 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; -namespace { static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); case X86::reloc_pcrel_1byte: case FK_Data_1: return 0; + case X86::reloc_pcrel_2byte: case FK_Data_2: return 1; case X86::reloc_pcrel_4byte: case X86::reloc_riprel_4byte: @@ -39,6 +39,7 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { } } +namespace { class X86AsmBackend : public TargetAsmBackend { public: X86AsmBackend(const Target &T) @@ -60,6 +61,7 @@ public: bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; }; +} // end anonymous namespace static unsigned getRelaxedOpcode(unsigned Op) { switch (Op) { @@ -75,7 +77,6 @@ static unsigned getRelaxedOpcode(unsigned Op) { case X86::JG_1: return X86::JG_4; case X86::JLE_1: return X86::JLE_4; case X86::JL_1: return X86::JL_4; - case X86::TAILJMP_1: case X86::JMP_1: return X86::JMP_4; case X86::JNE_1: return X86::JNE_4; case X86::JNO_1: return X86::JNO_4; @@ -180,6 +181,7 @@ bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { /* *** */ +namespace { class ELFX86AsmBackend : public X86AsmBackend { public: ELFX86AsmBackend(const Target &T) @@ -281,7 +283,7 @@ public: } }; -} +} // end anonymous namespace TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const std::string &TT) { diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a5774e1..a6a1e4e 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -42,7 +42,7 @@ def RetCC_X86Common : CallingConv<[ // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. - CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToReg<[MM0]>>, + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>, // Long double types are always returned in ST0 (even with SSE). 
CCIfType<[f80], CCAssignToReg<[ST0, ST1]>> @@ -89,7 +89,7 @@ def RetCC_X86_64_C : CallingConv<[ // returned in RAX. This disagrees with ABI documentation but is bug // compatible with gcc. CCIfType<[v1i64], CCAssignToReg<[RAX]>>, - CCIfType<[v8i8, v4i16, v2i32, v2f32], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[v8i8, v4i16, v2i32], CCAssignToReg<[XMM0, XMM1]>>, CCDelegateTo<RetCC_X86Common> ]>; @@ -155,7 +155,7 @@ def CC_X86_64_C : CallingConv<[ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM // registers on Darwin. - CCIfType<[v8i8, v4i16, v2i32, v2f32], + CCIfType<[v8i8, v4i16, v2i32], CCIfSubtarget<"isTargetDarwin()", CCIfSubtarget<"hasSSE2()", CCPromoteToType<v2i64>>>>, @@ -177,7 +177,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, // __m64 vectors get 8-byte stack slots that are 8-byte aligned. - CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToStack<8, 8>> + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> ]>; // Calling convention used on Win64 @@ -195,7 +195,7 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, // The first 4 MMX vector arguments are passed in GPRs. - CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCBitConvertToType<i64>>, // The first 4 integer arguments are passed in integer registers. @@ -254,7 +254,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx // registers if the call is not a vararg call. - CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32, v2f32], + CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32], CCAssignToReg<[MM0, MM1, MM2]>>>, // Integer/Float values get stored in stack slots that are 4 bytes in diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 8f02604..f13669b 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -138,7 +138,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { // MOVPC32r is basically a call plus a pop instruction. if (Desc.getOpcode() == X86::MOVPC32r) emitInstruction(*I, &II->get(X86::POP32r)); - NumEmitted++; // Keep track of the # of mi's emitted + ++NumEmitted; // Keep track of the # of mi's emitted } } } while (MCE.finishFunction(MF)); @@ -730,9 +730,9 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI, case X86II::MRMDestMem: { MCE.emitByte(BaseOpcode); emitMemModRMByte(MI, CurOp, - getX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands) + getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands) .getReg())); - CurOp += X86AddrNumOperands + 1; + CurOp += X86::AddrNumOperands + 1; if (CurOp != NumOps) emitConstant(MI.getOperand(CurOp++).getImm(), X86II::getSizeOfImm(Desc->TSFlags)); @@ -750,13 +750,7 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI, break; case X86II::MRMSrcMem: { - // FIXME: Maybe lea should have its own form? - int AddrOperands; - if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || - Opcode == X86::LEA16r || Opcode == X86::LEA32r) - AddrOperands = X86AddrNumOperands - 1; // No segment register - else - AddrOperands = X86AddrNumOperands; + int AddrOperands = X86::AddrNumOperands; intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? 
X86II::getSizeOfImm(Desc->TSFlags) : 0; @@ -810,14 +804,14 @@ case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: { - intptr_t PCAdj = (CurOp + X86AddrNumOperands != NumOps) ? - (MI.getOperand(CurOp+X86AddrNumOperands).isImm() ? + intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? + (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; MCE.emitByte(BaseOpcode); emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m, PCAdj); - CurOp += X86AddrNumOperands; + CurOp += X86::AddrNumOperands; if (CurOp == NumOps) break; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 1bc5eb7..cdde24a 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -23,7 +23,9 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -52,20 +54,7 @@ class X86FastISel : public FastISel { bool X86ScalarSSEf32; public: - explicit X86FastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &vm, - DenseMap<const BasicBlock *, MachineBasicBlock *> &bm, - DenseMap<const AllocaInst *, int> &am, - std::vector<std::pair<MachineInstr*, unsigned> > &pn -#ifndef NDEBUG - , SmallSet<const Instruction *, 8> &cil -#endif - ) - : FastISel(mf, vm, bm, am, pn -#ifndef NDEBUG - , cil -#endif - ) { + explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) { Subtarget = &TM.getSubtarget<X86Subtarget>(); StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; X86ScalarSSEf64 = Subtarget->hasSSE2(); @@ -96,6 +85,8 @@ private: bool X86SelectStore(const Instruction *I); + bool X86SelectRet(const Instruction *I); + bool X86SelectCmp(const Instruction *I); bool X86SelectZExt(const Instruction *I); @@ -117,6 +108,7 @@ private: bool X86SelectCall(const Instruction *I); CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isTailCall = false); + CCAssignFn *CCAssignFnForRet(CallingConv::ID CC, bool isTailCall = false); const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getInstrInfo(); @@ -190,6 +182,20 @@ CCAssignFn *X86FastISel::CCAssignFnForCall(CallingConv::ID CC, return CC_X86_32_C; } +/// CCAssignFnForRet - Selects the correct CCAssignFn for a given calling +/// convention. +CCAssignFn *X86FastISel::CCAssignFnForRet(CallingConv::ID CC, + bool isTailCall) { + if (Subtarget->is64Bit()) { + if (Subtarget->isTargetWin64()) + return RetCC_X86_Win64_C; + else + return RetCC_X86_64_C; + } + + return RetCC_X86_32_C; +} + /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. @@ -242,7 +248,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, } ResultReg = createResultReg(RC); - addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc), ResultReg), AM); return true; } @@ -261,7 +268,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, case MVT::i1: { // Mask out all but lowest bit.
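// (Only bit 0 of an i1 is defined; the AND8ri below computes Val &= 1 so
// stray high bits in the 8-bit register never reach memory.)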
unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(MBB, DL, + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); Val = AndResult; } @@ -278,7 +285,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, break; } - addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM).addReg(Val); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM).addReg(Val); return true; } @@ -306,7 +314,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } if (Opc) { - addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM) + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM) .addImm(Signed ? (uint64_t) CI->getSExtValue() : CI->getZExtValue()); return true; @@ -342,6 +351,12 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { const User *U = NULL; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.MBBMap[I->getParent()] != FuncInfo.MBB) + return false; + Opcode = I->getOpcode(); U = I; } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { @@ -349,6 +364,12 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { U = C; } + if (const PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + switch (Opcode) { default: break; case Instruction::BitCast: @@ -370,8 +391,9 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { case Instruction::Alloca: { // Do static allocas. const AllocaInst *A = cast<AllocaInst>(V); - DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A); - if (SI != StaticAllocaMap.end()) { + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(A); + if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = SI->second; return true; @@ -411,20 +433,33 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { Disp += SL->getElementOffset(Idx); } else { uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - // Constant-offset addressing. - Disp += CI->getSExtValue() * S; - } else if (IndexReg == 0 && - (!AM.GV || !Subtarget->isPICStyleRIPRel()) && - (S == 1 || S == 2 || S == 4 || S == 8)) { - // Scaled-index addressing. - Scale = S; - IndexReg = getRegForGEPIndex(Op).first; - if (IndexReg == 0) - return false; - } else - // Unsupported. - goto unsupported_gep; + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(Op); + do { + Op = Worklist.pop_back_val(); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + } else if (isa<AddOperator>(Op) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Add the other operand back to the work list. 
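// (For a GEP index of the form (add i64 %i, 7) with element size S, the
// constant contributes 7*S to Disp here, and %i goes back on the worklist
// so it can still become the scaled index register.)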
+ Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); + } else if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + } else + // Unsupported. + goto unsupported_gep; + } while (!Worklist.empty()); } } // Check for displacement overflow. @@ -473,7 +508,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // If this reference is relative to the pic base, set it now. if (isGlobalRelativeToPICBase(GVFlags)) { // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF); + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } // Unless the ABI requires an extra load, return a direct reference to @@ -504,6 +539,9 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { StubAM.GV = GV; StubAM.GVOpFlags = GVFlags; + // Prepare for inserting code in the local-value area. + MachineBasicBlock::iterator SaveInsertPt = enterLocalValueArea(); + if (TLI.getPointerTy() == MVT::i64) { Opc = X86::MOV64rm; RC = X86::GR64RegisterClass; @@ -516,8 +554,13 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { } LoadReg = createResultReg(RC); - addFullAddress(BuildMI(MBB, DL, TII.get(Opc), LoadReg), StubAM); - + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + // Prevent loading GV stub multiple times in same MBB. LocalValueMap[V] = LoadReg; } @@ -642,6 +685,93 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { return X86FastEmitStore(VT, I->getOperand(0), AM); } +/// X86SelectRet - Select and emit code to implement ret instructions. +bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall) + return false; + + if (Subtarget->isTargetWin64()) + return false; + + // Don't handle popping bytes on return for now. + if (FuncInfo.MF->getInfo<X86MachineFunctionInfo>() + ->getBytesToPopOnReturn() != 0) + return 0; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForRet(CC)); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. 
+ if (!VA.isRegLoc()) + return false; + // TODO: For now, don't try to handle cases where getLocInfo() + // says Full but the types don't match. + if (VA.getValVT() != TLI.getValueType(RV->getType())) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + + // Make the copy. + unsigned SrcReg = Reg + VA.getValNo(); + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + DstReg).addReg(SrcReg); + + // Mark the register as live out of the function. + MRI.addLiveOut(VA.getLocReg()); + } + + // Now emit the RET. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET)); + return true; +} + /// X86SelectLoad - Select and emit code to implement load instructions. /// bool X86FastISel::X86SelectLoad(const Instruction *I) { @@ -661,15 +791,15 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { return false; } -static unsigned X86ChooseCmpOpcode(EVT VT) { +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { switch (VT.getSimpleVT().SimpleTy) { default: return 0; case MVT::i8: return X86::CMP8rr; case MVT::i16: return X86::CMP16rr; case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; - case MVT::f32: return X86::UCOMISSrr; - case MVT::f64: return X86::UCOMISDrr; + case MVT::f32: return Subtarget->hasSSE1() ? X86::UCOMISSrr : 0; + case MVT::f64: return Subtarget->hasSSE2() ? X86::UCOMISDrr : 0; } } @@ -706,18 +836,21 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, // CMPri, otherwise use CMPrr. 
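// (Comparing against a ConstantInt directly avoids materializing the
// immediate: cmp $42, %eax instead of mov $42, %ecx; cmp %ecx, %eax.)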
if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { - BuildMI(MBB, DL, TII.get(CompareImmOpc)).addReg(Op0Reg) - .addImm(Op1C->getSExtValue()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); return true; } } - unsigned CompareOpc = X86ChooseCmpOpcode(VT); + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); if (CompareOpc == 0) return false; unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; - BuildMI(MBB, DL, TII.get(CompareOpc)).addReg(Op0Reg).addReg(Op1Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); return true; } @@ -739,9 +872,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned EReg = createResultReg(&X86::GR8RegClass); unsigned NPReg = createResultReg(&X86::GR8RegClass); - BuildMI(MBB, DL, TII.get(X86::SETEr), EReg); - BuildMI(MBB, DL, TII.get(X86::SETNPr), NPReg); - BuildMI(MBB, DL, + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::SETNPr), NPReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); UpdateValueMap(I, ResultReg); return true; @@ -752,9 +886,13 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned NEReg = createResultReg(&X86::GR8RegClass); unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(MBB, DL, TII.get(X86::SETNEr), NEReg); - BuildMI(MBB, DL, TII.get(X86::SETPr), PReg); - BuildMI(MBB, DL, TII.get(X86::OR8rr), ResultReg).addReg(PReg).addReg(NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::OR8rr), ResultReg) + .addReg(PReg).addReg(NEReg); UpdateValueMap(I, ResultReg); return true; } @@ -793,7 +931,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!X86FastEmitCompare(Op0, Op1, VT)) return false; - BuildMI(MBB, DL, TII.get(SetCCOpc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg); UpdateValueMap(I, ResultReg); return true; } @@ -819,8 +957,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Unconditional branches are selected by tablegen-generated code. // Handle a conditional branch. const BranchInst *BI = cast<BranchInst>(I); - MachineBasicBlock *TrueMBB = MBBMap[BI->getSuccessor(0)]; - MachineBasicBlock *FalseMBB = MBBMap[BI->getSuccessor(1)]; + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; // Fold the common case of a conditional branch with a comparison. if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { @@ -829,7 +967,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Try to take advantage of fallthrough opportunities. 
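// (If the true destination is the layout successor, swapping the successors
// and inverting the predicate turns "jcc true; jmp false" into one inverted
// jcc that falls through to the true block.)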
CmpInst::Predicate Predicate = CI->getPredicate(); - if (MBB->isLayoutSuccessor(TrueMBB)) { + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } @@ -878,16 +1016,18 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (!X86FastEmitCompare(Op0, Op1, VT)) return false; - BuildMI(MBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc)) + .addMBB(TrueMBB); if (Predicate == CmpInst::FCMP_UNE) { // X86 requires a second branch to handle UNE (and OEQ, // which is mapped to UNE above). - BuildMI(MBB, DL, TII.get(X86::JP_4)).addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4)) + .addMBB(TrueMBB); } - FastEmitBranch(FalseMBB); - MBB->addSuccessor(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); return true; } } else if (ExtractValueInst *EI = @@ -910,10 +1050,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow || CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) { const MachineInstr *SetMI = 0; - unsigned Reg = lookUpRegForValue(EI); + unsigned Reg = getRegForValue(EI); for (MachineBasicBlock::const_reverse_iterator - RI = MBB->rbegin(), RE = MBB->rend(); RI != RE; ++RI) { + RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend(); + RI != RE; ++RI) { const MachineInstr &MI = *RI; if (MI.definesRegister(Reg)) { @@ -938,11 +1079,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { unsigned OpCode = SetMI->getOpcode(); if (OpCode == X86::SETOr || OpCode == X86::SETBr) { - BuildMI(MBB, DL, TII.get(OpCode == X86::SETOr ? - X86::JO_4 : X86::JB_4)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpCode == X86::SETOr ? X86::JO_4 : X86::JB_4)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB); - MBB->addSuccessor(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); return true; } } @@ -954,10 +1095,12 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; - BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(OpReg).addReg(OpReg); - BuildMI(MBB, DL, TII.get(X86::JNE_4)).addMBB(TrueMBB); - FastEmitBranch(FalseMBB); - MBB->addSuccessor(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) + .addReg(OpReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); return true; } @@ -1014,7 +1157,7 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { // Fold immediate in shl(x,3). if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { unsigned ResultReg = createResultReg(RC); - BuildMI(MBB, DL, TII.get(OpImm), + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); UpdateValueMap(I, ResultReg); return true; @@ -1022,17 +1165,19 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; - TII.copyRegToReg(*MBB, MBB->end(), CReg, Op1Reg, RC, RC, DL); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); // The shift instruction uses X86::CL. 
If we defined a super-register - // of X86::CL, emit an EXTRACT_SUBREG to precisely describe what - // we're doing here. + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. if (CReg != X86::CL) - BuildMI(MBB, DL, TII.get(TargetOpcode::EXTRACT_SUBREG), X86::CL) - .addReg(CReg).addImm(X86::sub_8bit); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); - BuildMI(MBB, DL, TII.get(OpReg), ResultReg).addReg(Op0Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); UpdateValueMap(I, ResultReg); return true; } @@ -1064,9 +1209,11 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { unsigned Op2Reg = getRegForValue(I->getOperand(2)); if (Op2Reg == 0) return false; - BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(Op0Reg).addReg(Op0Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) + .addReg(Op0Reg).addReg(Op0Reg); unsigned ResultReg = createResultReg(RC); - BuildMI(MBB, DL, TII.get(Opc), ResultReg).addReg(Op1Reg).addReg(Op2Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) + .addReg(Op1Reg).addReg(Op2Reg); UpdateValueMap(I, ResultReg); return true; } @@ -1080,7 +1227,9 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; unsigned ResultReg = createResultReg(X86::FR64RegisterClass); - BuildMI(MBB, DL, TII.get(X86::CVTSS2SDrr), ResultReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSS2SDrr), ResultReg) + .addReg(OpReg); UpdateValueMap(I, ResultReg); return true; } @@ -1097,7 +1246,9 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; unsigned ResultReg = createResultReg(X86::FR32RegisterClass); - BuildMI(MBB, DL, TII.get(X86::CVTSD2SSrr), ResultReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSD2SSrr), ResultReg) + .addReg(OpReg); UpdateValueMap(I, ResultReg); return true; } @@ -1132,7 +1283,8 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; unsigned CopyReg = createResultReg(CopyRC); - BuildMI(MBB, DL, TII.get(CopyOpc), CopyReg).addReg(InputReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CopyOpc), CopyReg) + .addReg(InputReg); // Then issue an extract_subreg. unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, @@ -1153,14 +1305,18 @@ bool X86FastISel::X86SelectExtractValue(const Instruction *I) { switch (CI->getIntrinsicID()) { default: break; case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: + case Intrinsic::uadd_with_overflow: { // Cheat a little. We know that the registers for "add" and "seto" are // allocated sequentially. However, we only keep track of the register // for "add" in the value map. Use extractvalue's index to get the // correct register for "seto". - UpdateValueMap(I, lookUpRegForValue(Agg) + *EI->idx_begin()); + unsigned OpReg = getRegForValue(Agg); + if (OpReg == 0) + return false; + UpdateValueMap(I, OpReg + *EI->idx_begin()); return true; } + } } return false; @@ -1174,8 +1330,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Emit code inline code to store the stack guard onto the stack. 
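// (llvm.stackprotector carries two arguments: ArgOperand(0) is the guard
// value and ArgOperand(1) is the alloca slot it must be stored into; the
// X86FastEmitStore call below performs that store directly.)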
EVT PtrTy = TLI.getPointerTy(); - const Value *Op1 = I.getOperand(1); // The guard's value. - const AllocaInst *Slot = cast<AllocaInst>(I.getOperand(2)); + const Value *Op1 = I.getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); // Grab the frame index. X86AddressMode AM; @@ -1186,7 +1342,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return true; } case Intrinsic::objectsize: { - ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(2)); + ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); const Type *Ty = I.getCalledFunction()->getReturnType(); assert(CI && "Non-constant type in Intrinsic::objectsize?"); @@ -1204,8 +1360,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return false; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(MBB, DL, TII.get(OpC), ResultReg). - addImm(CI->getZExtValue() == 0 ? -1ULL : 0); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg). + addImm(CI->isZero() ? -1ULL : 0); UpdateValueMap(&I, ResultReg); return true; } @@ -1218,12 +1374,12 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. - addFullAddress(BuildMI(MBB, DL, II), AM).addImm(0). - addMetadata(DI->getVariable()); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM). + addImm(0).addMetadata(DI->getVariable()); return true; } case Intrinsic::trap: { - BuildMI(MBB, DL, TII.get(X86::TRAP)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP)); return true; } case Intrinsic::sadd_with_overflow: @@ -1241,8 +1397,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { if (!isTypeLegal(RetTy, VT)) return false; - const Value *Op1 = I.getOperand(1); - const Value *Op2 = I.getOperand(2); + const Value *Op1 = I.getArgOperand(0); + const Value *Op2 = I.getArgOperand(1); unsigned Reg1 = getRegForValue(Op1); unsigned Reg2 = getRegForValue(Op2); @@ -1259,7 +1415,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return false; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(MBB, DL, TII.get(OpC), ResultReg).addReg(Reg1).addReg(Reg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) + .addReg(Reg1).addReg(Reg2); unsigned DestReg1 = UpdateValueMap(&I, ResultReg); // If the add with overflow is an intra-block value then we just want to @@ -1277,7 +1434,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { unsigned Opc = X86::SETBr; if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) Opc = X86::SETOr; - BuildMI(MBB, DL, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); return true; } } @@ -1285,7 +1442,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { bool X86FastISel::X86SelectCall(const Instruction *I) { const CallInst *CI = cast<CallInst>(I); - const Value *Callee = I->getOperand(0); + const Value *Callee = CI->getCalledValue(); // Can't handle inline asm yet. if (isa<InlineAsm>(Callee)) @@ -1314,6 +1471,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (FTy->isVarArg()) return false; + // Fast-isel doesn't know about callee-pop yet. + if (Subtarget->IsCalleePop(FTy->isVarArg(), CC)) + return false; + // Handle *simple* calls for now. 
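// (The IsCalleePop bail-out above covers conventions like x86-32 stdcall,
// where the callee removes its own argument area with "ret $imm"; the
// CALLSEQ bookkeeping below assumes the caller readjusts the stack.)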
const Type *RetTy = CS.getType(); EVT RetVT; @@ -1387,6 +1548,12 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); + + // Allocate shadow area for Win64 + if (Subtarget->isTargetWin64()) { + CCInfo.AllocateStack(32, 8); + } + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. @@ -1394,7 +1561,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Issue CALLSEQ_START unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode(); - BuildMI(MBB, DL, TII.get(AdjStackDown)).addImm(NumBytes); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown)) + .addImm(NumBytes); // Process argument: walk the register/memloc assignments, inserting // copies / loads. @@ -1449,11 +1617,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } if (VA.isRegLoc()) { - TargetRegisterClass* RC = TLI.getRegClassFor(ArgVT); - bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), VA.getLocReg(), - Arg, RC, RC, DL); - assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted; - Emitted = true; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + VA.getLocReg()).addReg(Arg); RegArgs.push_back(VA.getLocReg()); } else { unsigned LocMemOffset = VA.getLocMemOffset(); @@ -1475,12 +1640,9 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (Subtarget->isPICStyleGOT()) { - TargetRegisterClass *RC = X86::GR32RegisterClass; - unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF); - bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC, - DL); - assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted; - Emitted = true; + unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + X86::EBX).addReg(Base); } // Issue the call. @@ -1488,7 +1650,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CalleeOp) { // Register-indirect call. unsigned CallOpc = Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r; - MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) + .addReg(CalleeOp); } else { // Direct call. @@ -1517,7 +1680,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } - MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV, 0, OpFlags); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) + .addGlobalAddress(GV, 0, OpFlags); } // Add an implicit use GOT pointer in EBX. @@ -1530,9 +1694,11 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Issue CALLSEQ_END unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); - BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(0); // Now handle call return value (if any). 
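The AllocateStack(32, 8) call above reserves the Win64 shadow area: four 8-byte home slots the caller must provide for the register arguments RCX, RDX, R8 and R9, even when fewer than four arguments are passed. A back-of-the-envelope sketch of the resulting outgoing-argument size (hypothetical helper, integer-sized arguments assumed):

    #include <cstdio>

    // Win64 always reserves 32 bytes of "home" space for the four
    // register arguments; arguments past the fourth go on the stack.
    static unsigned win64OutgoingBytes(unsigned NumIntArgs) {
      const unsigned ShadowArea = 4 * 8;
      unsigned StackArgs = NumIntArgs > 4 ? (NumIntArgs - 4) * 8 : 0;
      return ShadowArea + StackArgs;
    }

    int main() {
      std::printf("%u %u\n", win64OutgoingBytes(2),  // 32
                  win64OutgoingBytes(6));            // 48
    }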
+ SmallVector<unsigned, 4> UsedRegs; if (RetVT.getSimpleVT().SimpleTy != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext()); @@ -1542,7 +1708,6 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { assert(RVLocs.size() == 1 && "Can't handle multi-value calls!"); EVT CopyVT = RVLocs[0].getValVT(); TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); - TargetRegisterClass *SrcRC = DstRC; // If this is a call to a function that returns an fp value on the x87 fp // stack, but where we prefer to use the value in xmm registers, copy it @@ -1551,15 +1716,14 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { RVLocs[0].getLocReg() == X86::ST1) && isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { CopyVT = MVT::f80; - SrcRC = X86::RSTRegisterClass; DstRC = X86::RFP80RegisterClass; } unsigned ResultReg = createResultReg(DstRC); - bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - RVLocs[0].getLocReg(), DstRC, SrcRC, DL); - assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted; - Emitted = true; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[0].getLocReg()); + if (CopyVT != RVLocs[0].getValVT()) { // Round the F80 the right size, which also moves to the appropriate xmm // register. This is accomplished by storing the F80 value in memory and @@ -1568,18 +1732,21 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); - addFrameReference(BuildMI(MBB, DL, TII.get(Opc)), FI).addReg(ResultReg); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc)), FI) + .addReg(ResultReg); DstRC = ResVT == MVT::f32 ? X86::FR32RegisterClass : X86::FR64RegisterClass; Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; ResultReg = createResultReg(DstRC); - addFrameReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), FI); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), FI); } if (AndToI1) { // Mask out all but lowest bit for some call which produces an i1. unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(MBB, DL, + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1); ResultReg = AndResult; } @@ -1587,6 +1754,9 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { UpdateValueMap(I, ResultReg); } + // Set all unused physreg defs as dead. 
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + return true; } @@ -1599,6 +1769,8 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectLoad(I); case Instruction::Store: return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); case Instruction::ICmp: case Instruction::FCmp: return X86SelectCmp(I); @@ -1699,7 +1871,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { else Opc = X86::LEA64r; unsigned ResultReg = createResultReg(RC); - addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), AM); return ResultReg; } return 0; @@ -1717,10 +1890,10 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { unsigned char OpFlag = 0; if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic OpFlag = X86II::MO_PIC_BASE_OFFSET; - PICBase = getInstrInfo()->getGlobalBaseReg(&MF); + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } else if (Subtarget->isPICStyleGOT()) { OpFlag = X86II::MO_GOTOFF; - PICBase = getInstrInfo()->getGlobalBaseReg(&MF); + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } else if (Subtarget->isPICStyleRIPRel() && TM.getCodeModel() == CodeModel::Small) { PICBase = X86::RIP; @@ -1729,7 +1902,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // Create the load from the constant pool. unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align); unsigned ResultReg = createResultReg(RC); - addConstantPoolReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), MCPOffset, PICBase, OpFlag); return ResultReg; @@ -1743,7 +1917,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { // various places, but TargetMaterializeAlloca also needs a check // in order to avoid recursion between getRegForValue, // X86SelectAddress, and TargetMaterializeAlloca. - if (!StaticAllocaMap.count(C)) + if (!FuncInfo.StaticAllocaMap.count(C)) return 0; X86AddressMode AM; @@ -1752,24 +1926,13 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); - addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), AM); return ResultReg; } namespace llvm { - llvm::FastISel *X86::createFastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &vm, - DenseMap<const BasicBlock *, MachineBasicBlock *> &bm, - DenseMap<const AllocaInst *, int> &am, - std::vector<std::pair<MachineInstr*, unsigned> > &pn -#ifndef NDEBUG - , SmallSet<const Instruction *, 8> &cil -#endif - ) { - return new X86FastISel(mf, vm, bm, am, pn -#ifndef NDEBUG - , cil -#endif - ); + llvm::FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { + return new X86FastISel(funcInfo); } } diff --git a/lib/Target/X86/X86FixupKinds.h b/lib/Target/X86/X86FixupKinds.h index a8117d4..96e0aae 100644 --- a/lib/Target/X86/X86FixupKinds.h +++ b/lib/Target/X86/X86FixupKinds.h @@ -17,6 +17,7 @@ namespace X86 { enum Fixups { reloc_pcrel_4byte = FirstTargetFixupKind, // 32-bit pcrel, e.g. a branch. reloc_pcrel_1byte, // 8-bit pcrel, e.g. branch_1 + reloc_pcrel_2byte, // 16-bit pcrel, e.g.
callw reloc_riprel_4byte, // 32-bit rip-relative reloc_riprel_4byte_movq_load // 32-bit rip-relative in movq }; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 93460ef..cee4ad7 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -133,7 +133,7 @@ namespace { // Emit an fxch to update the runtime processor's version of the state. BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); - NumFXCH++; + ++NumFXCH; } void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { @@ -164,6 +164,8 @@ namespace { void handleCompareFP(MachineBasicBlock::iterator &I); void handleCondMovFP(MachineBasicBlock::iterator &I); void handleSpecialFP(MachineBasicBlock::iterator &I); + + bool translateCopy(MachineInstr*); }; char FPS::ID = 0; } @@ -232,12 +234,15 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { MachineInstr *MI = I; - unsigned Flags = MI->getDesc().TSFlags; + uint64_t Flags = MI->getDesc().TSFlags; unsigned FPInstClass = Flags & X86II::FPTypeMask; if (MI->isInlineAsm()) FPInstClass = X86II::SpecialFP; - + + if (MI->isCopy() && translateCopy(MI)) + FPInstClass = X86II::SpecialFP; + if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! @@ -628,7 +633,7 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MachineInstr *MI = I; unsigned NumOps = MI->getDesc().getNumOperands(); - assert((NumOps == X86AddrNumOperands + 1 || NumOps == 1) && + assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) && "Can only handle fst* & ftst instructions!"); // Is this the last use of the source register? @@ -1001,15 +1006,17 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { case X86::FpSET_ST0_32: case X86::FpSET_ST0_64: case X86::FpSET_ST0_80: { + // FpSET_ST0_80 is generated by copyRegToReg for setting up inline asm + // arguments that use an st constraint. We expect a sequence of + // instructions: Fp_SET_ST0 Fp_SET_ST1? INLINEASM unsigned Op0 = getFPReg(MI->getOperand(0)); - // FpSET_ST0_80 is generated by copyRegToReg for both function return - // and inline assembly with the "st" constrain. In the latter case, - // it is possible for ST(0) to be alive after this instruction. if (!MI->killsRegister(X86::FP0 + Op0)) { - // Duplicate Op0 - duplicateToTop(0, 7 /*temp register*/, I); + // Duplicate Op0 into a temporary on the stack top. + // This actually assumes that FP7 is dead. + duplicateToTop(Op0, 7, I); } else { + // Op0 is killed, so just swap it into position. moveToTop(Op0, I); } --StackTop; // "Forget" we have something on the top of stack! break; } case X86::FpSET_ST1_32: case X86::FpSET_ST1_64: - case X86::FpSET_ST1_80: - // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them. - if (StackTop == 1) { - BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1); - NumFXCH++; - StackTop = 0; - break; + case X86::FpSET_ST1_80: { + // Set up st(1) for inline asm. We are assuming that st(0) has already been + // set up by FpSET_ST0, and our StackTop is off by one because of it. + unsigned Op0 = getFPReg(MI->getOperand(0)); + // Restore the actual StackTop from before Fp_SET_ST0. + // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we + // are not enforcing the constraint.
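The StackTop bookkeeping in these FpSET handlers is easier to follow with the stackifier's model in mind: it maps x87 stack slots to virtual FP registers, moveToTop is realized as a single fxch against st(0), and handing a value off "forgets" st(0). A toy model of just that state, under the simplifying assumption that one swap stands for one FXCH:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Toy FP-stackifier state: the vector holds virtual FP register
    // numbers, with back() playing the role of st(0).
    struct FPStackModel {
      std::vector<unsigned> Stack;

      void moveToTop(unsigned Reg) {        // one FXCH st(i)
        auto It = std::find(Stack.begin(), Stack.end(), Reg);
        if (It != Stack.end() && &*It != &Stack.back())
          std::swap(*It, Stack.back());
      }
      void popTop() { Stack.pop_back(); }   // "forget" st(0)
    };

    int main() {
      FPStackModel M{{2, 0, 1}}; // st(0)=FP1, st(1)=FP0, st(2)=FP2
      M.moveToTop(0);            // FP0 into st(0), as FpSET_ST0 requires
      M.popTop();                // hand it off; the --StackTop above
      std::printf("depth=%zu st0=FP%u\n", M.Stack.size(), M.Stack.back());
    }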
+ ++StackTop; + unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). + if (!MI->killsRegister(X86::FP0 + Op0)) { + // Assume FP6 is not live, use it as a scratch register. + duplicateToTop(Op0, 6, I); + moveToTop(RegOnTop, I); + } else if (getSTReg(Op0) != X86::ST1) { + // We have the wrong value at st(1). Shuffle! Untested! + moveToTop(getStackEntry(1), I); + moveToTop(Op0, I); + moveToTop(RegOnTop, I); } - assert(StackTop == 2 && "Stack should have two element on it to return!"); - --StackTop; // "Forget" we have something on the top of stack! + assert(StackTop >= 2 && "Too few live registers"); + StackTop -= 2; // "Forget" both st(0) and st(1). break; + } case X86::MOV_Fp3232: case X86::MOV_Fp3264: case X86::MOV_Fp6432: @@ -1041,32 +1060,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { unsigned SrcReg = getFPReg(MO1); const MachineOperand &MO0 = MI->getOperand(0); - // These can be created due to inline asm. Two address pass can introduce - // copies from RFP registers to virtual registers. - if (MO0.getReg() == X86::ST0 && SrcReg == 0) { - assert(MO1.isKill()); - // Treat %ST0<def> = MOV_Fp8080 %FP0<kill> - // like FpSET_ST0_80 %FP0<kill>, %ST0<imp-def> - assert((StackTop == 1 || StackTop == 2) - && "Stack should have one or two element on it to return!"); - --StackTop; // "Forget" we have something on the top of stack! - break; - } else if (MO0.getReg() == X86::ST1 && SrcReg == 1) { - assert(MO1.isKill()); - // Treat %ST1<def> = MOV_Fp8080 %FP1<kill> - // like FpSET_ST1_80 %FP0<kill>, %ST1<imp-def> - // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them. - if (StackTop == 1) { - BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1); - NumFXCH++; - StackTop = 0; - break; - } - assert(StackTop == 2 && "Stack should have two element on it to return!"); - --StackTop; // "Forget" we have something on the top of stack! - break; - } - unsigned DestReg = getFPReg(MO0); if (MI->killsRegister(X86::FP0+SrcReg)) { // If the input operand is killed, we can just change the owner of the @@ -1206,3 +1199,33 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { I = MBB->erase(I); // Remove the pseudo instruction --I; } + +// Translate a COPY instruction to a pseudo-op that handleSpecialFP understands. +bool FPS::translateCopy(MachineInstr *MI) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + if (DstReg == X86::ST0) { + MI->setDesc(TII->get(X86::FpSET_ST0_80)); + MI->RemoveOperand(0); + return true; + } + if (DstReg == X86::ST1) { + MI->setDesc(TII->get(X86::FpSET_ST1_80)); + MI->RemoveOperand(0); + return true; + } + if (SrcReg == X86::ST0) { + MI->setDesc(TII->get(X86::FpGET_ST0_80)); + return true; + } + if (SrcReg == X86::ST1) { + MI->setDesc(TII->get(X86::FpGET_ST1_80)); + return true; + } + if (X86::RFP80RegClass.contains(DstReg, SrcReg)) { + MI->setDesc(TII->get(X86::MOV_Fp8080)); + return true; + } + return false; +} diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp index 747683d..2c98b96 100644 --- a/lib/Target/X86/X86FloatingPointRegKill.cpp +++ b/lib/Target/X86/X86FloatingPointRegKill.cpp @@ -72,18 +72,15 @@ static bool isFPStackVReg(unsigned RegNo, const MachineRegisterInfo &MRI) { /// stack code, and thus needs an FP_REG_KILL. static bool ContainsFPStackCode(MachineBasicBlock *MBB, const MachineRegisterInfo &MRI) { - // Scan the block, looking for instructions that define fp stack vregs. 
+ // Scan the block, looking for instructions that define or use fp stack vregs. for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { - if (I->getNumOperands() == 0 || !I->getOperand(0).isReg()) - continue; - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { - if (!I->getOperand(op).isReg() || !I->getOperand(op).isDef()) + if (!I->getOperand(op).isReg()) continue; - - if (isFPStackVReg(I->getOperand(op).getReg(), MRI)) - return true; + if (unsigned Reg = I->getOperand(op).getReg()) + if (isFPStackVReg(Reg, MRI)) + return true; } } @@ -108,8 +105,8 @@ static bool ContainsFPStackCode(MachineBasicBlock *MBB, bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) { // If we are emitting FP stack code, scan the basic block to determine if this - // block defines any FP values. If so, put an FP_REG_KILL instruction before - // the terminator of the block. + // block defines or uses any FP values. If so, put an FP_REG_KILL instruction + // before the terminator of the block. // Note that FP stack instructions are used in all modes for long double, // so we always need to do this check. diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 0f64383..72f2bc1 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -137,21 +137,6 @@ namespace { } namespace { - class X86ISelListener : public SelectionDAG::DAGUpdateListener { - SmallSet<SDNode*, 4> Deletes; - public: - explicit X86ISelListener() {} - virtual void NodeDeleted(SDNode *N, SDNode *E) { - Deletes.insert(N); - } - virtual void NodeUpdated(SDNode *N) { - // Ignore updates. - } - bool IsDeleted(SDNode *N) { - return Deletes.count(N); - } - }; - //===--------------------------------------------------------------------===// /// ISel - X86 specific code to select X86 machine instructions for /// SelectionDAG operations. @@ -199,16 +184,17 @@ namespace { bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); bool MatchAddress(SDValue N, X86ISelAddressMode &AM); bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, - X86ISelListener &DeadNodes, unsigned Depth); bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool SelectLEAAddr(SDNode *Op, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, SDValue &Disp); + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); bool SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, SDValue &Disp); + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); bool SelectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -239,7 +225,8 @@ namespace { // These are 32-bit even in 64-bit mode since RIP relative offset // is 32-bit. 
if (AM.GV) - Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp, + Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(), + MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, @@ -386,14 +373,14 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, } for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) Ops.push_back(OrigChain.getOperand(i)); - CurDAG->UpdateNodeOperands(OrigChain, &Ops[0], Ops.size()); - CurDAG->UpdateNodeOperands(Load, Call.getOperand(0), + CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i) Ops.push_back(Call.getOperand(i)); - CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size()); } /// isCalleeLoad - Return true if call address is a load and it can be @@ -515,7 +502,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { N->getOperand(0), MemTmp, NULL, 0, MemVT, false, false, 0); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, DstVT, dl, Store, MemTmp, NULL, 0, MemVT, false, false, 0); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the @@ -664,8 +651,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { /// returning true if it cannot be done. This just pattern matches for the /// addressing mode. bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { - X86ISelListener DeadNodes; - if (MatchAddressRecursively(N, AM, DeadNodes, 0)) + if (MatchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has @@ -713,7 +699,6 @@ static bool isLogicallyAddWithConstant(SDValue V, SelectionDAG *CurDAG) { } bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, - X86ISelListener &DeadNodes, unsigned Depth) { bool is64Bit = Subtarget->is64Bit(); DebugLoc dl = N.getDebugLoc(); @@ -876,13 +861,13 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // other uses, since it avoids a two-address sub instruction, however // it costs an additional mov if the index register has other uses. + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, - DeadNodes, Depth+1) || - // If it is successful but the recursive update causes N to be deleted, - // then it's not safe to continue. - DeadNodes.IsDeleted(N.getNode())) { + if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -893,7 +878,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } int Cost = 0; - SDValue RHS = N.getNode()->getOperand(1); + SDValue RHS = Handle.getValue().getNode()->getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. 
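The HandleSDNode idiom above deserves a comment: the recursive match may CSE or delete nodes, and if N itself is replaced mid-recursion a raw SDValue would dangle; a HandleSDNode registers an artificial use that the replacement machinery retargets, so Handle.getValue() always names the surviving node. A generic analogue of the shape of the idiom (not the SelectionDAG API):

    #include <cstdio>
    #include <memory>

    struct Node { int Id; };

    // Analogue of HandleSDNode: a held "use" that replacement updates,
    // so the holder can re-read the surviving node afterwards.
    struct Handle {
      std::shared_ptr<Node> N;
      explicit Handle(std::shared_ptr<Node> InitialN) : N(std::move(InitialN)) {}
      Node *get() const { return N.get(); }
      void replaceAllUsesWith(std::shared_ptr<Node> NewN) { N = std::move(NewN); }
    };

    int main() {
      Handle H(std::make_shared<Node>(Node{1}));
      H.replaceAllUsesWith(std::make_shared<Node>(Node{2})); // e.g. a CSE fold
      std::printf("%d\n", H.get()->Id); // 2: the handle followed the update
    }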
@@ -944,35 +929,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } case ISD::ADD: { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + SDValue LHS = Handle.getValue().getNode()->getOperand(0); + SDValue RHS = Handle.getValue().getNode()->getOperand(1); + X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM, - DeadNodes, Depth+1)) { - if (DeadNodes.IsDeleted(N.getNode())) - // If it is successful but the recursive update causes N to be deleted, - // then it's not safe to continue. - return true; - if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM, - DeadNodes, Depth+1)) - // If it is successful but the recursive update causes N to be deleted, - // then it's not safe to continue. - return DeadNodes.IsDeleted(N.getNode()); - } + if (!MatchAddressRecursively(LHS, AM, Depth+1) && + !MatchAddressRecursively(RHS, AM, Depth+1)) + return false; + AM = Backup; + LHS = Handle.getValue().getNode()->getOperand(0); + RHS = Handle.getValue().getNode()->getOperand(1); // Try again after commuting the operands. + if (!MatchAddressRecursively(RHS, AM, Depth+1) && + !MatchAddressRecursively(LHS, AM, Depth+1)) + return false; AM = Backup; - if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM, - DeadNodes, Depth+1)) { - if (DeadNodes.IsDeleted(N.getNode())) - // If it is successful but the recursive update causes N to be deleted, - // then it's not safe to continue. - return true; - if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM, - DeadNodes, Depth+1)) - // If it is successful but the recursive update causes N to be deleted, - // then it's not safe to continue. - return DeadNodes.IsDeleted(N.getNode()); - } - AM = Backup; + LHS = Handle.getValue().getNode()->getOperand(0); + RHS = Handle.getValue().getNode()->getOperand(1); // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least @@ -980,8 +957,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { - AM.Base_Reg = N.getNode()->getOperand(0); - AM.IndexReg = N.getNode()->getOperand(1); + AM.Base_Reg = LHS; + AM.IndexReg = RHS; AM.Scale = 1; return false; } @@ -996,7 +973,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, uint64_t Offset = CN->getSExtValue(); // Start with the LHS as an addr mode. - if (!MatchAddressRecursively(N.getOperand(0), AM, DeadNodes, Depth+1) && + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && // Address could not have picked a GV address for the displacement. AM.GV == NULL && // On x86-64, the resultant disp must fit in 32-bits. 
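The rewritten ISD::ADD case above is a textbook backtracking match: fold LHS then RHS into the addressing mode, on failure restore the saved mode and retry commuted, and as a last resort hand both operands over as base and index registers. In outline, with a toy operand matcher standing in for the recursion (LLVM's convention of returning false on success kept):

    #include <cstdio>

    struct AddrMode { int Base = -1, Index = -1; long Disp = 0; };

    // Toy rule: values >= 100 act like constants foldable into Disp;
    // anything else must end up in a register. Returning false means
    // "matched", following the convention in the code above.
    static bool matchOperand(int Op, AddrMode &AM) {
      if (Op >= 100) { AM.Disp += Op; return false; }
      return true;
    }

    static bool matchAdd(int LHS, int RHS, AddrMode &AM) {
      AddrMode Backup = AM;
      if (!matchOperand(LHS, AM) && !matchOperand(RHS, AM))
        return false;                 // both folded
      AM = Backup;
      if (!matchOperand(RHS, AM) && !matchOperand(LHS, AM))
        return false;                 // folded after commuting
      AM = Backup;                    // last resort: plain base + index
      if (AM.Base < 0 && AM.Index < 0) {
        AM.Base = LHS; AM.Index = RHS;
        return false;
      }
      return true;                    // no match
    }

    int main() {
      AddrMode AM;
      matchAdd(100, 200, AM);         // both "constants" fold: Disp = 300
      std::printf("Base=%d Index=%d Disp=%ld\n", AM.Base, AM.Index, AM.Disp);
    }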
@@ -1073,7 +1050,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, CurDAG->RepositionNode(N.getNode(), Shl.getNode()); Shl.getNode()->setNodeId(N.getNode()->getNodeId()); } - CurDAG->ReplaceAllUsesWith(N, Shl, &DeadNodes); + CurDAG->ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; @@ -1124,7 +1101,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId()); } - CurDAG->ReplaceAllUsesWith(N, NewSHIFT, &DeadNodes); + CurDAG->ReplaceAllUsesWith(N, NewSHIFT); AM.Scale = 1 << ShiftCst; AM.IndexReg = NewAND; @@ -1230,7 +1207,8 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp) { + SDValue &Index, SDValue &Disp, + SDValue &Segment) { X86ISelAddressMode AM; // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support @@ -1284,7 +1262,6 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N, if (Complexity <= 2) return false; - SDValue Segment; getAddressOperands(AM, Base, Scale, Index, Disp, Segment); return true; } @@ -1292,10 +1269,10 @@ /// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, - SDValue &Disp) { + SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); - + X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); @@ -1309,7 +1286,6 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base, AM.IndexReg = CurDAG->getRegister(0, MVT::i64); } - SDValue Segment; getAddressOperands(AM, Base, Scale, Index, Disp, Segment); return true; } @@ -1672,6 +1648,26 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0); } + // Prevent use of AH in a REX instruction by referencing AX instead. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + // Get the low part if needed. Don't use getCopyFromReg for aliasing + // registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 0), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX down 8 bits. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, MVT::i8)), 0); + // Then truncate it down to i8. + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, @@ -1682,24 +1678,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - SDValue Result; - if (HiReg == X86::AH && Subtarget->is64Bit()) { - // Prevent use of AH in a REX instruction by referencing AX instead. - // Shift it down 8 bits.
- Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, MVT::i8)), 0); - // Then truncate it down to i8. - Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, - MVT::i8, Result); - } else { - Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - HiReg, NVT, InFlag); - InFlag = Result.getValue(2); - } + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } @@ -1812,6 +1793,29 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0); } + // Prevent use of AH in a REX instruction by referencing AX instead. + // Shift it down 8 bits. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + + // If we also need AL (the quotient), get it by extracting a subreg from + // Result. The fast register allocator does not like multiple CopyFromReg + // nodes using aliasing registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 0), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX right by 8 bits instead of using AH. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, MVT::i8)), + 0); + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, @@ -1822,25 +1826,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the remainder (high) result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - SDValue Result; - if (HiReg == X86::AH && Subtarget->is64Bit()) { - // Prevent use of AH in a REX instruction by referencing AX instead. - // Shift it down 8 bits. - Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, MVT::i8)), - 0); - // Then truncate it down to i8. 
- Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, - MVT::i8, Result); - } else { - Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - HiReg, NVT, InFlag); - InFlag = Result.getValue(2); - } + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b02c33d..1a63474 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -62,21 +62,19 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { - switch (TM.getSubtarget<X86Subtarget>().TargetType) { - default: llvm_unreachable("unknown subtarget type"); - case X86Subtarget::isDarwin: - if (TM.getSubtarget<X86Subtarget>().is64Bit()) - return new X8664_MachoTargetObjectFile(); + + bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); + + if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { + if (is64Bit) return new X8664_MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); - case X86Subtarget::isELF: - if (TM.getSubtarget<X86Subtarget>().is64Bit()) - return new X8664_ELFTargetObjectFile(TM); + } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ + if (is64Bit) return new X8664_ELFTargetObjectFile(TM); return new X8632_ELFTargetObjectFile(TM); - case X86Subtarget::isMingw: - case X86Subtarget::isCygwin: - case X86Subtarget::isWindows: + } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) { return new TargetLoweringObjectFileCOFF(); - } + } + llvm_unreachable("unknown subtarget type"); } X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) @@ -347,6 +345,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!Subtarget->hasSSE2()) setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); + // On X86 and X86-64, atomic operations are lowered to locked instructions. + // Locked instructions, in turn, have implicit fence semantics (all memory + // operations are flushed before issuing the locked instruction, and they + // are not buffered), so we can fold away the common pattern of + // fence-atomic-fence. 
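The comment above is the whole justification for the setShouldFoldAtomicFences(true) call that follows: a LOCK-prefixed read-modify-write is already a full barrier on x86, so explicit fences bracketing it add nothing. In source terms, both fences below are candidates for exactly this folding, and the function may lower to a single lock xadd (a sketch; actual codegen depends on compiler and version):

    #include <atomic>
    #include <cstdio>

    std::atomic<int> Counter{0};

    int fencedIncrement() {
      std::atomic_thread_fence(std::memory_order_seq_cst); // foldable on x86
      int Old = Counter.fetch_add(1, std::memory_order_seq_cst); // lock xadd
      std::atomic_thread_fence(std::memory_order_seq_cst); // foldable on x86
      return Old;
    }

    int main() { std::printf("%d\n", fencedIncrement()); } // prints 0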
+ setShouldFoldAtomicFences(true); // Expand certain atomics setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); @@ -611,7 +615,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); - addRegisterClass(MVT::v2f32, X86::VR64RegisterClass, false); + addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); setOperationAction(ISD::ADD, MVT::v8i8, Legal); @@ -657,14 +661,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); setOperationAction(ISD::LOAD, MVT::v2i32, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); setOperationAction(ISD::LOAD, MVT::v1i64, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); @@ -672,7 +673,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); @@ -691,7 +691,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::v2f32, Custom); setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); } } @@ -792,9 +791,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) EVT VT = SVT; // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) { + if (!VT.is128BitVector()) continue; - } setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); @@ -825,6 +823,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (Subtarget->hasSSE41()) { + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -965,15 +974,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Add/Sub/Mul with overflow operations are custom lowered. 
setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + } if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. @@ -992,7 +1010,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::MEMBARRIER); setTargetDAGCombine(ISD::ZERO_EXTEND); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); @@ -1172,6 +1189,27 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; } +bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const { + if (!Subtarget->isTargetLinux()) + return false; + + if (Subtarget->is64Bit()) { + // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x28; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x14 on i386 + Offset = 0x14; + AddressSpace = 256; + } + return true; +} + + //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1180,19 +1218,19 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<EVT> &OutTys, - const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, - SelectionDAG &DAG) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); - return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86); + RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC_X86); } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); @@ -1220,7 +1258,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; 
assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue ValToCopy = Outs[i].Val; + SDValue ValToCopy = OutVals[i]; // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. @@ -1308,17 +1346,34 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, report_fatal_error("SSE register return with SSE disabled"); } + SDValue Val; + // If this is a call to a function that returns an fp value on the floating - // point stack, but where we prefer to use the value in xmm registers, copy - // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((VA.getLocReg() == X86::ST0 || - VA.getLocReg() == X86::ST1) && - isScalarFPTypeInSSEReg(VA.getValVT())) { - CopyVT = MVT::f80; - } + // point stack, we must guarantee that the value is popped from the stack, so + // a CopyFromReg is not good enough - the copy instruction may be eliminated + // if the return value is not used. We use the FpGET_ST0 instructions + // instead. + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; + bool isST0 = VA.getLocReg() == X86::ST0; + unsigned Opc = 0; + if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; + if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; + if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; + SDValue Ops[] = { Chain, InFlag }; + Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, + Ops, 2), 1); + Val = Chain.getValue(0); - SDValue Val; - if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. + if (CopyVT != VA.getValVT()) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1)); + } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), @@ -1338,15 +1393,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, Val = Chain.getValue(0); } InFlag = Chain.getValue(2); - - if (CopyVT != VA.getValVT()) { - // Round the F80 the right size, which also moves to the appropriate xmm - // register. - Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, - // This truncation won't change the value. - DAG.getIntPtrConstant(1)); - } - InVals.push_back(Val); } @@ -1383,29 +1429,6 @@ ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { return Ins[0].Flags.isSRet(); } -/// IsCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls.
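The switch to explicit FpGET_ST0/FpGET_ST1 nodes in the hunk above guards an x87 invariant: a value returned on the FP stack occupies st(0) whether or not the caller wants it, so the pop must be unconditional; a plain CopyFromReg can be dead-code-eliminated, letting the 8-slot stack silt up across calls. A toy model of why an elided pop is not harmless:

    #include <cstdio>

    // The x87 register stack has 8 slots. A callee returning in st(0)
    // pushes one value; if the caller's copy-out (the only pop) is
    // removed as dead code, repeated calls overflow the stack.
    int main() {
      int Depth = 0;
      const bool ResultUsed = false; // the caller ignores the return value
      for (int Call = 0; Call < 10; ++Call) {
        ++Depth;                     // callee left a value in st(0)
        if (ResultUsed)
          --Depth;                   // the copy-out would pop it...
        // ...but an eliminated CopyFromReg pops nothing.
      }
      std::printf("depth after 10 ignored returns: %d (hardware max 8)\n",
                  Depth);
    }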
-bool X86TargetLowering::IsCalleePop(bool IsVarArg, - CallingConv::ID CallingConv) const { - if (IsVarArg) - return false; - - switch (CallingConv) { - default: - return false; - case CallingConv::X86_StdCall: - return !Subtarget->is64Bit(); - case CallingConv::X86_FastCall: - return !Subtarget->is64Bit(); - case CallingConv::X86_ThisCall: - return !Subtarget->is64Bit(); - case CallingConv::Fast: - return GuaranteedTailCallOpt; - case CallingConv::GHC: - return GuaranteedTailCallOpt; - } -} - /// CCAssignFnForNode - Selects the correct CCAssignFn for the /// given CallingConvention value. CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { @@ -1483,11 +1506,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), isImmutable, false); + VA.getLocMemOffset(), isImmutable); return DAG.getFrameIndex(FI, getPointerTy()); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable, false); + VA.getLocMemOffset(), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, PseudoSourceValue::getFixedStack(FI), 0, @@ -1615,8 +1638,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (isVarArg) { if (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall)) { - FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, - true, false)); + FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); } if (Is64Bit) { unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; @@ -1722,7 +1744,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (IsCalleePop(isVarArg, CallConv)) { + if (Subtarget->IsCalleePop(isVarArg, CallConv)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. @@ -1788,7 +1810,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, // Calculate the new stack slot for the return address. int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, @@ -1802,6 +1824,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -1814,7 +1837,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), - Outs, Ins, DAG); + Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes.
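For readers new to the term: a sibcall is a tail call that requires no ABI adjustment at all, the stack having the same shape in caller and callee, so the call can lower to a bare jmp that reuses the caller's frame. A source-level shape that typically qualifies (whether it actually becomes a jump is up to the optimizer; this is only a sketch):

    #include <cstdio>

    static int callee(int X) { return X * 2; }

    // Same return type, no sret/byval rewriting, arguments fit the same
    // slots: an eligible sibling call, lowerable to "jmp callee".
    static int caller(int X) { return callee(X + 1); }

    int main() { std::printf("%d\n", caller(20)); } // prints 42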
@@ -1874,7 +1897,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; bool isByVal = Flags.isByVal(); @@ -2013,12 +2036,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (VA.isRegLoc()) continue; assert(VA.isMemLoc()); - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false); + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy()); if (Flags.isByVal()) { @@ -2059,7 +2082,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, FPDiff, dl); } - bool WasGlobalOrExternal = false; if (getTargetMachine().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls @@ -2067,7 +2089,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // pc-relative offset may not be large enough to hold the whole // address. } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - WasGlobalOrExternal = true; // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. @@ -2095,11 +2116,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_DARWIN_STUB; } - Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), G->getOffset(), OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - WasGlobalOrExternal = true; unsigned char OpFlags = 0; // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external @@ -2153,17 +2173,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(InFlag); if (isTailCall) { - // If this is the first return lowered for this function, add the regs - // to the liveout set for the function. - if (MF.getRegInfo().liveout_empty()) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, - *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_X86); - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } + // We used to do: + //// If this is the first return lowered for this function, add the regs + //// to the liveout set for the function. + // This isn't right, although it's probably harmless on x86; liveouts + // should be computed from returns not tail calls. Consider a void + // function making a tail call to a function returning int. return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } @@ -2173,7 +2188,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPush; - if (IsCalleePop(isVarArg, CallConv)) + if (Subtarget->IsCalleePop(isVarArg, CallConv)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee @@ -2314,6 +2329,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const { if (!IsTailCallConvention(CalleeCC) && @@ -2332,8 +2348,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; } - // Look for obvious safe cases to perform tail call optimization that does not - // requite ABI changes. This is what gcc calls sibcall. + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. @@ -2427,8 +2443,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - EVT RegVT = VA.getLocVT(); - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; @@ -2439,26 +2454,32 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. In 64-bit, it's RAX, RCX, RDX, RSI, + // RDI, R8, R9, R11. + if (!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) { + unsigned Limit = Subtarget->is64Bit() ? 8 : 3; + unsigned NumInRegs = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) { + if (++NumInRegs == Limit) + return false; + } + } + } } return true; } FastISel * -X86TargetLowering::createFastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &vm, - DenseMap<const BasicBlock*, MachineBasicBlock*> &bm, - DenseMap<const AllocaInst *, int> &am, - std::vector<std::pair<MachineInstr*, unsigned> > &pn -#ifndef NDEBUG - , SmallSet<const Instruction *, 8> &cil -#endif - ) const { - return X86::createFastISel(mf, vm, bm, am, pn -#ifndef NDEBUG - , cil -#endif - ); +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { + return X86::createFastISel(funcInfo); } @@ -2476,7 +2497,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { // Set up a frame object for the return address. uint64_t SlotSize = TD->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, - false, false); + false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3175,7 +3196,7 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { /// constant +0.0. 
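The isZeroNode hunk just below switches the integer test to isNullValue() but still insists on isPosZero() for the FP side: -0.0 compares equal to +0.0 yet has a different bit pattern, so folds that substitute an all-zero register must reject it. A quick demonstration of the distinction:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // +0.0 and -0.0 compare equal, but only +0.0 is the all-zero bit
    // pattern that a zeroed vector register can stand in for.
    int main() {
      double PZ = +0.0, NZ = -0.0;
      std::uint64_t A, B;
      std::memcpy(&A, &PZ, sizeof A);
      std::memcpy(&B, &NZ, sizeof B);
      std::printf("equal=%d +0.0=%016llx -0.0=%016llx\n", PZ == NZ,
                  (unsigned long long)A, (unsigned long long)B);
    }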
bool X86::isZeroNode(SDValue Elt) { return ((isa<ConstantSDNode>(Elt) && - cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || + cast<ConstantSDNode>(Elt)->isNullValue()) || (isa<ConstantFPSDNode>(Elt) && cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); } @@ -4433,7 +4454,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide -/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be +/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in /// the right sequence. e.g. /// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> @@ -4447,7 +4468,6 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned NumElems = VT.getVectorNumElements(); unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); - EVT MaskEltVT = MaskVT.getVectorElementType(); EVT NewVT = MaskVT; switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Unexpected!"); @@ -5059,13 +5079,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - if (Op.getValueType() == MVT::v2f32) - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, - Op.getOperand(0)))); - - if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) + + if (Op.getValueType() == MVT::v1i64 && + Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); @@ -5230,10 +5246,10 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. 
- Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); Offset = 0; } else { - Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); } if (Subtarget->isPICStyleRIPRel() && @@ -5278,7 +5294,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); DebugLoc dl = GA->getDebugLoc(); - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); @@ -5351,7 +5367,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial // exec) - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); @@ -5366,33 +5383,78 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - // TODO: implement the "local dynamic" model - // TODO: implement the "initial exec"model for pic executables - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GA->getGlobal(); - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->resolveAliasedGlobal(false); - - TLSModel::Model model = getTLSModel(GV, - getTargetMachine().getRelocationModel()); - - switch (model) { - case TLSModel::GeneralDynamic: - case TLSModel::LocalDynamic: // not implemented - if (Subtarget->is64Bit()) - return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); - return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + if (Subtarget->isTargetELF()) { + // TODO: implement the "local dynamic" model + // TODO: implement the "initial exec" model for pic executables + + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + + TLSModel::Model model + = getTLSModel(GV, getTargetMachine().getRelocationModel()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: // not implemented + if (Subtarget->is64Bit()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); + return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, + Subtarget->is64Bit()); + } + } else if (Subtarget->isTargetDarwin()) { + // Darwin only has one model of TLS. Lower to that. + unsigned char OpFlag = 0; + unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + X86ISD::WrapperRIP : X86ISD::Wrapper; + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg.
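Context for the Darwin TLS path this hunk introduces: unlike ELF's several TLS models, Darwin has a single scheme in which the address of a thread-local variable is obtained by calling through a per-variable descriptor (the TLVP relocation), with the result coming back in the ordinary return register, which is what X86ISD::TLSCALL models below. At the source level the feature being lowered is simply (assuming a toolchain with the __thread extension):

    #include <cstdio>

    // Every access to this variable lowers to a TLS address computation;
    // on Darwin that is one indirect call through the variable's TLV
    // descriptor, result in EAX/RAX.
    static __thread int PerThreadCounter = 0;

    int main() {
      ++PerThreadCounter;
      std::printf("%d\n", PerThreadCounter); // prints 1 on this thread
    }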
+ bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); + if (PIC32) + OpFlag = X86II::MO_TLVP_PIC_BASE; + else + OpFlag = X86II::MO_TLVP; + DebugLoc DL = Op.getDebugLoc(); + SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, + getPointerTy(), + GA->getOffset(), OpFlag); + SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + + // With PIC32, the address is actually $g + Offset. + if (PIC32) + Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Offset); + + // Lowering the machine isd will make sure everything is in the right + // location. + SDValue Args[] = { Offset }; + SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); + + // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); - case TLSModel::InitialExec: - case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit()); + // And our return value (tls address) is in the standard call return value + // location. + unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); } + + assert(false && + "TLS not implemented for this target."); llvm_unreachable("Unreachable"); return SDValue(); @@ -5715,7 +5777,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), FudgePtr, PseudoSourceValue::getConstantPool(), 0, MVT::f32, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. @@ -5964,6 +6026,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, bool NeedCF = false; bool NeedOF = false; switch (X86CC) { + default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; @@ -5973,120 +6036,129 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, case X86::COND_O: case X86::COND_NO: NeedOF = true; break; - default: break; } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. - if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { - unsigned Opcode = 0; - unsigned NumOperands = 0; - switch (Op.getNode()->getOpcode()) { - case ISD::ADD: - // Due to an isel shortcoming, be conservative if this add is - // likely to be selected as part of a load-modify-store - // instruction. When the root node in a match is a store, isel - // doesn't know how to remap non-chain non-flag uses of other - // nodes in the match, such as the ADD in this case. This leads - // to the ADD being left around and reselected, with the result - // being two adds in the output. Alas, even if none our users - // are stores, that doesn't prove we're O.K. Ergo, if we have - // any parents that aren't CopyToReg or SETCC, eschew INC/DEC. - // A better fix seems to require climbing the DAG back to the - // root, and it doesn't seem to be worth the effort. 
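The ADD arm of the EmitTest switch that follows reduces to a small constant-based decision; a freestanding sketch, using placeholder enumerators instead of LLVM's X86ISD opcode values:

#include <cstdint>

// Freestanding sketch of the ADD arm's opcode choice (enumerators are
// placeholders, not LLVM's X86ISD values): adds of +1/-1 still set
// EFLAGS when selected as INC/DEC, so no separate TEST is needed.
enum class FlagOp { Add, Inc, Dec };

static FlagOp chooseFlagSettingAdd(int64_t RHS) {
  if (RHS == 1)  return FlagOp::Inc;  // add $1  selects as INC
  if (RHS == -1) return FlagOp::Dec;  // add $-1 selects as DEC
  return FlagOp::Add;                 // regular EFLAGS-setting ADD
}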
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) - goto default_case; - if (ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { - // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1) { - Opcode = X86ISD::INC; - NumOperands = 1; - break; - } - // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue()) { - Opcode = X86ISD::DEC; - NumOperands = 1; - break; - } + if (Op.getResNo() != 0 || NeedOF || NeedCF) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + unsigned Opcode = 0; + unsigned NumOperands = 0; + switch (Op.getNode()->getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to be + // selected as part of a load-modify-store instruction. When the root node + // in a match is a store, isel doesn't know how to remap non-chain non-flag + // uses of other nodes in the match, such as the ADD in this case. This + // leads to the ADD being left around and reselected, with the result being + // two adds in the output. Alas, even if none our users are stores, that + // doesn't prove we're O.K. Ergo, if we have any parents that aren't + // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require + // climbing the DAG back to the root, and it doesn't seem to be worth the + // effort. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + goto default_case; + + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->getAPIntValue() == 1) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; } - // Otherwise use a regular EFLAGS-setting add. - Opcode = X86ISD::ADD; - NumOperands = 2; - break; - case ISD::AND: { - // If the primary and result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. - bool NonFlagUse = false; - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - unsigned UOpNo = UI.getOperandNo(); - if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { - // Look pass truncate. - UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { - NonFlagUse = true; - break; - } + + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->getAPIntValue().isAllOnesValue()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; } - if (!NonFlagUse) + } + + // Otherwise use a regular EFLAGS-setting add. + Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::AND: { + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + bool NonFlagUse = false; + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. 
+ UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && + User->getOpcode() != ISD::SETCC && + (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { + NonFlagUse = true; break; + } } + + if (!NonFlagUse) + break; + } // FALL THROUGH - case ISD::SUB: - case ISD::OR: - case ISD::XOR: - // Due to the ISEL shortcoming noted above, be conservative if this op is - // likely to be selected as part of a load-modify-store instruction. - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + case ISD::SUB: + case ISD::OR: + case ISD::XOR: + // Due to the ISEL shortcoming noted above, be conservative if this op is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() == ISD::STORE) - goto default_case; - // Otherwise use a regular EFLAGS-setting instruction. - switch (Op.getNode()->getOpcode()) { - case ISD::SUB: Opcode = X86ISD::SUB; break; - case ISD::OR: Opcode = X86ISD::OR; break; - case ISD::XOR: Opcode = X86ISD::XOR; break; - case ISD::AND: Opcode = X86ISD::AND; break; - default: llvm_unreachable("unexpected operator!"); - } - NumOperands = 2; - break; - case X86ISD::ADD: - case X86ISD::SUB: - case X86ISD::INC: - case X86ISD::DEC: - case X86ISD::OR: - case X86ISD::XOR: - case X86ISD::AND: - return SDValue(Op.getNode(), 1); - default: - default_case: - break; - } - if (Opcode != 0) { - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0; i != NumOperands; ++i) - Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); - DAG.ReplaceAllUsesWith(Op, New); - return SDValue(New.getNode(), 1); + if (UI->getOpcode() == ISD::STORE) + goto default_case; + + // Otherwise use a regular EFLAGS-setting instruction. + switch (Op.getNode()->getOpcode()) { + default: llvm_unreachable("unexpected operator!"); + case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::OR: Opcode = X86ISD::OR; break; + case ISD::XOR: Opcode = X86ISD::XOR; break; + case ISD::AND: Opcode = X86ISD::AND; break; } + + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; } - // Otherwise just emit a CMP with 0, which is the TEST pattern. - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. 
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0; i != NumOperands; ++i) + Ops.push_back(Op.getOperand(i)); + + SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something @@ -6113,15 +6185,21 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, Op1 = Op1.getOperand(0); SDValue LHS, RHS; - if (Op1.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) - if (And10C->getZExtValue() == 1) { - LHS = Op0; - RHS = Op1.getOperand(1); - } - } else if (Op0.getOpcode() == ISD::SHL) { + if (Op1.getOpcode() == ISD::SHL) + std::swap(Op0, Op1); + if (Op0.getOpcode() == ISD::SHL) { if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) if (And00C->getZExtValue() == 1) { + // If we looked past a truncate, check that it's only truncating away + // known zeros. + unsigned BitWidth = Op0.getValueSizeInBits(); + unsigned AndBitWidth = And.getValueSizeInBits(); + if (BitWidth > AndBitWidth) { + APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + return SDValue(); + } LHS = Op1; RHS = Op0.getOperand(1); } @@ -6172,7 +6250,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && + cast<ConstantSDNode>(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); if (NewSetCC.getNode()) @@ -6552,15 +6630,16 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, MVT::i8); - SDValue User = SDValue(*Op.getNode()->use_begin(), 0); + SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. - if (User.getOpcode() == ISD::BR) { - SDValue FalseBB = User.getOperand(1); - SDValue NewBR = - DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); + (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -6632,7 +6711,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Flag; - EVT IntPtr = getPointerTy(); EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); @@ -6685,7 +6763,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { Store = DAG.getStore(Op.getOperand(0), dl, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), MVT::i32), - FIN, SV, 0, false, false, 0); + FIN, SV, 4, false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area @@ -6693,7 +6771,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, + Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, false, false, 0); MemOps.push_back(Store); @@ -6702,7 +6780,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { FIN, DAG.getIntPtrConstant(8)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, + Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, @@ -6712,9 +6790,6 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); - SDValue Chain = Op.getOperand(0); - SDValue SrcPtr = Op.getOperand(1); - SDValue SrcSV = Op.getOperand(2); report_fatal_error("VAArgInst is not yet implemented for x86-64!"); return SDValue(); @@ -7733,6 +7808,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; @@ -7944,8 +8020,11 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors to thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
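Back on the LowerVASTART hunk earlier in this file: the corrected SV offsets in its stores (4, 8, 16) are simply the field offsets of the SysV x86-64 va_list record, with gp_offset itself at offset 0. As a plain struct (assuming an LP64 target):

#include <cstdint>

// The SysV x86-64 va_list record that LowerVASTART materializes; the
// SV offsets in the corrected stores (4, 8, 16) match these fields.
struct VAListX8664 {
  uint32_t gp_offset;         // 0:  next GPR slot in reg_save_area
  uint32_t fp_offset;         // 4:  next XMM slot in reg_save_area
  void    *overflow_arg_area; // 8:  arguments passed on the stack
  void    *reg_save_area;     // 16: spilled argument registers
};

static_assert(sizeof(VAListX8664) == 24, "LP64 layout assumed by the stores");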
+ nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -7955,17 +8034,17 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, newMBB->addSuccessor(newMBB); // Insert instructions into newMBB based on incoming instruction - assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && + assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && "unexpected number of operands"); DebugLoc dl = bInstr->getDebugLoc(); MachineOperand& destOper = bInstr->getOperand(0); - MachineOperand* argOpers[2 + X86AddrNumOperands]; + MachineOperand* argOpers[2 + X86::AddrNumOperands]; int numArgs = bInstr->getNumOperands() - 1; for (int i=0; i < numArgs; ++i) argOpers[i] = &bInstr->getOperand(i+1); // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] int valArgIndx = lastAddrIndx + 1; unsigned t1 = F->getRegInfo().createVirtualRegister(RC); @@ -8008,7 +8087,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. + bInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8053,8 +8132,11 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors to thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -8066,12 +8148,12 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, DebugLoc dl = bInstr->getDebugLoc(); // Insert instructions into newMBB based on incoming instruction // There are 8 "real" operands plus 9 implicit def/uses, ignored here. - assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && + assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && "unexpected number of operands"); MachineOperand& dest1Oper = bInstr->getOperand(0); MachineOperand& dest2Oper = bInstr->getOperand(1); - MachineOperand* argOpers[2 + X86AddrNumOperands]; - for (int i=0; i < 2 + X86AddrNumOperands; ++i) { + MachineOperand* argOpers[2 + X86::AddrNumOperands]; + for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { argOpers[i] = &bInstr->getOperand(i+2); // We use some of the operands multiple times, so conservatively just @@ -8081,7 +8163,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, } // x86 address has 5 operands: base, index, scale, displacement, and segment. 
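The comment this hunk ends on explains the X86::AddrNumOperands renaming throughout: a memory reference now carries five machine operands. A sketch of the layout, in the order the builders append them (the struct and helper here are invented for illustration):

// The five machine operands of an x86 memory reference, in the order
// the code above indexes them (lastAddrIndx = AddrNumOperands - 1).
struct X86MemOperands {
  unsigned  BaseReg;  // base register, 0 if none
  long long Scale;    // 1, 2, 4, or 8
  unsigned  IndexReg; // index register, 0 if none
  long long Disp;     // signed displacement
  unsigned  SegReg;   // segment override, 0 for the default segment
};

// addDirectMem(MIB, Reg), updated later in this patch, appends the
// equivalent of { Reg, 1, 0, 0, 0 }, i.e. a bare [Reg] access.
static X86MemOperands directMem(unsigned Reg) {
  return {Reg, 1, 0, 0, 0};
}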
- int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] unsigned t1 = F->getRegInfo().createVirtualRegister(RC); MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); @@ -8171,7 +8253,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. + bInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8205,8 +8287,11 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors of thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(mInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -8217,16 +8302,16 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, DebugLoc dl = mInstr->getDebugLoc(); // Insert instructions into newMBB based on incoming instruction - assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && + assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && "unexpected number of operands"); MachineOperand& destOper = mInstr->getOperand(0); - MachineOperand* argOpers[2 + X86AddrNumOperands]; + MachineOperand* argOpers[2 + X86::AddrNumOperands]; int numArgs = mInstr->getNumOperands() - 1; for (int i=0; i < numArgs; ++i) argOpers[i] = &mInstr->getOperand(i+1); // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] int valArgIndx = lastAddrIndx + 1; unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); @@ -8274,7 +8359,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. + mInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8284,7 +8369,6 @@ MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - MachineFunction *F = BB->getParent(); DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -8306,7 +8390,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) .addReg(X86::XMM0); - F->DeleteMachineInstr(MI); + MI->eraseFromParent(); return BB; } @@ -8335,9 +8419,12 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); - // Set up the CFG. - // Move any original successors of MBB to the end block. - EndMBB->transferSuccessors(MBB); + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. 
@@ -8376,7 +8463,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( .addMemOperand(MMO); } - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } @@ -8405,24 +8492,39 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); - BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + const MachineFunction *MF = BB->getParent(); + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + BitVector ReservedRegs = TRI->getReservedRegs(*MF); + + for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); + if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; + unsigned Reg = MO.getReg(); + if (Reg != X86::EFLAGS) continue; + copy0MBB->addLiveIn(Reg); + sinkMBB->addLiveIn(Reg); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + // Add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + // Create the conditional branch instruction. + unsigned Opc = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -8431,11 +8533,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - BuildMI(sinkMBB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return sinkMBB; } @@ -8444,21 +8547,70 @@ X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *F = BB->getParent(); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. // FIXME: The code should be tweaked as soon as we'll try to do codegen for // mingw-w64. 
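A value-level model of what the EmitLoweredSelect diamond above computes; the machine-level subtlety is only that the condition lives in EFLAGS, hence the live-in bookkeeping for the copy and sink blocks:

// C-level effect of the expanded CMOV pseudo: a select on a condition
// carried in EFLAGS, realized as thisMBB -> {copy0MBB, sinkMBB} with a
// PHI merging the two incoming values.
static int loweredSelect(bool eflagsCond, int trueVal, int falseVal) {
  if (eflagsCond)      // thisMBB: Jcc straight to sinkMBB
    return trueVal;    // value arriving on thisMBB's edge
  return falseVal;     // copy0MBB: falls through with %FalseValue
}                      // sinkMBB: the PHI picks the reaching value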
- BuildMI(BB, DL, TII->get(X86::CALLpcrel32)) + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("_alloca") .addReg(X86::EAX, RegState::Implicit) .addReg(X86::ESP, RegState::Implicit) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::ESP, RegState::Define | RegState::Implicit); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This is pretty easy. We're taking the value that we received from + // our load from the relocation, sticking it in either RDI (x86-64) + // or EAX and doing an indirect call. The return value will then + // be in the normal return register. + const X86InstrInfo *TII + = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *F = BB->getParent(); + + assert(MI->getOperand(3).isGlobal() && "This should be a global"); + + if (Subtarget->is64Bit()) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + addDirectMem(MIB, X86::RDI); + } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + } else { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -8469,6 +8621,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, default: assert(false && "Unexpected instr type to insert"); case X86::MINGW_ALLOCA: return EmitLoweredMingwAlloca(MI, BB); + case X86::TLSCall_32: + case X86::TLSCall_64: + return EmitLoweredTLSCall(MI, BB); case X86::CMOV_GR8: case X86::CMOV_V1I64: case X86::CMOV_FR32: @@ -8499,23 +8654,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // mode when truncating to an integer value. MachineFunction *F = BB->getParent(); int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); - addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); - addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... - addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... 
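On the FP_TO_INT path above, the 0xC7F stored into the stack slot is a complete replacement x87 control word: all six exception masks set, extended precision, and rounding control forced to truncate, which FIST-based conversion requires. A small decode of the rounding-control field:

#include <cstdint>

// Decode of the control word the inserter writes before the FIST and
// undoes afterwards: bits 10-11 (RC) == 0b11 selects round-toward-zero.
static constexpr uint16_t kTruncCW = 0xC7F;

static constexpr bool roundsTowardZero(uint16_t cw) {
  return (cw & 0x0C00) == 0x0C00;  // RC field, bits 10-11
}

static_assert(roundsTowardZero(kTruncCW), "0xC7F selects truncation");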
- addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value - addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. @@ -8554,13 +8711,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, } else { AM.Disp = Op.getImm(); } - addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) - .addReg(MI->getOperand(X86AddrNumOperands).getReg()); + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. - addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. @@ -9513,8 +9671,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { - if (SumC->getSExtValue() == Bits && - ShAmt1.getOperand(1) == ShAmt0) + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, @@ -9710,58 +9870,6 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// On X86 and X86-64, atomic operations are lowered to locked instructions. -// Locked instructions, in turn, have implicit fence semantics (all memory -// operations are flushed before issuing the locked instruction, and the -// are not buffered), so we can fold away the common pattern of -// fence-atomic-fence. 
-static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) { - SDValue atomic = N->getOperand(0); - switch (atomic.getOpcode()) { - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - break; - default: - return SDValue(); - } - - SDValue fence = atomic.getOperand(0); - if (fence.getOpcode() != ISD::MEMBARRIER) - return SDValue(); - - switch (atomic.getOpcode()) { - case ISD::ATOMIC_CMP_SWAP: - return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), - atomic.getOperand(1), atomic.getOperand(2), - atomic.getOperand(3)); - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), - atomic.getOperand(1), atomic.getOperand(2)); - default: - return SDValue(); - } -} - static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) @@ -9809,7 +9917,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG); case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); } @@ -9932,8 +10039,8 @@ static bool LowerToBSwap(CallInst *CI) { // so don't worry about this. // Verify this is a simple bswap. - if (CI->getNumOperands() != 2 || - CI->getType() != CI->getOperand(1)->getType() || + if (CI->getNumArgOperands() != 1 || + CI->getType() != CI->getArgOperand(0)->getType() || !CI->getType()->isIntegerTy()) return false; @@ -9946,7 +10053,7 @@ static bool LowerToBSwap(CallInst *CI) { Module *M = CI->getParent()->getParent()->getParent(); Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); Op = CallInst::Create(Int, Op, CI->getName(), CI); CI->replaceAllUsesWith(Op); @@ -10079,7 +10186,6 @@ LowerXConstraint(EVT ConstraintVT) const { /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Constraint, - bool hasMemory, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); @@ -10121,9 +10227,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - const ConstantInt *CI = C->getConstantIntValue(); - if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), - C->getSExtValue())) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. 
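The rewritten 'e' check above, and its unsigned sibling 'Z' just below, are 32-bit round-trip tests on a 64-bit immediate; standalone equivalents of what isValueValidForType verifies here:

#include <cstdint>

// Standalone versions of the validity tests behind the 'e' and 'Z'
// inline-asm constraints on 64-bit immediates.
static bool fitsSExt32(int64_t v) {   // 'e': valid as sign-extended i32
  return v == static_cast<int32_t>(v);
}

static bool fitsZExt32(uint64_t v) {  // 'Z': valid as zero-extended u32
  return v == static_cast<uint32_t>(v);
}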
Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); break; @@ -10136,9 +10241,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - const ConstantInt *CI = C->getConstantIntValue(); - if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), - C->getZExtValue())) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } @@ -10155,6 +10259,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, break; } + // In any sort of PIC mode addresses need to be computed at runtime by + // adding in a register or some sort of table lookup. These can't + // be used as immediates. + if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + return; + // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = 0; @@ -10190,11 +10300,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, getTargetMachine()))) return; - if (hasMemory) - Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); - else - Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset); - Result = Op; + Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + GA->getValueType(0), Offset); break; } } @@ -10203,8 +10310,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, Ops.push_back(Result); return; } - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, - Ops, DAG); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } std::vector<unsigned> X86TargetLowering:: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 1ef1a7b..2d28e5c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -196,6 +196,10 @@ namespace llvm { // TLSADDR - Thread Local Storage. TLSADDR, + + // TLSCALL - Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, // SegmentBaseAddress - The address segment:0 SegmentBaseAddress, @@ -496,7 +500,6 @@ namespace llvm { /// being processed is 'm'. virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -576,20 +579,17 @@ namespace llvm { /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. - virtual FastISel * - createFastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &, - DenseMap<const BasicBlock *, MachineBasicBlock *> &, - DenseMap<const AllocaInst *, int> &, - std::vector<std::pair<MachineInstr*, unsigned> > & -#ifndef NDEBUG - , SmallSet<const Instruction *, 8> & -#endif - ) const; + virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; + /// getStackCookieLocation - Return true if the target stores stack + /// protector cookies at a fixed offset in some non-standard address + /// space, and populates the address space and offset as + /// appropriate. 
+ virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + private: /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -643,6 +643,7 @@ namespace llvm { bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; @@ -725,6 +726,7 @@ namespace llvm { LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -733,13 +735,13 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<EVT> &OutTys, - const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, - SelectionDAG &DAG) const; + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; @@ -794,6 +796,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredMingwAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const; /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. @@ -806,15 +811,7 @@ namespace llvm { }; namespace X86 { - FastISel *createFastISel(MachineFunction &mf, - DenseMap<const Value *, unsigned> &, - DenseMap<const BasicBlock *, MachineBasicBlock *> &, - DenseMap<const AllocaInst *, int> &, - std::vector<std::pair<MachineInstr*, unsigned> > & -#ifndef NDEBUG - , SmallSet<const Instruction*, 8> & -#endif - ); + FastISel *createFastISel(FunctionLoweringInfo &funcInfo); } } diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 97eb17c..42d0e7f 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -35,6 +35,14 @@ def i64i8imm : Operand<i64> { let ParserMatchClass = ImmSExti64i8AsmOperand; } +def lea64_32mem : Operand<i32> { + let PrintMethod = "printi32mem"; + let AsmOperandLowerMethod = "lower_lea64_32mem"; + let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + + // Special i64mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. 
@@ -44,29 +52,16 @@ def i64mem_TC : Operand<i64> { let ParserMatchClass = X86MemAsmOperand; } -def lea64mem : Operand<i64> { - let PrintMethod = "printlea64mem"; - let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm); - let ParserMatchClass = X86NoSegMemAsmOperand; -} - -def lea64_32mem : Operand<i32> { - let PrintMethod = "printlea64_32mem"; - let AsmOperandLowerMethod = "lower_lea64_32mem"; - let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm); - let ParserMatchClass = X86NoSegMemAsmOperand; -} - //===----------------------------------------------------------------------===// // Complex Pattern Definitions. // -def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr", +def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr", +def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; - + //===----------------------------------------------------------------------===// // Pattern fragments. // @@ -289,11 +284,11 @@ def LEA64_32r : I<0x8D, MRMSrcMem, [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>; let isReMaterializable = 1 in -def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lea{q}\t{$src|$dst}, {$dst|$src}", [(set GR64:$dst, lea64addr:$src)]>; -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), "bswap{q}\t$dst", [(set GR64:$dst, (bswap GR64:$src))]>, TB; @@ -521,7 +516,7 @@ let Defs = [EFLAGS] in { def ADD64i32 : RIi32<0x05, RawFrm, (outs), (ins i64i32imm:$src), "add{q}\t{$src, %rax|%rax, $src}", []>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isConvertibleToThreeAddress = 1 in { let isCommutable = 1 in // Register-Register Addition @@ -559,7 +554,7 @@ def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), [(set GR64:$dst, EFLAGS, (X86add_flag GR64:$src1, (load addr:$src2)))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" // Memory-Register Addition def ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), @@ -580,7 +575,7 @@ let Uses = [EFLAGS] in { def ADC64i32 : RIi32<0x15, RawFrm, (outs), (ins i64i32imm:$src), "adc{q}\t{$src, %rax|%rax, $src}", []>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isCommutable = 1 in def ADC64rr : RI<0x11, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -606,7 +601,7 @@ def ADC64ri32 : RIi32<0x81, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "adc{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def ADC64mr : RI<0x11, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "adc{q}\t{$src2, $dst|$dst, $src2}", @@ -621,7 +616,7 @@ def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2), addr:$dst)]>; } // Uses = [EFLAGS] -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { // Register-Register Subtraction def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -653,7 +648,7 @@ def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst), "sub{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, (X86sub_flag GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def SUB64i32 : RIi32<0x2D, RawFrm, (outs), (ins i64i32imm:$src), 
"sub{q}\t{$src, %rax|%rax, $src}", []>; @@ -677,7 +672,7 @@ def SUB64mi32 : RIi32<0x81, MRM5m, (outs), (ins i64mem:$dst, i64i32imm:$src2), (implicit EFLAGS)]>; let Uses = [EFLAGS] in { -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { def SBB64rr : RI<0x19, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "sbb{q}\t{$src2, $dst|$dst, $src2}", @@ -702,7 +697,7 @@ def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "sbb{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def SBB64i32 : RIi32<0x1D, RawFrm, (outs), (ins i64i32imm:$src), "sbb{q}\t{$src, %rax|%rax, $src}", []>; @@ -736,7 +731,7 @@ def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), } let Defs = [EFLAGS] in { -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isCommutable = 1 in // Register-Register Signed Integer Multiplication def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), @@ -751,7 +746,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), "imul{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; -} // isTwoAddress +} // Constraints = "$src1 = $dst" // Suprisingly enough, these are not two address instructions! @@ -803,7 +798,7 @@ def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), // Unary instructions let Defs = [EFLAGS], CodeSize = 2 in { -let isTwoAddress = 1 in +let Constraints = "$src = $dst" in def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src), "neg{q}\t$dst", [(set GR64:$dst, (ineg GR64:$src)), (implicit EFLAGS)]>; @@ -811,14 +806,14 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", [(store (ineg (loadi64 addr:$dst)), addr:$dst), (implicit EFLAGS)]>; -let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in +let Constraints = "$src = $dst", isConvertibleToThreeAddress = 1 in def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src))]>; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)]>; -let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in +let Constraints = "$src = $dst", isConvertibleToThreeAddress = 1 in def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src))]>; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", @@ -826,7 +821,7 @@ def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", (implicit EFLAGS)]>; // In 64-bit mode, single byte INC and DEC cannot be encoded. -let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in { +let Constraints = "$src = $dst", isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", @@ -844,38 +839,36 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, Requires<[In64BitMode]>; -} // isConvertibleToThreeAddress +} // Constraints = "$src = $dst", isConvertibleToThreeAddress // These are duplicates of their 32-bit counterparts. Only needed so X86 knows // how to unfold them. 
-let isTwoAddress = 0, CodeSize = 2 in { - def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", - [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, - OpSize, Requires<[In64BitMode]>; - def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", - [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, - Requires<[In64BitMode]>; - def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", - [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, - OpSize, Requires<[In64BitMode]>; - def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", - [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, - Requires<[In64BitMode]>; -} +def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In64BitMode]>; +def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In64BitMode]>; +def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In64BitMode]>; +def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In64BitMode]>; } // Defs = [EFLAGS], CodeSize let Defs = [EFLAGS] in { // Shift instructions -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in -def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src), +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (shl GR64:$src, CL))]>; + [(set GR64:$dst, (shl GR64:$src1, CL))]>; let isConvertibleToThreeAddress = 1 in // Can transform into LEA. def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), @@ -885,7 +878,7 @@ def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), // 'add reg,reg' is cheaper. 
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t$dst", []>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" let Uses = [CL] in def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), @@ -898,18 +891,18 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t$dst", [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in -def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src), +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), "shr{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (srl GR64:$src, CL))]>; + [(set GR64:$dst, (srl GR64:$src1, CL))]>; def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>; def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), "shr{q}\t$dst", [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" let Uses = [CL] in def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), @@ -922,11 +915,11 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t$dst", [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in -def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src), +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), "sar{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (sra GR64:$src, CL))]>; + [(set GR64:$dst, (sra GR64:$src1, CL))]>; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", @@ -934,7 +927,7 @@ def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), "sar{q}\t$dst", [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>; -} // isTwoAddress +} // Constraints = "$src = $dst" let Uses = [CL] in def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), @@ -949,7 +942,7 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), // Rotate instructions -let isTwoAddress = 1 in { +let Constraints = "$src = $dst" in { def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src), "rcl{q}\t{1, $dst|$dst, 1}", []>; def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt), @@ -966,9 +959,8 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src), def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src), "rcr{q}\t{%cl, $dst|$dst, CL}", []>; } -} +} // Constraints = "$src = $dst" -let isTwoAddress = 0 in { def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), "rcl{q}\t{1, $dst|$dst, 1}", []>; def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), @@ -984,13 +976,12 @@ def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t{%cl, $dst|$dst, CL}", []>; } -} -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in -def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src), +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "rol{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (rotl GR64:$src, CL))]>; + [(set GR64:$dst, (rotl GR64:$src1, CL))]>; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", @@ -998,7 +989,7 @@ def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), def 
ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "rol{q}\t$dst", [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" let Uses = [CL] in def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), @@ -1011,11 +1002,11 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), "rol{q}\t$dst", [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in -def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src), +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (rotr GR64:$src, CL))]>; + [(set GR64:$dst, (rotr GR64:$src1, CL))]>; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", @@ -1023,7 +1014,7 @@ def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t$dst", [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" let Uses = [CL] in def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), @@ -1037,7 +1028,7 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; // Double shift instructions (generalizations of rotate) -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -1067,7 +1058,7 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (i8 imm:$src3)))]>, TB; } // isCommutable -} // isTwoAddress +} // Constraints = "$src1 = $dst" let Uses = [CL] in { def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), @@ -1097,7 +1088,7 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, // Logical Instructions... 
// -let isTwoAddress = 1 , AddedComplexity = 15 in +let Constraints = "$src = $dst" , AddedComplexity = 15 in def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src), "not{q}\t$dst", [(set GR64:$dst, (not GR64:$src))]>; def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", @@ -1107,7 +1098,7 @@ let Defs = [EFLAGS] in { def AND64i32 : RIi32<0x25, RawFrm, (outs), (ins i64i32imm:$src), "and{q}\t{$src, %rax|%rax, $src}", []>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isCommutable = 1 in def AND64rr : RI<0x21, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -1134,7 +1125,7 @@ def AND64ri32 : RIi32<0x81, MRM4r, "and{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, (X86and_flag GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def AND64mr : RI<0x21, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -1152,7 +1143,7 @@ def AND64mi32 : RIi32<0x81, MRM4m, [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst), (implicit EFLAGS)]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isCommutable = 1 in def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -1179,7 +1170,7 @@ def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), "or{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, (X86or_flag GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "or{q}\t{$src, $dst|$dst, $src}", @@ -1197,7 +1188,7 @@ def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src), def OR64i32 : RIi32<0x0D, RawFrm, (outs), (ins i64i32imm:$src), "or{q}\t{$src, %rax|%rax, $src}", []>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let isCommutable = 1 in def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -1224,7 +1215,7 @@ def XOR64ri32 : RIi32<0x81, MRM6r, "xor{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, (X86xor_flag GR64:$src1, i64immSExt32:$src2))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "xor{q}\t{$src, $dst|$dst, $src}", @@ -1366,7 +1357,7 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), } // Defs = [EFLAGS] // Conditional moves -let Uses = [EFLAGS], isTwoAddress = 1 in { +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { let isCommutable = 1 in { def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64 (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -1530,7 +1521,7 @@ def CMOVNO64rm : RI<0x41, MRMSrcMem, // if !overflow, GR64 = [mem64] "cmovno{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), X86_COND_NO, EFLAGS))]>, TB; -} // isTwoAddress +} // Constraints = "$src1 = $dst" // Use sbb to materialize carry flag into a GPR. // FIXME: This are pseudo ops that should be replaced with Pat<> patterns. 
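The FIXME above refers to the classic sbb idiom these pseudos wrap; its value-level effect, sketched:

#include <cstdint>

// Effect of "sbb %r, %r" used to materialize the carry flag: the
// register becomes 0 when CF is clear and -1 (all ones) when CF is
// set, since r - r - CF == -CF.
static int64_t materializeCarry(bool cf) {
  return cf ? -1 : 0;
}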
@@ -1588,7 +1579,7 @@ def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2), "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}", @@ -1601,7 +1592,7 @@ def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtsi642sd VR128:$src1, (loadi64 addr:$src2)))]>; -} // isTwoAddress +} // Constraints = "$src1 = $dst" // Signed i64 -> f32 def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src), @@ -1611,7 +1602,7 @@ def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src), "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>; -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2), "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}", @@ -1625,7 +1616,7 @@ let isTwoAddress = 1 in { [(set VR128:$dst, (int_x86_sse_cvtsi642ss VR128:$src1, (loadi64 addr:$src2)))]>; -} +} // Constraints = "$src1 = $dst" // f32 -> signed i64 def CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), @@ -1691,6 +1682,7 @@ def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), // Thread Local Storage Instructions //===----------------------------------------------------------------------===// +// ELF TLS Support // All calls clobber the non-callee saved registers. RSP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. @@ -1700,7 +1692,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [RSP] in -def TLS_addr64 : I<0, Pseudo, (outs), (ins lea64mem:$sym), +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), ".byte\t0x66; " "leaq\t$sym(%rip), %rdi; " ".word\t0x6666; " @@ -1709,6 +1701,17 @@ def TLS_addr64 : I<0, Pseudo, (outs), (ins lea64mem:$sym), [(X86tlsaddr tls64addr:$sym)]>, Requires<[In64BitMode]>; +// Darwin TLS Support +// For x86_64, the address of the thunk is passed in %rdi, on return +// the address of the variable is in %rax. All other registers are preserved. +let Defs = [RAX], + Uses = [RDI], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; + let AddedComplexity = 5, isCodeGenOnly = 1 in def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movq\t%gs:$src, $dst", @@ -1964,6 +1967,17 @@ def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), (TCRETURNdi64 texternalsym:$dst, imm:$off)>, Requires<[In64BitMode]>; +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; +// This corresponds to mov foo@tpoff(%rbx), %eax +def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))), + (MOV64rm tglobaltlsaddr :$dst)>; + // Comparisons. 
// TEST R,R is smaller than CMP R,0 @@ -2332,45 +2346,3 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; -//===----------------------------------------------------------------------===// -// X86-64 SSE4.1 Instructions -//===----------------------------------------------------------------------===// - -/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination -multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set GR64:$dst, - (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W; - def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (extractelt (v2i64 VR128:$src1), imm:$src2), - addr:$dst)]>, OpSize, REX_W; -} - -defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; - -let isTwoAddress = 1 in { - multiclass SS41I_insert64<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - OpSize, REX_W; - def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), - imm:$src3)))]>, OpSize, REX_W; - } -} - -defm PINSRQ : SS41I_insert64<0x22, "pinsrq">; diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 5a82a7b..2a6a71d 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -64,19 +64,15 @@ struct X86AddressMode { /// static inline const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { - // Because memory references are always represented with four - // values, this adds: Reg, [1, NoReg, 0] to the instruction. - return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0); + // Because memory references are always represented with five + // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction. + return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0); } -static inline const MachineInstrBuilder & -addLeaOffset(const MachineInstrBuilder &MIB, int Offset) { - return MIB.addImm(1).addReg(0).addImm(Offset); -} static inline const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset) { - return addLeaOffset(MIB, Offset).addReg(0); + return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); } /// addRegOffset - This function is used to add a memory reference of the form @@ -89,25 +85,20 @@ addRegOffset(const MachineInstrBuilder &MIB, return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset); } -static inline const MachineInstrBuilder & -addLeaRegOffset(const MachineInstrBuilder &MIB, - unsigned Reg, bool isKill, int Offset) { - return addLeaOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset); -} - /// addRegReg - This function is used to add a memory reference of the form: /// [Reg + Reg]. 
static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB, unsigned Reg1, bool isKill1, unsigned Reg2, bool isKill2) { return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1) - .addReg(Reg2, getKillRegState(isKill2)).addImm(0); + .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0); } static inline const MachineInstrBuilder & -addLeaAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) { - assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); - +addFullAddress(const MachineInstrBuilder &MIB, + const X86AddressMode &AM) { + assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); + if (AM.BaseType == X86AddressMode::RegBase) MIB.addReg(AM.Base.Reg); else if (AM.BaseType == X86AddressMode::FrameIndexBase) @@ -116,15 +107,11 @@ addLeaAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) { assert (0); MIB.addImm(AM.Scale).addReg(AM.IndexReg); if (AM.GV) - return MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); + MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); else - return MIB.addImm(AM.Disp); -} - -static inline const MachineInstrBuilder & -addFullAddress(const MachineInstrBuilder &MIB, - const X86AddressMode &AM) { - return addLeaAddress(MIB, AM).addReg(0); + MIB.addImm(AM.Disp); + + return MIB.addReg(0); } /// addFrameReference - This function is used to add a reference to the base of diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 0aae4a8..da93de9 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -371,7 +371,7 @@ multiclass FPCMov<PatLeaf cc> { Requires<[HasCMov]>; } -let Uses = [EFLAGS], isTwoAddress = 1 in { +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov<X86_COND_B>; defm CMOVBE : FPCMov<X86_COND_BE>; defm CMOVE : FPCMov<X86_COND_E>; @@ -380,7 +380,7 @@ defm CMOVNB : FPCMov<X86_COND_AE>; defm CMOVNBE: FPCMov<X86_COND_A>; defm CMOVNE : FPCMov<X86_COND_NE>; defm CMOVNP : FPCMov<X86_COND_NP>; -} +} // Uses = [EFLAGS], Constraints = "$src1 = $dst" let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. @@ -680,19 +680,19 @@ def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. -def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>, +def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>, +def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>, +def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, Requires<[FPStackf64]>; // FP truncations map onto simple pseudo-value conversions if they are to/from // the FP stack. We have validated that only value-preserving truncations make // it through isel. 
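The comment above says only value-preserving truncations survive to instruction selection, which is why the fround patterns that follow can lower to plain COPY_TO_REGCLASS moves: on the x87 stack, RFP32/RFP64/RFP80 values all occupy 80-bit stack slots, so no rounding happens at copy time. A small C++ check of what "value-preserving" means for an f64-to-f32 round trip (the helper name and sample values are mine, used only to illustrate the invariant):

#include <cassert>

// A double is value-preserving under truncation to float iff the
// float-and-back round trip reproduces it exactly.
static bool fitsInF32(double X) {
  return static_cast<double>(static_cast<float>(X)) == X;
}

int main() {
  assert(fitsInF32(1.5));   // exactly representable in f32
  assert(!fitsInF32(0.1));  // 0.1 needs more precision than f32 offers
  return 0;
}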
-def : Pat<(f32 (fround RFP64:$src)), (MOV_Fp6432 RFP64:$src)>, +def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f32 (fround RFP80:$src)), (MOV_Fp8032 RFP80:$src)>, +def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f64 (fround RFP80:$src)), (MOV_Fp8064 RFP80:$src)>, +def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, Requires<[FPStackf64]>; diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index c4522f3..97578af 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -50,9 +50,10 @@ def NoImm : ImmType<0>; def Imm8 : ImmType<1>; def Imm8PCRel : ImmType<2>; def Imm16 : ImmType<3>; -def Imm32 : ImmType<4>; -def Imm32PCRel : ImmType<5>; -def Imm64 : ImmType<6>; +def Imm16PCRel : ImmType<4>; +def Imm32 : ImmType<5>; +def Imm32PCRel : ImmType<6>; +def Imm64 : ImmType<7>; // FPFormat - This specifies what form this FP instruction has. This is used by // the Floating-Point stackifier pass. @@ -101,6 +102,10 @@ class XS { bits<4> Prefix = 12; } class T8 { bits<4> Prefix = 13; } class TA { bits<4> Prefix = 14; } class TF { bits<4> Prefix = 15; } +class VEX { bit hasVEXPrefix = 1; } +class VEX_W { bit hasVEX_WPrefix = 1; } +class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } +class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> @@ -128,6 +133,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. Domain ExeDomain = d; + bit hasVEXPrefix = 0; // Does this inst require a VEX prefix? + bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register + // to be encoded in an immediate field? // TSFlags layout should be kept in sync with X86InstrInfo.h.
let TSFlags{5-0} = FormBits; @@ -141,6 +151,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{21-20} = SegOvrBits; let TSFlags{23-22} = ExeDomain.Value; let TSFlags{31-24} = Opcode; + let TSFlags{32} = hasVEXPrefix; + let TSFlags{33} = hasVEX_WPrefix; + let TSFlags{34} = hasVEX_4VPrefix; + let TSFlags{35} = hasVEX_i8ImmReg; } class I<bits<8> o, Format f, dag outs, dag ins, string asm, @@ -174,6 +188,13 @@ class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern> : X86Inst<o, f, Imm32PCRel, outs, ins, asm> { @@ -211,11 +232,56 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// SIi8 - SSE 1 & 2 scalar instructions +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + Domain d> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// PIi8 - SSE 1 & 2 packed instructions with immediate +class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); +} + // SSE1 Instruction Templates: // // SSI - SSE1 instructions with XS prefix. // PSI - SSE1 instructions with TB prefix. // PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. +// VSSI - SSE1 instructions with XS prefix in AVX form. +// VPSI - SSE1 instructions with TB prefix in AVX form. 
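The let statements above place the four new AVX bits at TSFlags positions 32-35, just past the opcode byte in bits 31-24; since those positions no longer fit in 32 bits, TSFlags has to be read as a 64-bit quantity from here on. A hedged C++ sketch of unpacking that layout on the consumer side (the shift constants mirror the assignments shown here; the helper names are mine, and the authoritative decoding lives in X86InstrInfo.h, which the comment says must stay in sync):

#include <cassert>
#include <cstdint>

// Bit positions copied from the TSFlags layout above.
enum : unsigned {
  OpcodeShift    = 24,  // TSFlags{31-24}
  VEXShift       = 32,  // hasVEXPrefix
  VEX_WShift     = 33,  // hasVEX_WPrefix
  VEX_4VShift    = 34,  // hasVEX_4VPrefix
  VEX_I8IMMShift = 35   // hasVEX_i8ImmReg
};

static bool hasVEXPrefix(uint64_t TSFlags) { return (TSFlags >> VEXShift) & 1; }
static bool hasVEX_4V(uint64_t TSFlags)    { return (TSFlags >> VEX_4VShift) & 1; }
static uint8_t opcode(uint64_t TSFlags)    { return (TSFlags >> OpcodeShift) & 0xFF; }

int main() {
  // e.g. a VEX.4V-encoded instruction with opcode byte 0x58
  uint64_t Flags = (0x58ull << OpcodeShift) | (1ull << VEXShift) | (1ull << VEX_4VShift);
  assert(hasVEXPrefix(Flags) && hasVEX_4V(Flags) && opcode(Flags) == 0x58);
  return 0;
}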
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; @@ -229,6 +295,14 @@ class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, Requires<[HasSSE1]>; +class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + Requires<[HasAVX]>; +class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, + Requires<[HasAVX]>; // SSE2 Instruction Templates: // @@ -237,6 +311,8 @@ class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. // PDI - SSE2 instructions with TB and OpSize prefixes. // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. +// VSDI - SSE2 instructions with XD prefix in AVX form. +// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>; @@ -253,6 +329,14 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, Requires<[HasSSE2]>; +class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD, + Requires<[HasAVX]>; +class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, + OpSize, Requires<[HasAVX]>; // SSE3 Instruction Templates: // diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 6b9478d..71c4e8b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -60,3 +60,339 @@ def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N)); }], MMX_SHUFFLE_get_shuf_imm>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, + SDTCisFP<0>, SDTCisInt<2> ]>; +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>]>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insrtps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad]>; +def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>; +def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>; +def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>; +def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>; +def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>; +def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; +def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; +def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. 
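The sse_load_f32/sse_load_f64 patterns defined next match these scalar 'extloads': one element is read from memory into the low lane of the vector and the remaining lanes are zeroed, which is the memory form of the 'ss'/'sd' instructions. A scalar C++ model of the value semantics (the function and types are illustrative only, not LLVM API):

#include <array>
#include <cassert>

// Models the SSE scalar extload: load one float into lane 0 of a
// 4-lane vector and zero the upper three lanes.
static std::array<float, 4> scalarExtload(const float *Mem) {
  return {Mem[0], 0.0f, 0.0f, 0.0f};
}

int main() {
  float X = 3.5f;
  std::array<float, 4> V = scalarExtload(&X);
  assert(V[0] == 3.5f && V[1] == 0.0f && V[2] == 0.0f && V[3] == 0.0f);
  return 0;
}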
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +// FIXME: move this to a more appropriate place after all AVX is done. +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; + +// Like 'store', but always requires vector alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'load', but always requires vector alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def alignedloadfsf32 : PatFrag<(ops node:$ptr), + (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), + (f64 (alignedload node:$ptr))>; +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedload node:$ptr))>; +def alignedloadv4i32 : PatFrag<(ops node:$ptr), + (v4i32 (alignedload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedload node:$ptr))>; + +// FIXME: move this to a more appropriate place after all AVX is done. +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedload node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedload node:$ptr))>; +def alignedloadv8i32 : PatFrag<(ops node:$ptr), + (v8i32 (alignedload node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedload node:$ptr))>; + +// Like 'load', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. 
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasVectorUAMem() + || cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; +def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; + +// FIXME: move this to a more appropriate place after all AVX is done. +def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; +def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; + +// SSSE3 uses MMX registers for some instructions. They aren't aligned on a +// 16-byte boundary. +// FIXME: 8 byte alignment for mmx reads is not required +def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + +def memopv8i8 : PatFrag<(ops node:$ptr), (v8i8 (memop64 node:$ptr))>; +def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>; +def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>; +def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>; + +// MOVNT Support +// Like 'store', but requires the non-temporal bit to be set +def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal(); + return false; +}]>; + +def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED && + ST->getAlignment() >= 16; + return false; +}]>; + +def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && + ST->getAlignment() < 16; + return false; +}]>; + +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3); +}]>; + +// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*, +// SHUFP* etc. imm. 
+def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShuffleSHUFImmediate(N)); +}]>; + +// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to +// PSHUFHW imm. +def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePSHUFHWImmediate(N)); +}]>; + +// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to +// PSHUFLW imm. +def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); +}]>; + +// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to +// a PALIGNR imm. +def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePALIGNRImmediate(N)); +}]>; + +def splat_lo : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return SVOp->isSplat() && SVOp->getSplatIndex() == 0; +}]>; + +def movddup : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movhlps : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movlhps : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movlp : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movl : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movshdup : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movsldup : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckl : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckh : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def pshufd : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_shuf_imm>; + +def shufp : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_shuf_imm>; + +def pshufhw : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_pshufhw_imm>; + +def pshuflw : PatFrag<(ops node:$lhs, 
node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_pshuflw_imm>; + +def palign : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_palign_imm>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 34e12ca..ce471ea 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -784,7 +784,9 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: + case X86::MOV32rm_TC: case X86::MOV64rm: + case X86::MOV64rm_TC: case X86::LD_Fp64m: case X86::MOVSSrm: case X86::MOVSDrm: @@ -805,7 +807,9 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::MOV8mr: case X86::MOV16mr: case X86::MOV32mr: + case X86::MOV32mr_TC: case X86::MOV64mr: + case X86::MOV64mr_TC: case X86::ST_FpP64m: case X86::MOVSSmr: case X86::MOVSDmr: @@ -863,7 +867,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { if (isFrameStoreOpcode(MI->getOpcode())) if (isFrameOperand(MI, 0, FrameIndex)) - return MI->getOperand(X86AddrNumOperands).getReg(); + return MI->getOperand(X86::AddrNumOperands).getReg(); return 0; } @@ -1064,14 +1068,9 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo &TRI) const { DebugLoc DL = Orig->getDebugLoc(); - if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) { - DestReg = TRI->getSubReg(DestReg, SubIdx); - SubIdx = 0; - } - // MOV32r0 etc. are implemented with xor which clobbers condition code. // Re-materialize them as movri instructions to avoid side effects. bool Clone = true; @@ -1098,14 +1097,13 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, if (Clone) { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->getOperand(0).setReg(DestReg); MBB.insert(I, MI); } else { - BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0); + BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0); } MachineInstr *NewMI = prior(I); - NewMI->getOperand(0).setSubReg(SubIdx); + NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); } /// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that @@ -1151,10 +1149,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // least on modern x86 machines). 
BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); MachineInstr *InsMI = - BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg) - .addReg(leaInReg) - .addReg(Src, getKillRegState(isKill)) - .addImm(X86::sub_16bit); + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg, RegState::Define, X86::sub_16bit) + .addReg(Src, getKillRegState(isKill)); MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(Opc), leaOutReg); @@ -1165,20 +1162,20 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::SHL16ri: { unsigned ShAmt = MI->getOperand(2).getImm(); MIB.addReg(0).addImm(1 << ShAmt) - .addReg(leaInReg, RegState::Kill).addImm(0); + .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0); break; } case X86::INC16r: case X86::INC64_16r: - addLeaRegOffset(MIB, leaInReg, true, 1); + addRegOffset(MIB, leaInReg, true, 1); break; case X86::DEC16r: case X86::DEC64_16r: - addLeaRegOffset(MIB, leaInReg, true, -1); + addRegOffset(MIB, leaInReg, true, -1); break; case X86::ADD16ri: case X86::ADD16ri8: - addLeaRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); + addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); break; case X86::ADD16rr: { unsigned Src2 = MI->getOperand(2).getReg(); @@ -1195,10 +1192,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // well be shifting and then extracting the lower 16-bits. BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); InsMI2 = - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg2) - .addReg(leaInReg2) - .addReg(Src2, getKillRegState(isKill2)) - .addImm(X86::sub_16bit); + BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg2, RegState::Define, X86::sub_16bit) + .addReg(Src2, getKillRegState(isKill2)); addRegReg(MIB, leaInReg, true, leaInReg2, true); } if (LV && isKill2 && InsMI2) @@ -1209,10 +1205,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineInstr *NewMI = MIB; MachineInstr *ExtMI = - BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG)) + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(leaOutReg, RegState::Kill) - .addImm(X86::sub_16bit); + .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); if (LV) { // Update live variables @@ -1283,7 +1278,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(Dest, RegState::Define | getDeadRegState(isDead)) .addReg(0).addImm(1 << ShAmt) .addReg(Src, getKillRegState(isKill)) - .addImm(0); + .addImm(0).addReg(0); break; } case X86::SHL32ri: { @@ -1297,7 +1292,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)) .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)).addImm(0); + .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0); break; } case X86::SHL16ri: { @@ -1313,7 +1308,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(Dest, RegState::Define | getDeadRegState(isDead)) .addReg(0).addImm(1 << ShAmt) .addReg(Src, getKillRegState(isKill)) - .addImm(0); + .addImm(0).addReg(0); break; } default: { @@ -1331,7 +1326,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? 
X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, 1); @@ -1353,7 +1348,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, -1); @@ -1401,7 +1396,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD64ri32: case X86::ADD64ri8: assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, MI->getOperand(2).getImm()); @@ -1410,7 +1405,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri8: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, MI->getOperand(2).getImm()); @@ -1421,7 +1416,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, MI->getOperand(2).getImm()); @@ -1845,9 +1840,8 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc operand - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && @@ -1856,7 +1850,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (Cond.empty()) { // Unconditional branch? assert(!FBB && "Unconditional branch with multiple successors!"); - BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB); return 1; } @@ -1866,27 +1860,27 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, switch (CC) { case X86::COND_NP_OR_E: // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB); ++Count; - BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB); ++Count; break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. 
- BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB); ++Count; - BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB); ++Count; break; default: { unsigned Opc = GetCondBranchFromCond(CC); - BuildMI(&MBB, dl, get(Opc)).addMBB(TBB); + BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } if (FBB) { // Two-way Conditional branch. Insert the second branch. - BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(FBB); + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB); ++Count; } return Count; @@ -1897,237 +1891,153 @@ static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } -bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - - // Determine if DstRC and SrcRC have a common superclass in common. - const TargetRegisterClass *CommonRC = DestRC; - if (DestRC == SrcRC) - /* Source and destination have the same register class. */; - else if (CommonRC->hasSuperClass(SrcRC)) - CommonRC = SrcRC; - else if (!DestRC->hasSubClass(SrcRC)) { - // Neither of GR64_NOREX or GR64_NOSP is a superclass of the other, - // but we want to copy them as GR64. Similarly, for GR32_NOREX and - // GR32_NOSP, copy as GR32. - if (SrcRC->hasSuperClass(&X86::GR64RegClass) && - DestRC->hasSuperClass(&X86::GR64RegClass)) - CommonRC = &X86::GR64RegClass; - else if (SrcRC->hasSuperClass(&X86::GR32RegClass) && - DestRC->hasSuperClass(&X86::GR32RegClass)) - CommonRC = &X86::GR32RegClass; +void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // First deal with the normal symmetric copies. + unsigned Opc = 0; + if (X86::GR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV64rr; + else if (X86::GR32RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV32rr; + else if (X86::GR16RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV16rr; + else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if ((isHReg(DestReg) || isHReg(SrcReg)) && + TM.getSubtarget<X86Subtarget>().is64Bit()) + Opc = X86::MOV8rr_NOREX; else - CommonRC = 0; - } - - if (CommonRC) { - unsigned Opc; - if (CommonRC == &X86::GR64RegClass || CommonRC == &X86::GR64_NOSPRegClass) { - Opc = X86::MOV64rr; - } else if (CommonRC == &X86::GR32RegClass || - CommonRC == &X86::GR32_NOSPRegClass) { - Opc = X86::MOV32rr; - } else if (CommonRC == &X86::GR16RegClass) { - Opc = X86::MOV16rr; - } else if (CommonRC == &X86::GR8RegClass) { - // Copying to or from a physical H register on x86-64 requires a NOREX - // move. Otherwise use a normal move. 
- if ((isHReg(DestReg) || isHReg(SrcReg)) && - TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8rr_NOREX; - else - Opc = X86::MOV8rr; - } else if (CommonRC == &X86::GR64_ABCDRegClass) { - Opc = X86::MOV64rr; - } else if (CommonRC == &X86::GR32_ABCDRegClass) { - Opc = X86::MOV32rr; - } else if (CommonRC == &X86::GR16_ABCDRegClass) { - Opc = X86::MOV16rr; - } else if (CommonRC == &X86::GR8_ABCD_LRegClass) { Opc = X86::MOV8rr; - } else if (CommonRC == &X86::GR8_ABCD_HRegClass) { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8rr_NOREX; - else - Opc = X86::MOV8rr; - } else if (CommonRC == &X86::GR64_NOREXRegClass || - CommonRC == &X86::GR64_NOREX_NOSPRegClass) { - Opc = X86::MOV64rr; - } else if (CommonRC == &X86::GR32_NOREXRegClass) { - Opc = X86::MOV32rr; - } else if (CommonRC == &X86::GR16_NOREXRegClass) { - Opc = X86::MOV16rr; - } else if (CommonRC == &X86::GR8_NOREXRegClass) { - Opc = X86::MOV8rr; - } else if (CommonRC == &X86::GR64_TCRegClass) { - Opc = X86::MOV64rr_TC; - } else if (CommonRC == &X86::GR32_TCRegClass) { - Opc = X86::MOV32rr_TC; - } else if (CommonRC == &X86::RFP32RegClass) { - Opc = X86::MOV_Fp3232; - } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) { - Opc = X86::MOV_Fp6464; - } else if (CommonRC == &X86::RFP80RegClass) { - Opc = X86::MOV_Fp8080; - } else if (CommonRC == &X86::FR32RegClass) { - Opc = X86::FsMOVAPSrr; - } else if (CommonRC == &X86::FR64RegClass) { - Opc = X86::FsMOVAPDrr; - } else if (CommonRC == &X86::VR128RegClass) { - Opc = X86::MOVAPSrr; - } else if (CommonRC == &X86::VR64RegClass) { - Opc = X86::MMX_MOVQ64rr; - } else { - return false; - } - BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg); - return true; + } else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOVAPSrr; + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + + if (Opc) { + BuildMI(MBB, MI, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } // Moving EFLAGS to / from another register requires a push and a pop. - if (SrcRC == &X86::CCRRegClass) { - if (SrcReg != X86::EFLAGS) - return false; - if (DestRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) { + if (SrcReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF64)); BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); - return true; - } else if (DestRC == &X86::GR32RegClass || - DestRC == &X86::GR32_NOSPRegClass) { + return; + } else if (X86::GR32RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF32)); BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return true; + return; } - } else if (DestRC == &X86::CCRRegClass) { - if (DestReg != X86::EFLAGS) - return false; - if (SrcRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg); + } + if (DestReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH64r)) + .addReg(SrcReg, getKillRegState(KillSrc)); BuildMI(MBB, MI, DL, get(X86::POPF64)); - return true; - } else if (SrcRC == &X86::GR32RegClass || - DestRC == &X86::GR32_NOSPRegClass) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg); + return; + } else if (X86::GR32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH32r)) + .addReg(SrcReg, getKillRegState(KillSrc)); BuildMI(MBB, MI, DL, get(X86::POPF32)); - return true; - } - } - - // Moving from ST(0) turns into FpGET_ST0_32 etc. 
- if (SrcRC == &X86::RSTRegClass) { - // Copying from ST(0)/ST(1). - if (SrcReg != X86::ST0 && SrcReg != X86::ST1) - // Can only copy from ST(0)/ST(1) right now - return false; - bool isST0 = SrcReg == X86::ST0; - unsigned Opc; - if (DestRC == &X86::RFP32RegClass) - Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32; - else if (DestRC == &X86::RFP64RegClass) - Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64; - else { - if (DestRC != &X86::RFP80RegClass) - return false; - Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80; + return; } - BuildMI(MBB, MI, DL, get(Opc), DestReg); - return true; } - // Moving to ST(0) turns into FpSET_ST0_32 etc. - if (DestRC == &X86::RSTRegClass) { - // Copying to ST(0) / ST(1). - if (DestReg != X86::ST0 && DestReg != X86::ST1) - // Can only copy to TOS right now - return false; - bool isST0 = DestReg == X86::ST0; - unsigned Opc; - if (SrcRC == &X86::RFP32RegClass) - Opc = isST0 ? X86::FpSET_ST0_32 : X86::FpSET_ST1_32; - else if (SrcRC == &X86::RFP64RegClass) - Opc = isST0 ? X86::FpSET_ST0_64 : X86::FpSET_ST1_64; - else { - if (SrcRC != &X86::RFP80RegClass) - return false; - Opc = isST0 ? X86::FpSET_ST0_80 : X86::FpSET_ST1_80; - } - BuildMI(MBB, MI, DL, get(Opc)).addReg(SrcReg); - return true; - } - - // Not yet supported! - return false; + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + llvm_unreachable("Cannot emit physreg copy instruction"); } -static unsigned getStoreRegOpcode(unsigned SrcReg, - const TargetRegisterClass *RC, - bool isStackAligned, - TargetMachine &TM) { - unsigned Opc = 0; - if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) { - Opc = X86::MOV64mr; - } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) { - Opc = X86::MOV32mr; - } else if (RC == &X86::GR16RegClass) { - Opc = X86::MOV16mr; - } else if (RC == &X86::GR8RegClass) { +static unsigned getLoadStoreRegOpcode(unsigned Reg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM, + bool load) { + switch (RC->getID()) { + default: + llvm_unreachable("Unknown regclass"); + case X86::GR64RegClassID: + case X86::GR64_NOSPRegClassID: + return load ? X86::MOV64rm : X86::MOV64mr; + case X86::GR32RegClassID: + case X86::GR32_NOSPRegClassID: + case X86::GR32_ADRegClassID: + return load ? X86::MOV32rm : X86::MOV32mr; + case X86::GR16RegClassID: + return load ? X86::MOV16rm : X86::MOV16mr; + case X86::GR8RegClassID: // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. - if (isHReg(SrcReg) && + if (isHReg(Reg) && TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8mr_NOREX; + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; else - Opc = X86::MOV8mr; - } else if (RC == &X86::GR64_ABCDRegClass) { - Opc = X86::MOV64mr; - } else if (RC == &X86::GR32_ABCDRegClass) { - Opc = X86::MOV32mr; - } else if (RC == &X86::GR16_ABCDRegClass) { - Opc = X86::MOV16mr; - } else if (RC == &X86::GR8_ABCD_LRegClass) { - Opc = X86::MOV8mr; - } else if (RC == &X86::GR8_ABCD_HRegClass) { + return load ? X86::MOV8rm : X86::MOV8mr; + case X86::GR64_ABCDRegClassID: + return load ? X86::MOV64rm : X86::MOV64mr; + case X86::GR32_ABCDRegClassID: + return load ? X86::MOV32rm : X86::MOV32mr; + case X86::GR16_ABCDRegClassID: + return load ? X86::MOV16rm : X86::MOV16mr; + case X86::GR8_ABCD_LRegClassID: + return load ? 
X86::MOV8rm :X86::MOV8mr; + case X86::GR8_ABCD_HRegClassID: if (TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8mr_NOREX; + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; else - Opc = X86::MOV8mr; - } else if (RC == &X86::GR64_NOREXRegClass || - RC == &X86::GR64_NOREX_NOSPRegClass) { - Opc = X86::MOV64mr; - } else if (RC == &X86::GR32_NOREXRegClass) { - Opc = X86::MOV32mr; - } else if (RC == &X86::GR16_NOREXRegClass) { - Opc = X86::MOV16mr; - } else if (RC == &X86::GR8_NOREXRegClass) { - Opc = X86::MOV8mr; - } else if (RC == &X86::GR64_TCRegClass) { - Opc = X86::MOV64mr_TC; - } else if (RC == &X86::GR32_TCRegClass) { - Opc = X86::MOV32mr_TC; - } else if (RC == &X86::RFP80RegClass) { - Opc = X86::ST_FpP80m; // pops - } else if (RC == &X86::RFP64RegClass) { - Opc = X86::ST_Fp64m; - } else if (RC == &X86::RFP32RegClass) { - Opc = X86::ST_Fp32m; - } else if (RC == &X86::FR32RegClass) { - Opc = X86::MOVSSmr; - } else if (RC == &X86::FR64RegClass) { - Opc = X86::MOVSDmr; - } else if (RC == &X86::VR128RegClass) { + return load ? X86::MOV8rm : X86::MOV8mr; + case X86::GR64_NOREXRegClassID: + case X86::GR64_NOREX_NOSPRegClassID: + return load ? X86::MOV64rm : X86::MOV64mr; + case X86::GR32_NOREXRegClassID: + return load ? X86::MOV32rm : X86::MOV32mr; + case X86::GR16_NOREXRegClassID: + return load ? X86::MOV16rm : X86::MOV16mr; + case X86::GR8_NOREXRegClassID: + return load ? X86::MOV8rm : X86::MOV8mr; + case X86::GR64_TCRegClassID: + return load ? X86::MOV64rm_TC : X86::MOV64mr_TC; + case X86::GR32_TCRegClassID: + return load ? X86::MOV32rm_TC : X86::MOV32mr_TC; + case X86::RFP80RegClassID: + return load ? X86::LD_Fp80m : X86::ST_FpP80m; + case X86::RFP64RegClassID: + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + case X86::RFP32RegClassID: + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + case X86::FR32RegClassID: + return load ? X86::MOVSSrm : X86::MOVSSmr; + case X86::FR64RegClassID: + return load ? X86::MOVSDrm : X86::MOVSDmr; + case X86::VR128RegClassID: // If stack is realigned we can use aligned stores. - Opc = isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr; - } else if (RC == &X86::VR64RegClass) { - Opc = X86::MMX_MOVQ64mr; - } else { - llvm_unreachable("Unknown regclass"); + if (isStackAligned) + return load ? X86::MOVAPSrm : X86::MOVAPSmr; + else + return load ? X86::MOVUPSrm : X86::MOVUPSmr; + case X86::VR64RegClassID: + return load ? 
X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; } +} + +static unsigned getStoreRegOpcode(unsigned SrcReg, + const TargetRegisterClass *RC, + bool isStackAligned, + TargetMachine &TM) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); +} - return Opc; + +static unsigned getLoadRegOpcode(unsigned DestReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -2150,7 +2060,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - bool isAligned = (*MMOBegin)->getAlignment() >= 16; + bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); @@ -2161,72 +2071,6 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, NewMIs.push_back(MIB); } -static unsigned getLoadRegOpcode(unsigned DestReg, - const TargetRegisterClass *RC, - bool isStackAligned, - const TargetMachine &TM) { - unsigned Opc = 0; - if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) { - Opc = X86::MOV64rm; - } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) { - Opc = X86::MOV32rm; - } else if (RC == &X86::GR16RegClass) { - Opc = X86::MOV16rm; - } else if (RC == &X86::GR8RegClass) { - // Copying to or from a physical H register on x86-64 requires a NOREX - // move. Otherwise use a normal move. - if (isHReg(DestReg) && - TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8rm_NOREX; - else - Opc = X86::MOV8rm; - } else if (RC == &X86::GR64_ABCDRegClass) { - Opc = X86::MOV64rm; - } else if (RC == &X86::GR32_ABCDRegClass) { - Opc = X86::MOV32rm; - } else if (RC == &X86::GR16_ABCDRegClass) { - Opc = X86::MOV16rm; - } else if (RC == &X86::GR8_ABCD_LRegClass) { - Opc = X86::MOV8rm; - } else if (RC == &X86::GR8_ABCD_HRegClass) { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) - Opc = X86::MOV8rm_NOREX; - else - Opc = X86::MOV8rm; - } else if (RC == &X86::GR64_NOREXRegClass || - RC == &X86::GR64_NOREX_NOSPRegClass) { - Opc = X86::MOV64rm; - } else if (RC == &X86::GR32_NOREXRegClass) { - Opc = X86::MOV32rm; - } else if (RC == &X86::GR16_NOREXRegClass) { - Opc = X86::MOV16rm; - } else if (RC == &X86::GR8_NOREXRegClass) { - Opc = X86::MOV8rm; - } else if (RC == &X86::GR64_TCRegClass) { - Opc = X86::MOV64rm_TC; - } else if (RC == &X86::GR32_TCRegClass) { - Opc = X86::MOV32rm_TC; - } else if (RC == &X86::RFP80RegClass) { - Opc = X86::LD_Fp80m; - } else if (RC == &X86::RFP64RegClass) { - Opc = X86::LD_Fp64m; - } else if (RC == &X86::RFP32RegClass) { - Opc = X86::LD_Fp32m; - } else if (RC == &X86::FR32RegClass) { - Opc = X86::MOVSSrm; - } else if (RC == &X86::FR64RegClass) { - Opc = X86::MOVSDrm; - } else if (RC == &X86::VR128RegClass) { - // If stack is realigned we can use aligned loads. - Opc = isStackAligned ? 
@@ -2161,72 +2071,6 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
   NewMIs.push_back(MIB);
 }
 
-static unsigned getLoadRegOpcode(unsigned DestReg,
-                                 const TargetRegisterClass *RC,
-                                 bool isStackAligned,
-                                 const TargetMachine &TM) {
-  unsigned Opc = 0;
-  if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16RegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8RegClass) {
-    // Copying to or from a physical H register on x86-64 requires a NOREX
-    // move.  Otherwise use a normal move.
-    if (isHReg(DestReg) &&
-        TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8rm_NOREX;
-    else
-      Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR64_ABCDRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32_ABCDRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16_ABCDRegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8_ABCD_LRegClass) {
-    Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR8_ABCD_HRegClass) {
-    if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      Opc = X86::MOV8rm_NOREX;
-    else
-      Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR64_NOREXRegClass ||
-             RC == &X86::GR64_NOREX_NOSPRegClass) {
-    Opc = X86::MOV64rm;
-  } else if (RC == &X86::GR32_NOREXRegClass) {
-    Opc = X86::MOV32rm;
-  } else if (RC == &X86::GR16_NOREXRegClass) {
-    Opc = X86::MOV16rm;
-  } else if (RC == &X86::GR8_NOREXRegClass) {
-    Opc = X86::MOV8rm;
-  } else if (RC == &X86::GR64_TCRegClass) {
-    Opc = X86::MOV64rm_TC;
-  } else if (RC == &X86::GR32_TCRegClass) {
-    Opc = X86::MOV32rm_TC;
-  } else if (RC == &X86::RFP80RegClass) {
-    Opc = X86::LD_Fp80m;
-  } else if (RC == &X86::RFP64RegClass) {
-    Opc = X86::LD_Fp64m;
-  } else if (RC == &X86::RFP32RegClass) {
-    Opc = X86::LD_Fp32m;
-  } else if (RC == &X86::FR32RegClass) {
-    Opc = X86::MOVSSrm;
-  } else if (RC == &X86::FR64RegClass) {
-    Opc = X86::MOVSDrm;
-  } else if (RC == &X86::VR128RegClass) {
-    // If stack is realigned we can use aligned loads.
-    Opc = isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
-  } else if (RC == &X86::VR64RegClass) {
-    Opc = X86::MMX_MOVQ64rm;
-  } else {
-    llvm_unreachable("Unknown regclass");
-  }
-
-  return Opc;
-}
 
 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
@@ -2246,7 +2090,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@@ -2277,18 +2121,17 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i-1].getReg();
-    const TargetRegisterClass *RegClass = CSI[i-1].getRegClass();
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
     if (Reg == FPReg)
       // X86RegisterInfo::emitPrologue will handle spilling of frame register.
       continue;
-    if (RegClass != &X86::VR128RegClass && !isWin64) {
+    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
       CalleeFrameSize += SlotSize;
       BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill);
     } else {
-      storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), RegClass,
-                          &RI);
+      storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
+                          &X86::VR128RegClass, &RI);
     }
   }
 
@@ -2315,11 +2158,11 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
     if (Reg == FPReg)
       // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
       continue;
-    const TargetRegisterClass *RegClass = CSI[i].getRegClass();
-    if (RegClass != &X86::VR128RegClass && !isWin64) {
+    if (!X86::VR128RegClass.contains(Reg) && !isWin64) {
       BuildMI(MBB, MI, DL, get(Opc), Reg);
     } else {
-      loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass, &RI);
+      loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
+                           &X86::VR128RegClass, &RI);
     }
   }
   return true;
@@ -2492,7 +2335,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   }
 
   // No fusion
-  if (PrintFailedFusing)
+  if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
   return NULL;
 }
@@ -2610,7 +2453,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   } else if (Ops.size() != 1)
     return NULL;
 
-  SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
   switch (LoadMI->getOpcode()) {
   case X86::V_SET0PS:
   case X86::V_SET0PD:
@@ -2632,7 +2475,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     if (TM.getSubtarget<X86Subtarget>().is64Bit())
       PICBase = X86::RIP;
     else
-      // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+      // FIXME: PICBase = getGlobalBaseReg(&MF);
       // This doesn't work for several reasons.
       // 1. GlobalBaseReg may have been spilled.
       // 2. It may not be live at MI.
@@ -2664,7 +2507,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   default: {
     // Folding a normal load.  Just copy the load's address operands.
     unsigned NumOps = LoadMI->getDesc().getNumOperands();
-    for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
       MOs.push_back(LoadMI->getOperand(i));
     break;
   }
@@ -2727,7 +2570,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
     if (I != OpcodeTablePtr->end())
       return true;
   }
-  return false;
+  return TargetInstrInfoImpl::canFoldMemoryOperand(MI, Ops);
 }
 
 bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
@@ -2751,13 +2594,20 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   const TargetInstrDesc &TID = get(Opc);
   const TargetOperandInfo &TOI = TID.OpInfo[Index];
   const TargetRegisterClass *RC = TOI.getRegClass(&RI);
-  SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
+  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
   SmallVector<MachineOperand,2> BeforeOps;
   SmallVector<MachineOperand,2> AfterOps;
   SmallVector<MachineOperand,4> ImpOps;
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &Op = MI->getOperand(i);
-    if (i >= Index && i < Index + X86AddrNumOperands)
+    if (i >= Index && i < Index + X86::AddrNumOperands)
       AddrOps.push_back(Op);
     else if (Op.isReg() && Op.isImplicit())
       ImpOps.push_back(Op);
@@ -2776,7 +2626,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
     loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
   if (UnfoldStore) {
     // Address operands cannot be marked isKill.
-    for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+    for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
       MachineOperand &MO = NewMIs[0]->getOperand(i);
       if (MO.isReg())
         MO.setIsKill(false);
@@ -2873,7 +2723,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   unsigned NumOps = N->getNumOperands();
   for (unsigned i = 0; i != NumOps-1; ++i) {
     SDValue Op = N->getOperand(i);
-    if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
       AddrOps.push_back(Op);
     else if (i < Index-NumDefs)
       BeforeOps.push_back(Op);
@@ -2892,7 +2742,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
            MachineInstr::mmo_iterator> MMOs =
       MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                             cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
                               VT, MVT::Other, &AddrOps[0], AddrOps.size());
     NewNodes.push_back(Load);
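
The new bail-outs here all apply one policy: use the aligned vector opcode when 16-byte alignment is proven, the unaligned one when the subtarget handles it cheaply, and refuse to unfold at all when alignment is unknown and unaligned access is slow. A hedged standalone sketch of that decision, with illustrative names:

    enum VecLoadOpc { MOVAPS /* aligned */, MOVUPS /* unaligned */ };

    // KnownAlign of 0 means "no memoperand, alignment unknown".
    // Returns false when the transformation should be refused outright.
    static bool chooseVectorLoad(unsigned KnownAlign, bool UnalignedIsFast,
                                 VecLoadOpc &Opc) {
      if (KnownAlign >= 16) { Opc = MOVAPS; return true; }
      if (UnalignedIsFast)  { Opc = MOVUPS; return true; }
      return false; // keep the value in a register instead
    }
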
@@ -2929,7 +2784,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
            MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                              cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
                                                          isAligned, TM),
                                        dl, MVT::Other,
@@ -3065,16 +2925,16 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
 
   EVT VT = Load1->getValueType(0);
   switch (VT.getSimpleVT().SimpleTy) {
-  default: {
+  default:
     // XMM registers. In 64-bit mode we can be a bit more aggressive since we
     // have 16 of them to play with.
     if (TM.getSubtargetImpl()->is64Bit()) {
       if (NumLoads >= 3)
         return false;
-    } else if (NumLoads)
+    } else if (NumLoads) {
       return false;
+    }
     break;
-  }
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
@@ -3083,6 +2943,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   case MVT::f64:
     if (NumLoads)
       return false;
+    break;
   }
 
   return true;
@@ -3123,6 +2984,8 @@ bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
   case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
   case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
   case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+  case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
+  case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
     return true;
   }
   return false;
@@ -3194,7 +3057,7 @@ unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
     case X86II::MRM4m: case X86II::MRM5m:
     case X86II::MRM6m: case X86II::MRM7m:
     case X86II::MRMDestMem: {
-      unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+      unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
       i = isTwoAddr ? 1 : 0;
       if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
         REX |= 1 << 2;
@@ -3546,7 +3409,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
   case X86II::MRMDestMem: {
     ++FinalSize;
     FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
-    CurOp += X86AddrNumOperands + 1;
+    CurOp += X86::AddrNumOperands + 1;
     if (CurOp != NumOps) {
       ++CurOp;
       FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
@@ -3565,16 +3428,9 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
     break;
 
   case X86II::MRMSrcMem: {
-    int AddrOperands;
-    if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
-        Opcode == X86::LEA16r || Opcode == X86::LEA32r)
-      AddrOperands = X86AddrNumOperands - 1; // No segment register
-    else
-      AddrOperands = X86AddrNumOperands;
-
     ++FinalSize;
     FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
-    CurOp += AddrOperands + 1;
+    CurOp += X86::AddrNumOperands + 1;
     if (CurOp != NumOps) {
       ++CurOp;
       FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
@@ -3628,7 +3484,7 @@ static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
 
     ++FinalSize;
     FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
-    CurOp += X86AddrNumOperands;
+    CurOp += X86::AddrNumOperands;
 
     if (CurOp != NumOps) {
       const MachineOperand &MO = MI.getOperand(CurOp++);
@@ -3694,6 +3550,8 @@ unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
 /// the global base register value. Output instructions required to
 /// initialize the register in the function entry block, if necessary.
 ///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
 unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
          "X86-64 PIC uses RIP relative addressing");
@@ -3703,30 +3561,10 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   if (GlobalBaseReg != 0)
     return GlobalBaseReg;
 
-  // Insert the set of GlobalBaseReg into the first MBB of the function
-  MachineBasicBlock &FirstMBB = MF->front();
-  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
-  DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
-
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  // Operand of MovePCtoStack is completely ignored by asm printer. It's
-  // only used in JIT code emission as displacement to pc.
-  BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
-
-  // If we're using vanilla 'GOT' PIC style, we should use relative addressing
-  // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
-  if (TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
-    GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
-    // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
-    BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
-      .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
-                                    X86II::MO_GOT_ABSOLUTE_ADDRESS);
-  } else {
-    GlobalBaseReg = PC;
-  }
-
+  GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
   X86FI->setGlobalBaseReg(GlobalBaseReg);
   return GlobalBaseReg;
 }
@@ -3784,3 +3622,65 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
   NopInst.setOpcode(X86::NOOP);
 }
 
+namespace {
+  /// CGBR - Create Global Base Reg pass. This initializes the PIC
+  /// global base register for x86-32.
+  struct CGBR : public MachineFunctionPass {
+    static char ID;
+    CGBR() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      const X86TargetMachine *TM =
+        static_cast<const X86TargetMachine *>(&MF.getTarget());
+
+      assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
+             "X86-64 PIC uses RIP relative addressing");
+
+      // Only emit a global base reg in PIC mode.
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      // Insert the set of GlobalBaseReg into the first MBB of the function
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      unsigned PC;
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+        PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+      else
+        PC = TII->getGlobalBaseReg(&MF);
+
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative
+      // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+        unsigned GlobalBaseReg = TII->getGlobalBaseReg(&MF);
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
+      }
+
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "X86 PIC Global Base Reg Initialization";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createGlobalBaseRegPass() { return new CGBR(); }
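
After this change, getGlobalBaseReg only reserves a virtual register and records it in X86MachineFunctionInfo; the defining MOVPC32r/ADD32ri sequence is emitted once per function by the CGBR pass above. A minimal sketch of the create-now, define-later idiom, with invented names rather than the LLVM classes:

    // Create-on-demand, define-later: the getter hands out a stable id;
    // a separate pass emits the defining code exactly once per function.
    struct FunctionInfo {
      unsigned GlobalBaseReg;
      FunctionInfo() : GlobalBaseReg(0) {}       // 0 = "not allocated yet"
    };
    struct RegAlloc {
      unsigned Next;
      RegAlloc() : Next(1) {}
      unsigned create() { return Next++; }
    };

    unsigned getGlobalBaseReg(FunctionInfo &FI, RegAlloc &RA) {
      if (FI.GlobalBaseReg == 0)
        FI.GlobalBaseReg = RA.create();          // no code emitted here...
      return FI.GlobalBaseReg;                   // ...CGBR initializes it later
    }
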
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 62d7c74..f762b58 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -24,6 +24,24 @@ namespace llvm {
   class X86TargetMachine;
 
 namespace X86 {
+  // Enums for memory operand decoding.  Each memory operand is represented with
+  // a 5 operand sequence in the form:
+  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+  // These enums help decode this.
+  enum {
+    AddrBaseReg = 0,
+    AddrScaleAmt = 1,
+    AddrIndexReg = 2,
+    AddrDisp = 3,
+
+    /// AddrSegmentReg - The operand # of the segment in the memory operand.
+    AddrSegmentReg = 4,
+
+    /// AddrNumOperands - Total number of operands in a memory reference.
+    AddrNumOperands = 5
+  };
+
   // X86 specific condition code. These correspond to X86_*_COND in
   // X86InstrInfo.td. They must be kept in synch.
   enum CondCode {
@@ -173,7 +191,19 @@ namespace X86II {
     /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
     /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
     /// stub.
-    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE
+    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+    /// MO_TLVP - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// This is the TLS offset for the Darwin TLS mechanism.
+    MO_TLVP,
+
+    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+    /// is some TLS offset from the picbase.
+    ///
+    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+    MO_TLVP_PIC_BASE
   };
 }
 
@@ -203,6 +233,7 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
   case X86II::MO_PIC_BASE_OFFSET:                // Darwin local global.
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Darwin/32 external global.
   case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+  case X86II::MO_TLVP:                           // ??? Pretty sure..
    return true;
   default:
     return false;
@@ -347,9 +378,10 @@ namespace X86II {
     Imm8 = 1 << ImmShift,
     Imm8PCRel = 2 << ImmShift,
     Imm16 = 3 << ImmShift,
-    Imm32 = 4 << ImmShift,
-    Imm32PCRel = 5 << ImmShift,
-    Imm64 = 6 << ImmShift,
+    Imm16PCRel = 4 << ImmShift,
+    Imm32 = 5 << ImmShift,
+    Imm32PCRel = 6 << ImmShift,
+    Imm64 = 7 << ImmShift,
 
     //===------------------------------------------------------------------===//
     // FP Instruction Classification...  Zero is non-fp instruction.
@@ -403,28 +435,47 @@ namespace X86II {
     SSEDomainShift = 22,
 
     OpcodeShift   = 24,
-    OpcodeMask    = 0xFF << OpcodeShift
+    OpcodeMask    = 0xFF << OpcodeShift,
+
+    //===------------------------------------------------------------------===//
+    // VEX - The opcode prefix used by AVX instructions
+    VEX         = 1ULL << 32,
+
+    // VEX_W - Has a opcode specific functionality, but is used in the same
+    // way as REX_W is for regular SSE instructions.
+    VEX_W       = 1ULL << 33,
+
+    // VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    // address instructions in SSE are represented as 3 address ones in AVX
+    // and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4V      = 1ULL << 34,
+
+    // VEX_I8IMM - Specifies that the last register used in a AVX instruction,
+    // must be encoded in the i8 immediate field. This usually happens in
+    // instructions with 4 operands.
+    VEX_I8IMM   = 1ULL << 35
   };
 
   // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
   // specified machine instruction.
   //
-  static inline unsigned char getBaseOpcodeFor(unsigned TSFlags) {
+  static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
    return TSFlags >> X86II::OpcodeShift;
   }
 
-  static inline bool hasImm(unsigned TSFlags) {
+  static inline bool hasImm(uint64_t TSFlags) {
     return (TSFlags & X86II::ImmMask) != 0;
   }
 
   /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
   /// of the specified instruction.
-  static inline unsigned getSizeOfImm(unsigned TSFlags) {
+  static inline unsigned getSizeOfImm(uint64_t TSFlags) {
     switch (TSFlags & X86II::ImmMask) {
     default: assert(0 && "Unknown immediate size");
     case X86II::Imm8:
     case X86II::Imm8PCRel:  return 1;
-    case X86II::Imm16:      return 2;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
     case X86II::Imm32:
     case X86II::Imm32PCRel: return 4;
     case X86II::Imm64:      return 8;
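
The VEX bits introduced above live at bit 32 and higher, which is why every TSFlags accessor in these hunks widens from unsigned to uint64_t; with a 32-bit parameter, a test such as TSFlags & VEX_4V would truncate the flags and always be false. A standalone illustration of the technique:

    #include <cstdint>

    enum LocalFlags : uint64_t {       // modeled on the TSFlags layout above
      VEX    = 1ULL << 32,
      VEX_W  = 1ULL << 33,
      VEX_4V = 1ULL << 34
    };

    static inline bool hasVEX_4V(uint64_t TSFlags) {
      return (TSFlags & VEX_4V) != 0;  // works: the mask survives in 64 bits
    }
    // With the old 'unsigned TSFlags' signature the argument itself would be
    // truncated to 32 bits, so this test could never see the VEX bits.
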
@@ -433,23 +484,77 @@ namespace X86II {
 
   /// isImmPCRel - Return true if the immediate of the specified instruction's
   /// TSFlags indicates that it is pc relative.
-  static inline unsigned isImmPCRel(unsigned TSFlags) {
+  static inline unsigned isImmPCRel(uint64_t TSFlags) {
     switch (TSFlags & X86II::ImmMask) {
-      default: assert(0 && "Unknown immediate size");
-      case X86II::Imm8PCRel:
-      case X86II::Imm32PCRel:
-        return true;
-      case X86II::Imm8:
-      case X86II::Imm16:
-      case X86II::Imm32:
-      case X86II::Imm64:
-        return false;
+    default: assert(0 && "Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  static inline int getMemoryOperandNo(uint64_t TSFlags) {
+    switch (TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form");
+    default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+      return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem: {
+      bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+      unsigned FirstMemOp = 1;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+
+      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
+      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      return FirstMemOp;
     }
-  }
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      return -1;
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+      return 0;
+    case X86II::MRM_C1:
+    case X86II::MRM_C2:
+    case X86II::MRM_C3:
+    case X86II::MRM_C4:
+    case X86II::MRM_C8:
+    case X86II::MRM_C9:
+    case X86II::MRM_E8:
+    case X86II::MRM_F0:
+    case X86II::MRM_F8:
+    case X86II::MRM_F9:
+      return -1;
+    }
+  }
 }
 
-const int X86AddrNumOperands = 5;
-
 inline static bool isScale(const MachineOperand &MO) {
   return MO.isImm() &&
     (MO.getImm() == 1 || MO.getImm() == 2 ||
@@ -555,7 +660,7 @@ public:
   void reMaterialize(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator MI,
                      unsigned DestReg, unsigned SubIdx,
                      const MachineInstr *Orig,
-                     const TargetRegisterInfo *TRI) const;
+                     const TargetRegisterInfo &TRI) const;
 
   /// convertToThreeAddress - This method must be implemented by targets that
   /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
@@ -585,13 +690,12 @@ public:
   virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
   virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                 MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond) const;
-  virtual bool copyRegToReg(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            unsigned DestReg, unsigned SrcReg,
-                            const TargetRegisterClass *DestRC,
-                            const TargetRegisterClass *SrcRC,
-                            DebugLoc DL) const;
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
   virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    unsigned SrcReg, bool isKill, int FrameIndex,
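
The X86::Addr* enum added above fixes the layout of every x86 memory reference as five consecutive operands. A standalone sketch of how a consumer indexes them once getMemoryOperandNo (or a hard-coded start index) has located the first one; the enum is copied locally for illustration and the operand type is invented:

    #include <cstdio>

    // Local copy of the layout: [Base + Scale*Index + Disp], Segment.
    enum {
      AddrBaseReg = 0, AddrScaleAmt = 1, AddrIndexReg = 2,
      AddrDisp = 3, AddrSegmentReg = 4, AddrNumOperands = 5
    };

    struct Operand { long Value; };

    // MemOpNo is the index of the first memory sub-operand, or -1.
    static void printAddress(const Operand *Ops, int MemOpNo) {
      if (MemOpNo < 0) return;                 // no memory reference
      const Operand *A = Ops + MemOpNo;
      std::printf("base=r%ld scale=%ld index=r%ld disp=%ld seg=%ld\n",
                  A[AddrBaseReg].Value, A[AddrScaleAmt].Value,
                  A[AddrIndexReg].Value, A[AddrDisp].Value,
                  A[AddrSegmentReg].Value);
    }
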
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0d59c42..1efef5a 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -72,6 +72,8 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
 
 def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
 def SDT_X86SegmentBaseAddress : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
 
 def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
@@ -182,6 +184,9 @@ def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 def X86MingwAlloca : SDNode<"X86ISD::MINGW_ALLOCA", SDTX86Void,
                             [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
 
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+                        []>;
+
 //===----------------------------------------------------------------------===//
 // X86 Operand Definitions.
 
@@ -197,13 +202,9 @@ def X86MemAsmOperand : AsmOperandClass {
   let Name = "Mem";
   let SuperClasses = [];
 }
-def X86NoSegMemAsmOperand : AsmOperandClass {
-  let Name = "NoSegMem";
-  let SuperClasses = [X86MemAsmOperand];
-}
 def X86AbsMemAsmOperand : AsmOperandClass {
   let Name = "AbsMem";
-  let SuperClasses = [X86NoSegMemAsmOperand];
+  let SuperClasses = [X86MemAsmOperand];
 }
 class X86MemOperand<string printMethod> : Operand<iPTR> {
   let PrintMethod = printMethod;
@@ -226,7 +227,7 @@ def f32mem  : X86MemOperand<"printf32mem">;
 def f64mem  : X86MemOperand<"printf64mem">;
 def f80mem  : X86MemOperand<"printf80mem">;
 def f128mem : X86MemOperand<"printf128mem">;
-//def f256mem : X86MemOperand<"printf256mem">;
+def f256mem : X86MemOperand<"printf256mem">;
 
 // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
 // plain GR64, so that it doesn't potentially require a REX prefix.
@@ -245,15 +246,11 @@ def i32mem_TC : Operand<i32> {
   let ParserMatchClass = X86MemAsmOperand;
 }
 
-def lea32mem : Operand<i32> {
-  let PrintMethod = "printlea32mem";
-  let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
-  let ParserMatchClass = X86NoSegMemAsmOperand;
-}
 
 let ParserMatchClass = X86AbsMemAsmOperand,
     PrintMethod = "print_pcrel_imm" in {
 def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
 
 def offset8 : Operand<i64>;
 def offset16 : Operand<i64>;
@@ -283,26 +280,31 @@ class ImmSExtAsmOperandClass : AsmOperandClass {
 // 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
 // (which will be a -1ULL), and "0xFF" (-1 in 16-bits).
 
-// [0, 0x7FFFFFFF] | [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+// [0, 0x7FFFFFFF] |
+//   [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
   let Name = "ImmSExti64i32";
 }
 
-// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
   let Name = "ImmSExti16i8";
   let SuperClasses = [ImmSExti64i32AsmOperand];
 }
 
-// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
   let Name = "ImmSExti32i8";
 }
 
-// [0, 0x0000007F] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+// [0, 0x0000007F] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
   let Name = "ImmSExti64i8";
-  let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, ImmSExti64i32AsmOperand];
+  let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+                      ImmSExti64i32AsmOperand];
 }
 
 // A couple of more descriptive operand definitions.
@@ -321,10 +323,10 @@ def i32i8imm  : Operand<i32> {
 
 // Define X86 specific addressing mode.
 def addr      : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
-def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
                                [add, sub, mul, X86mul_imm, shl, or, frameindex],
                                []>;
-def tls32addr : ComplexPattern<i32, 4, "SelectTLSADDRAddr",
+def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
                                [tglobaltlsaddr], []>;
 
 //===----------------------------------------------------------------------===//
@@ -704,6 +706,12 @@ let isCall = 1 in
                        "lcall{w}\t{*}$dst", []>, OpSize;
     def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
                        "lcall{l}\t{*}$dst", []>;
+
+    // callw for 16 bit code for the assembler.
+    let isAsmParserOnly = 1 in
+      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                       (outs), (ins i16imm_pcrel:$dst, variable_ops),
+                       "callw\t$dst", []>, OpSize;
   }
 
 // Constructing a stack frame.
@@ -737,18 +745,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
                    "jmp\t$dst  # TAILCALL",
                    []>;
   def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
-                   "jmp{l}\t{*}$dst  # TAILCALL",
-                   []>;
+                   "", []>;  // FIXME: Remove encoding when JIT is dead.
   let mayLoad = 1 in
   def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops),
                    "jmp{l}\t{*}$dst  # TAILCALL", []>;
-
-  // FIXME: This is a hack so that MCInst lowering can preserve the TAILCALL
-  // marker on instructions, while still being able to relax.
-  let isCodeGenOnly = 1 in {
-    def TAILJMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
-                         "jmp\t$dst  # TAILCALL", []>;
-  }
 }
 
//===----------------------------------------------------------------------===//
@@ -815,7 +815,18 @@ def PUSHF32   : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
                 Requires<[In32BitMode]>;
 }
 
-let isTwoAddress = 1 in  // GR32 = bswap GR32
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+    mayLoad=1, neverHasSideEffects=1 in {
+def POPA32   : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>,
+               Requires<[In32BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+    mayStore=1, neverHasSideEffects=1 in {
+def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>,
+               Requires<[In32BitMode]>;
+}
+
+let Uses = [EFLAGS], Constraints = "$src = $dst" in  // GR32 = bswap GR32
 def BSWAP32r : I<0xC8, AddRegFrm,
                  (outs GR32:$dst), (ins GR32:$src),
                  "bswap{l}\t$dst",
@@ -855,11 +866,11 @@ def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
 
 let neverHasSideEffects = 1 in
 def LEA16r   : I<0x8D, MRMSrcMem,
-                 (outs GR16:$dst), (ins lea32mem:$src),
+                 (outs GR16:$dst), (ins i32mem:$src),
                  "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
 let isReMaterializable = 1 in
 def LEA32r   : I<0x8D, MRMSrcMem,
-                 (outs GR32:$dst), (ins lea32mem:$src),
+                 (outs GR32:$dst), (ins i32mem:$src),
                  "lea{l}\t{$src|$dst}, {$dst|$src}",
                  [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
 
@@ -1239,7 +1250,7 @@ def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
 //===----------------------------------------------------------------------===//
 //  Two address Instructions.
 //
-let isTwoAddress = 1 in {
+let Constraints = "$src1 = $dst" in {
 
 // Conditional moves
 let Uses = [EFLAGS] in {
@@ -1640,7 +1651,7 @@ def CMOVNO32rm : I<0x41, MRMSrcMem,       // if !overflow, GR32 = [mem32]
 // i8 register pressure. Note that CMOV_GR8 is conservatively considered to
 // clobber EFLAGS, because if one of the operands is zero, the expansion
 // could involve an xor.
-let usesCustomInserter = 1, isTwoAddress = 0, Defs = [EFLAGS] in {
+let usesCustomInserter = 1, Constraints = "", Defs = [EFLAGS] in {
 def CMOV_GR8 : I<0, Pseudo,
                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
                  "#CMOV_GR8 PSEUDO!",
@@ -1659,86 +1670,106 @@ def CMOV_GR16 : I<0, Pseudo,
                     [(set GR16:$dst,
                       (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
 def CMOV_RFP32 : I<0, Pseudo,
-                    (outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+                    (outs RFP32:$dst),
+                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
                     "#CMOV_RFP32 PSEUDO!",
-                    [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+                    [(set RFP32:$dst,
+                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
                                                   EFLAGS))]>;
 def CMOV_RFP64 : I<0, Pseudo,
-                    (outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
+                    (outs RFP64:$dst),
+                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
                     "#CMOV_RFP64 PSEUDO!",
-                    [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
+                    [(set RFP64:$dst,
+                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
                                                   EFLAGS))]>;
 def CMOV_RFP80 : I<0, Pseudo,
-                    (outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
+                    (outs RFP80:$dst),
+                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
                     "#CMOV_RFP80 PSEUDO!",
-                    [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
+                    [(set RFP80:$dst,
+                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
                                                   EFLAGS))]>;
 } // Predicates = [NoCMov]
-} // UsesCustomInserter = 1, isTwoAddress = 0, Defs = [EFLAGS]
+} // UsesCustomInserter = 1, Constraints = "", Defs = [EFLAGS]
 } // Uses = [EFLAGS]
 
 
 // unary instructions
 let CodeSize = 2 in {
 let Defs = [EFLAGS] in {
-def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src), "neg{b}\t$dst",
-               [(set GR8:$dst, (ineg GR8:$src)),
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src1)),
                 (implicit EFLAGS)]>;
-def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src), "neg{w}\t$dst",
-               [(set GR16:$dst, (ineg GR16:$src)),
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+               "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src1)),
                 (implicit EFLAGS)]>, OpSize;
-def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src), "neg{l}\t$dst",
-               [(set GR32:$dst, (ineg GR32:$src)),
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+               "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src1)),
                 (implicit EFLAGS)]>;
-let isTwoAddress = 0 in {
-  def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst",
+
+let Constraints = "" in {
+  def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+                 "neg{b}\t$dst",
                  [(store (ineg (loadi8 addr:$dst)), addr:$dst),
                   (implicit EFLAGS)]>;
-  def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst",
+  def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+                 "neg{w}\t$dst",
                  [(store (ineg (loadi16 addr:$dst)), addr:$dst),
                   (implicit EFLAGS)]>, OpSize;
-  def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst",
+  def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+                 "neg{l}\t$dst",
                  [(store (ineg (loadi32 addr:$dst)), addr:$dst),
                   (implicit EFLAGS)]>;
-}
+} // Constraints = ""
 } // Defs = [EFLAGS]
 
 // Match xor -1 to not. Favors these over a move imm + xor to save code size.
 let AddedComplexity = 15 in {
-def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
-               [(set GR8:$dst, (not GR8:$src))]>;
-def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
-               [(set GR16:$dst, (not GR16:$src))]>, OpSize;
-def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
-               [(set GR32:$dst, (not GR32:$src))]>;
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src1))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+               "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src1))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+               "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src1))]>;
 }
-let isTwoAddress = 0 in {
-  def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
+let Constraints = "" in {
+  def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+                 "not{b}\t$dst",
                  [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
-  def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst",
+  def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+                 "not{w}\t$dst",
                  [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
-  def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst",
+  def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+                 "not{l}\t$dst",
                  [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
-}
+} // Constraints = ""
 } // CodeSize
 
 // TODO: inc/dec is slow for P4, but fast for Pentium-M.
 let Defs = [EFLAGS] in {
 let CodeSize = 2 in
-def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst",
-               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src))]>;
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "inc{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>;
+
 let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
-def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
                "inc{w}\t$dst",
-               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>,
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>,
              OpSize, Requires<[In32BitMode]>;
-def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
                "inc{l}\t$dst",
-               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>,
+               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>,
              Requires<[In32BitMode]>;
 }
-let isTwoAddress = 0, CodeSize = 2 in {
+let Constraints = "", CodeSize = 2 in {
   def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
                [(store (add (loadi8 addr:$dst), 1), addr:$dst),
                 (implicit EFLAGS)]>;
@@ -1750,23 +1781,24 @@ let isTwoAddress = 0, CodeSize = 2 in {
                [(store (add (loadi32 addr:$dst), 1), addr:$dst),
                 (implicit EFLAGS)]>,
                Requires<[In32BitMode]>;
-}
+} // Constraints = "", CodeSize = 2
 
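
The isConvertibleToThreeAddress flag kept on these defs lets the two-address pass rewrite an inc or dec into an LEA when tying the destination to the source would force a copy; a rough illustration (not LLVM code):

    // Two-address INC ties dst to src, so a still-live source needs a copy:
    //     movl %esi, %eax
    //     incl %eax               ; also clobbers EFLAGS
    // The three-address LEA form needs no copy and leaves EFLAGS alone:
    //     leal 1(%esi), %eax
    int increment(int v) { return v + 1; }  // either encoding computes this
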
 let CodeSize = 2 in
-def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst",
-               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src))]>;
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "dec{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>;
 let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
-def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
                "dec{w}\t$dst",
-               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>,
+               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>,
              OpSize, Requires<[In32BitMode]>;
-def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
                "dec{l}\t$dst",
-               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>,
+               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>,
              Requires<[In32BitMode]>;
-}
+} // CodeSize = 2
 
-let isTwoAddress = 0, CodeSize = 2 in {
+let Constraints = "", CodeSize = 2 in {
   def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
                [(store (add (loadi8 addr:$dst), -1), addr:$dst),
                 (implicit EFLAGS)]>;
@@ -1778,7 +1810,7 @@ let isTwoAddress = 0, CodeSize = 2 in {
                [(store (add (loadi32 addr:$dst), -1), addr:$dst),
                 (implicit EFLAGS)]>,
                Requires<[In32BitMode]>;
-}
+} // Constraints = "", CodeSize = 2
 } // Defs = [EFLAGS]
 
 // Logical operators...
@@ -1857,7 +1889,7 @@ def AND32ri8 : Ii8<0x83, MRM4r,
                    [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1,
                                                          i32immSExt8:$src2))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def AND8mr   : I<0x20, MRMDestMem,
                    (outs), (ins i8mem :$dst, GR8 :$src),
                    "and{b}\t{$src, $dst|$dst, $src}",
@@ -1909,7 +1941,7 @@ let isTwoAddress = 0 in {
 
   def AND32i32 : Ii32<0x25, RawFrm, (outs), (ins i32imm:$src),
                       "and{l}\t{$src, %eax|%eax, $src}", []>;
-}
+} // Constraints = ""
 
 let isCommutable = 1 in {   // X = OR Y, Z   --> X = OR Z, Y
@@ -1983,7 +2015,7 @@ def OR32ri8  : Ii8<0x83, MRM1r, (outs GR32:$dst),
                    "or{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,
                                                         i32immSExt8:$src2))]>;
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def OR8mr    : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
                    "or{b}\t{$src, $dst|$dst, $src}",
                    [(store (or (load addr:$dst), GR8:$src), addr:$dst),
@@ -2025,7 +2057,7 @@ let isTwoAddress = 0 in {
                       "or{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def OR32i32 : Ii32 <0x0D, RawFrm, (outs), (ins i32imm:$src),
                       "or{l}\t{$src, %eax|%eax, $src}", []>;
-} // isTwoAddress = 0
+} // Constraints = ""
 
 let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
@@ -2102,7 +2134,7 @@ def XOR32ri8 : Ii8<0x83, MRM6r,
                    [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1,
                                                          i32immSExt8:$src2))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def XOR8mr   : I<0x30, MRMDestMem,
                    (outs), (ins i8mem :$dst, GR8 :$src),
                    "xor{b}\t{$src, $dst|$dst, $src}",
@@ -2153,26 +2185,27 @@ let isTwoAddress = 0 in {
                       "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def XOR32i32 : Ii32<0x35, RawFrm, (outs), (ins i32imm:$src),
                       "xor{l}\t{$src, %eax|%eax, $src}", []>;
-} // isTwoAddress = 0
+} // Constraints = ""
 } // Defs = [EFLAGS]
 
 // Shift instructions
 let Defs = [EFLAGS] in {
 let Uses = [CL] in {
-def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src),
+def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "shl{b}\t{%cl, $dst|$dst, CL}",
-                 [(set GR8:$dst, (shl GR8:$src, CL))]>;
-def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src),
+                 [(set GR8:$dst, (shl GR8:$src1, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
                  "shl{w}\t{%cl, $dst|$dst, CL}",
-                 [(set GR16:$dst, (shl GR16:$src, CL))]>, OpSize;
-def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
+                 [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
                  "shl{l}\t{%cl, $dst|$dst, CL}",
-                 [(set GR32:$dst, (shl GR32:$src, CL))]>;
+                 [(set GR32:$dst, (shl GR32:$src1, CL))]>;
 } // Uses = [CL]
 
 def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
                    "shl{b}\t{$src2, $dst|$dst, $src2}",
                    [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+
 let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
 def SHL16ri  : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
                    "shl{w}\t{$src2, $dst|$dst, $src2}",
@@ -2193,7 +2226,7 @@ def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
 
 } // isConvertibleToThreeAddress = 1
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
                    "shl{b}\t{%cl, $dst|$dst, CL}",
@@ -2227,18 +2260,18 @@ let isTwoAddress = 0 in {
   def SHL32m1  : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
                    "shl{l}\t$dst",
                  [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
-}
+} // Constraints = ""
 
 let Uses = [CL] in {
-def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src),
+def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "shr{b}\t{%cl, $dst|$dst, CL}",
-                 [(set GR8:$dst, (srl GR8:$src, CL))]>;
-def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src),
+                 [(set GR8:$dst, (srl GR8:$src1, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
                  "shr{w}\t{%cl, $dst|$dst, CL}",
-                 [(set GR16:$dst, (srl GR16:$src, CL))]>, OpSize;
-def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src),
+                 [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
                  "shr{l}\t{%cl, $dst|$dst, CL}",
-                 [(set GR32:$dst, (srl GR32:$src, CL))]>;
+                 [(set GR32:$dst, (srl GR32:$src1, CL))]>;
 }
 
 def SHR8ri   : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
@@ -2262,7 +2295,7 @@ def SHR32r1  : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
                  "shr{l}\t$dst",
                  [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
                    "shr{b}\t{%cl, $dst|$dst, CL}",
@@ -2296,18 +2329,18 @@ let isTwoAddress = 0 in {
   def SHR32m1  : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
                    "shr{l}\t$dst",
                  [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
-}
+} // Constraints = ""
 
 let Uses = [CL] in {
-def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src),
+def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "sar{b}\t{%cl, $dst|$dst, CL}",
-                 [(set GR8:$dst, (sra GR8:$src, CL))]>;
-def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src),
+                 [(set GR8:$dst, (sra GR8:$src1, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
                  "sar{w}\t{%cl, $dst|$dst, CL}",
-                 [(set GR16:$dst, (sra GR16:$src, CL))]>, OpSize;
-def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src),
+                 [(set GR16:$dst, (sra GR16:$src1, CL))]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
                  "sar{l}\t{%cl, $dst|$dst, CL}",
-                 [(set GR32:$dst, (sra GR32:$src, CL))]>;
+                 [(set GR32:$dst, (sra GR32:$src1, CL))]>;
 }
 
 def SAR8ri   : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
@@ -2332,7 +2365,7 @@ def SAR32r1  : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
                  "sar{l}\t$dst",
                  [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
                    "sar{b}\t{%cl, $dst|$dst, CL}",
@@ -2366,65 +2399,65 @@ let isTwoAddress = 0 in {
   def SAR32m1  : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
                    "sar{l}\t$dst",
                  [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
-}
+} // Constraints = ""
 
 // Rotate instructions
 
-def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
                "rcl{b}\t{1, $dst|$dst, 1}", []>;
 let Uses = [CL] in {
-def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
                 "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
 }
-def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
                  "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
 
-def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
 let Uses = [CL] in {
-def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
                  "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
 }
-def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
                   "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
 
-def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
                 "rcl{l}\t{1, $dst|$dst, 1}", []>;
 let Uses = [CL] in {
-def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
                  "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
 }
-def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
                   "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
 
-def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
               "rcr{b}\t{1, $dst|$dst, 1}", []>;
 let Uses = [CL] in {
-def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
                 "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
 }
-def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
                  "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
 
-def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
 let Uses = [CL] in {
-def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
                  "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
 }
-def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
                   "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
 
-def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
                 "rcr{l}\t{1, $dst|$dst, 1}", []>;
 let Uses = [CL] in {
-def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
                  "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
 }
-def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
                   "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
                  "rcl{b}\t{1, $dst|$dst, 1}", []>;
   def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
@@ -2464,19 +2497,19 @@ def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
 def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
                  "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
 }
-}
+} // Constraints = ""
 
 // FIXME: provide shorter instructions when imm8 == 1
 let Uses = [CL] in {
-def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
+def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "rol{b}\t{%cl, $dst|$dst, CL}",
-                 [(set GR8:$dst, (rotl GR8:$src, CL))]>;
-def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+                 [(set GR8:$dst, (rotl GR8:$src1, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
                  "rol{w}\t{%cl, $dst|$dst, CL}",
-                 [(set GR16:$dst, (rotl GR16:$src, CL))]>, OpSize;
-def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+                 [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
                  "rol{l}\t{%cl, $dst|$dst, CL}",
-                 [(set GR32:$dst, (rotl GR32:$src, CL))]>;
+                 [(set GR32:$dst, (rotl GR32:$src1, CL))]>;
 }
 
 def ROL8ri   : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
@@ -2501,7 +2534,7 @@ def ROL32r1  : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
                  "rol{l}\t$dst",
                  [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
                    "rol{b}\t{%cl, $dst|$dst, CL}",
@@ -2535,18 +2568,18 @@ let isTwoAddress = 0 in {
   def ROL32m1  : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
                    "rol{l}\t$dst",
                   [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
-}
+} // Constraints = ""
 
 let Uses = [CL] in {
-def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src),
+def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
                  "ror{b}\t{%cl, $dst|$dst, CL}",
-                 [(set GR8:$dst, (rotr GR8:$src, CL))]>;
-def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+                 [(set GR8:$dst, (rotr GR8:$src1, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
                  "ror{w}\t{%cl, $dst|$dst, CL}",
-                 [(set GR16:$dst, (rotr GR16:$src, CL))]>, OpSize;
-def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+                 [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                  "ror{l}\t{%cl, $dst|$dst, CL}",
-                 [(set GR32:$dst, (rotr GR32:$src, CL))]>;
+                 [(set GR32:$dst, (rotr GR32:$src1, CL))]>;
 }
 
 def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
@@ -2571,7 +2604,7 @@ def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                  "ror{l}\t$dst",
                  [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
                    "ror{b}\t{%cl, $dst|$dst, CL}",
@@ -2605,8 +2638,7 @@ let isTwoAddress = 0 in {
   def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
                    "ror{l}\t$dst",
                   [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
-}
-
+} // Constraints = ""
 
 // Double shift instructions (generalizations of rotate)
@@ -2662,7 +2694,7 @@ def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
                      TB, OpSize;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   let Uses = [CL] in {
   def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                      "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
@@ -2708,7 +2740,7 @@ let isTwoAddress = 0 in {
                        [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
                                         (i8 imm:$src3)), addr:$dst)]>,
                        TB, OpSize;
-}
+} // Constraints = ""
 } // Defs = [EFLAGS]
 
@@ -2794,7 +2826,7 @@ def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst),
                                            (X86add_flag GR32:$src1,
                                                         i32immSExt8:$src2))]>;
 }
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   // Memory-Register Addition
   def ADD8mr   : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                    "add{b}\t{$src2, $dst|$dst, $src2}",
@@ -2838,7 +2870,7 @@ let isTwoAddress = 0 in {
                       "add{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def ADD32i32 : Ii32<0x05, RawFrm, (outs), (ins i32imm:$src),
                       "add{l}\t{$src, %eax|%eax, $src}", []>;
-}
+} // Constraints = ""
 
 let Uses = [EFLAGS] in {
 let isCommutable = 1 in {  // X = ADC Y, Z --> X = ADC Z, Y
@@ -2900,7 +2932,7 @@ def ADC32ri8 : Ii8<0x83, MRM2r, (outs GR32:$dst),
                    "adc{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def ADC8mr   : I<0x10, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                    "adc{b}\t{$src2, $dst|$dst, $src2}",
                    [(store (adde (load addr:$dst), GR8:$src2), addr:$dst)]>;
@@ -2935,7 +2967,7 @@ let isTwoAddress = 0 in {
                       "adc{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def ADC32i32 : Ii32<0x15, RawFrm, (outs), (ins i32imm:$src),
                       "adc{l}\t{$src, %eax|%eax, $src}", []>;
-}
+} // Constraints = ""
 } // Uses = [EFLAGS]
 
 // Register-Register Subtraction
@@ -3007,7 +3039,7 @@ def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst),
                    [(set GR32:$dst, EFLAGS,
                          (X86sub_flag GR32:$src1, i32immSExt8:$src2))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   // Memory-Register Subtraction
   def SUB8mr   : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
                    "sub{b}\t{$src2, $dst|$dst, $src2}",
@@ -3052,7 +3084,7 @@ let isTwoAddress = 0 in {
                       "sub{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def SUB32i32 : Ii32<0x2D, RawFrm, (outs), (ins i32imm:$src),
                       "sub{l}\t{$src, %eax|%eax, $src}", []>;
-}
+} // Constraints = ""
 
 let Uses = [EFLAGS] in {
 def SBB8rr     : I<0x18, MRMDestReg, (outs GR8:$dst),
@@ -3068,7 +3100,7 @@ def SBB32rr    : I<0x19, MRMDestReg, (outs GR32:$dst),
                    "sbb{l}\t{$src2, $dst|$dst, $src2}",
                    [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
 
-let isTwoAddress = 0 in {
+let Constraints = "" in {
   def SBB8mr   : I<0x18, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                    "sbb{b}\t{$src2, $dst|$dst, $src2}",
                    [(store (sube (load addr:$dst), GR8:$src2), addr:$dst)]>;
@@ -3103,7 +3135,7 @@ let isTwoAddress = 0 in {
                       "sbb{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
   def SBB32i32 : Ii32<0x1D, RawFrm, (outs), (ins i32imm:$src),
                       "sbb{l}\t{$src, %eax|%eax, $src}", []>;
-}
+} // Constraints = ""
 
 let isCodeGenOnly = 1 in {
 def SBB8rr_REV : I<0x1A, MRMSrcReg, (outs GR8:$dst),
                    (ins GR8:$src1, GR8:$src2),
@@ -3811,6 +3843,7 @@ def MOV32r0  : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
 // Thread Local Storage Instructions
 //
 
+// ELF TLS Support
 // All calls clobber the non-callee saved registers. ESP is marked as
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
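
TLS_addr32 below prints the general-dynamic ELF TLS sequence, so in 32-bit PIC code every access to a __thread variable becomes a runtime call. A small illustration; the lowering shown in the comment is approximate:

    // Reading 'counter' in position-independent 32-bit ELF code lowers to
    // roughly:
    //     leal  counter@TLSGD(,%ebx,1), %eax
    //     call  ___tls_get_addr@PLT     ; &counter returned in %eax
    // which is the sequence the TLS_addr32 pseudo expands to.
    __thread int counter;

    int bump() { return ++counter; }
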
@@ -3819,12 +3852,24 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
     Uses = [ESP] in
-def TLS_addr32 : I<0, Pseudo, (outs), (ins lea32mem:$sym),
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                   "leal\t$sym, %eax; "
                   "call\t___tls_get_addr@PLT",
                   [(X86tlsaddr tls32addr:$sym)]>,
                   Requires<[In32BitMode]>;
 
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax.  %ecx is trashed during the function
+// call.  All other registers are preserved.
+let Defs = [EAX, ECX],
+    Uses = [ESP],
+    usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                "# TLSCall_32",
+                [(X86TLSCall addr:$sym)]>,
+                Requires<[In32BitMode]>;
+
 let AddedComplexity = 5, isCodeGenOnly = 1 in
 def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                    "movl\t%gs:$src, $dst",
@@ -4783,14 +4828,14 @@ def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>;
 // Patterns for nodes that do not produce flags, for instructions that do.
 
 // Increment reg.
-def : Pat<(add GR8:$src ,  1), (INC8r  GR8:$src)>;
-def : Pat<(add GR16:$src,  1), (INC16r GR16:$src)>, Requires<[In32BitMode]>;
-def : Pat<(add GR32:$src,  1), (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR8:$src1 ,  1), (INC8r  GR8:$src1)>;
+def : Pat<(add GR16:$src1,  1), (INC16r GR16:$src1)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src1,  1), (INC32r GR32:$src1)>, Requires<[In32BitMode]>;
 
 // Decrement reg.
-def : Pat<(add GR8:$src , -1), (DEC8r  GR8:$src)>;
-def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR8:$src1 , -1), (DEC8r  GR8:$src1)>;
+def : Pat<(add GR16:$src1, -1), (DEC16r GR16:$src1)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src1, -1), (DEC32r GR32:$src1)>, Requires<[In32BitMode]>;
 
 // or reg/reg.
 def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
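
TLSCall_32 models Darwin's thread-local mechanism: code materializes the address of a per-variable descriptor and calls through its first word, which returns the variable's address in %eax and trashes only %ecx, matching the Defs list above. A hedged sketch of the shape of that convention; the exact descriptor layout belongs to dyld:

    // Field layout shown for illustration; dyld owns the real definition.
    struct TLVDescriptor {
      void *(*thunk)(TLVDescriptor *self);  // returns the variable's address
      unsigned long key;                    // per-image pthread key
      unsigned long offset;                 // offset within the TLV block
    };

    // An access to a __thread variable compiles to roughly:
    //   movl  _var$tlv$descriptor, %eax    ; %eax = &descriptor
    //   call  *(%eax)                      ; &var in %eax, %ecx trashed
    static void *addressOf(TLVDescriptor *D) { return D->thunk(D); }
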
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 0952fc8..6cf7ac8 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -513,30 +513,20 @@ def : Pat<(store (v4i16 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 def : Pat<(store (v2i32 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-def : Pat<(store (v2f32 VR64:$src), addr:$dst),
-          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 def : Pat<(store (v1i64 VR64:$src), addr:$dst),
           (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
 
 // Bit convert.
 def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2f32 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8  VR64:$src))), (v2f32 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
 
@@ -545,8 +535,6 @@ def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
-          (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
@@ -555,8 +543,6 @@ def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
-def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
-          (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
 def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 5580ba7..ab0005b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -15,322 +15,6 @@
 
 //===----------------------------------------------------------------------===//
-// SSE specific DAG Nodes.
-//===----------------------------------------------------------------------===//
-
-def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
-                                            SDTCisFP<0>, SDTCisInt<2> ]>;
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
-                                       SDTCisFP<1>, SDTCisVT<3, i8>]>;
-
-def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
-def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
-def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
-                        [SDNPCommutative, SDNPAssociative]>;
-def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
-                        [SDNPCommutative, SDNPAssociative]>;
-def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
-                        [SDNPCommutative, SDNPAssociative]>;
-def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
-def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
-def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
-def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
-def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
-def X86pshufb  : SDNode<"X86ISD::PSHUFB",
-                 SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
-                                      SDTCisSameAs<0,2>]>>;
-def X86pextrb  : SDNode<"X86ISD::PEXTRB",
-                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
-def X86pextrw  : SDNode<"X86ISD::PEXTRW",
-                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
-def X86pinsrb  : SDNode<"X86ISD::PINSRB",
-                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
-                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86pinsrw  : SDNode<"X86ISD::PINSRW",
-                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
-                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86insrtps : SDNode<"X86ISD::INSERTPS",
-                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
-                                      SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
-def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
-                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
-def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
-                        [SDNPHasChain, SDNPMayLoad]>;
-def X86vshl    : SDNode<"X86ISD::VSHL",      SDTIntShiftOp>;
-def X86vshr    : SDNode<"X86ISD::VSRL",      SDTIntShiftOp>;
-def X86cmpps   : SDNode<"X86ISD::CMPPS",     SDTX86VFCMP>;
-def X86cmppd   : SDNode<"X86ISD::CMPPD",     SDTX86VFCMP>;
-def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
-def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
-def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
-def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
-
-def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
-                                          SDTCisVT<1, v4f32>,
-                                          SDTCisVT<2, v4f32>]>;
-def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
-
-//===----------------------------------------------------------------------===//
-// SSE Complex Patterns
-//===----------------------------------------------------------------------===//
-
-// These are 'extloads' from a scalar to the low element of a vector, zeroing
-// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
-// forms.
-def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
-                                  [SDNPHasChain, SDNPMayLoad]>;
-def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
-                                  [SDNPHasChain, SDNPMayLoad]>;
-
-def ssmem : Operand<v4f32> {
-  let PrintMethod = "printf32mem";
-  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
-}
-def sdmem : Operand<v2f64> {
-  let PrintMethod = "printf64mem";
-  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
-}
-
-//===----------------------------------------------------------------------===//
-// SSE pattern fragments
-//===----------------------------------------------------------------------===//
-
-def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
-def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
-def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
-def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
-
-// Like 'store', but always requires vector alignment.
-def alignedstore : PatFrag<(ops node:$val, node:$ptr),
-                           (store node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-// Like 'load', but always requires vector alignment.
-def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def alignedloadfsf32 : PatFrag<(ops node:$ptr),
-                               (f32 (alignedload node:$ptr))>;
-def alignedloadfsf64 : PatFrag<(ops node:$ptr),
-                               (f64 (alignedload node:$ptr))>;
-def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedload node:$ptr))>;
-def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedload node:$ptr))>;
-def alignedloadv4i32 : PatFrag<(ops node:$ptr),
-                               (v4i32 (alignedload node:$ptr))>;
-def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedload node:$ptr))>;
-
-// Like 'load', but uses special alignment checks suitable for use in
-// memory operands in most SSE instructions, which are required to
-// be naturally aligned on some targets but not on others.  If the subtarget
-// allows unaligned accesses, match any load, though this may require
-// setting a feature bit in the processor (on startup, for example).
-// Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return Subtarget->hasVectorUAMem()
-    || cast<LoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
-def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
-def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
-def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
-def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
-def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
-
-// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
-// 16-byte boundary.
-// FIXME: 8 byte alignment for mmx reads is not required
-def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 8;
-}]>;
-
-def memopv8i8  : PatFrag<(ops node:$ptr), (v8i8  (memop64 node:$ptr))>;
-def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
-def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
-def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
-
-// MOVNT Support
-// Like 'store', but requires the non-temporal bit to be set
-def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                           (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal();
-  return false;
-}]>;
-
-def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                                   (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
-           ST->getAddressingMode() == ISD::UNINDEXED &&
-           ST->getAlignment() >= 16;
-  return false;
-}]>;
-
-def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                                   (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal() &&
-           ST->getAlignment() < 16;
-  return false;
-}]>;
-
-def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
-def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
-def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
-def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
-def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
-def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
-
-def vzmovl_v2i64 : PatFrag<(ops node:$src),
-                           (bitconvert (v2i64 (X86vzmovl
-                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
-def vzmovl_v4i32 : PatFrag<(ops node:$src),
-                           (bitconvert (v4i32 (X86vzmovl
-                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
-
-def vzload_v2i64 : PatFrag<(ops node:$src),
-                           (bitconvert (v2i64 (X86vzload node:$src)))>;
-
-
-def fp32imm0 : PatLeaf<(f32 fpimm), [{
-  return N->isExactlyValue(+0.0);
-}]>;
-
-// BYTE_imm - Transform bit immediates into byte immediates.
-def BYTE_imm  : SDNodeXForm<imm, [{
-  // Transformation function: imm >> 3
-  return getI32Imm(N->getZExtValue() >> 3);
-}]>;
-
-// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
-// SHUFP* etc. imm.
-def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShuffleSHUFImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
-// PSHUFHW imm.
-def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
-// PSHUFLW imm.
-def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
-}]>;
-
-// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
-// a PALIGNR imm.
-def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePALIGNRImmediate(N)); -}]>; - -def splat_lo : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return SVOp->isSplat() && SVOp->getSplatIndex() == 0; -}]>; - -def movddup : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movhlps : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movlhps : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movlp : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movl : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movshdup : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movsldup : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckl : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckh : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def pshufd : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_shuf_imm>; - -def shufp : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_shuf_imm>; - -def pshufhw : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_pshufhw_imm>; - -def pshuflw : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_pshuflw_imm>; - -def palign : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_palign_imm>; - -//===----------------------------------------------------------------------===// // SSE scalar FP Instructions //===----------------------------------------------------------------------===// @@ -368,857 +52,642 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in { } 
//===----------------------------------------------------------------------===// -// SSE1 Instructions +// SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// -// Move Instructions. Register-to-register movss is not used for FR32 -// register copies because it's a partial register update; FsMOVAPSrr is -// used instead. Register-to-register movss is not modeled as an INSERT_SUBREG -// because INSERT_SUBREG requires that the insert be implementable in terms of -// a copy, and just mentioned, we don't use movss for copies. -let Constraints = "$src1 = $dst" in -def MOVSSrr : SSI<0x10, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $dst|$dst, $src2}", - [(set (v4f32 VR128:$dst), - (movl VR128:$src1, (scalar_to_vector FR32:$src2)))]>; +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + bit Is2Addr = 1> { + def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, RC:$src2))]>; + def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, mem_cpat:$src2))]>; +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class +multiclass 
sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+                                 string OpcodeStr, X86MemOperand x86memop,
+                                 list<dag> pat_rr, list<dag> pat_rm,
+                                 bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+     !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+     pat_rr, d>;
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+     !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+     pat_rm, d>;
+}
+
+/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
+multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                           string asm, string SSEVer, string FPSizeStr,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           Domain d, bit Is2Addr = 1> {
+  def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse",
+                                  !strconcat(SSEVer, !strconcat("_",
+                                  !strconcat(OpcodeStr, FPSizeStr))))
+             RC:$src1, RC:$src2))], d>;
+  def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse",
+                                  !strconcat(SSEVer, !strconcat("_",
+                                  !strconcat(OpcodeStr, FPSizeStr))))
+             RC:$src1, (mem_frag addr:$src2)))], d>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Instructions
+//===----------------------------------------------------------------------===//
+
+class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> :
+  SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
+      [(set (vt VR128:$dst), (movl VR128:$src1, (scalar_to_vector RC:$src2)))]>;
+
+// Loading from memory automatically zeroing upper bits.
+class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+                    PatFrag mem_pat, string OpcodeStr> :
+  SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set RC:$dst, (mem_pat addr:$src))]>;
+
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and, as just mentioned, we don't use movss/movsd for
+// copies.
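To make the Is2Addr selection in these classes concrete, here is a rough
expansion sketch. The instantiation is hypothetical (the name FADD, opcode
0x58, fadd, FR32 and f32mem are chosen only for illustration; the patch's
real instantiations appear further down). A defm such as

  defm FADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem>;

would, with the default Is2Addr = 1, produce roughly these two records (the
rr form additionally carries isCommutable = 1 from the surrounding let):

  def FADDrr : SI<0x58, MRMSrcReg, (outs FR32:$dst),
                  (ins FR32:$src1, FR32:$src2),
                  "addss\t{$src2, $dst|$dst, $src2}",
                  [(set FR32:$dst, (fadd FR32:$src1, FR32:$src2))]>;
  def FADDrm : SI<0x58, MRMSrcMem, (outs FR32:$dst),
                  (ins FR32:$src1, f32mem:$src2),
                  "addss\t{$src2, $dst|$dst, $src2}",
                  [(set FR32:$dst, (fadd FR32:$src1, (load addr:$src2)))]>;

Passing Is2Addr = 0 instead selects the three-operand
"\t{$src2, $src1, $dst|$dst, $src1, $src2}" asm template used by the
VEX-encoded variants.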
+let isAsmParserOnly = 1 in { + def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; + def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; + + let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; + + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; + } +} + +let Constraints = "$src1 = $dst" in { + def MOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $dst|$dst, $src2}">, XS; + def MOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $dst|$dst, $src2}">, XD; +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + let AddedComplexity = 20 in + def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; +} + +let AddedComplexity = 15 in { // Extract the low 32-bit value from one vector and insert it into another. -let AddedComplexity = 15 in def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), (MOVSSrr (v4f32 VR128:$src1), (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; +// Extract the low 64-bit value from one vector and insert it into another. +def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; +} // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>; +// Implicitly promote a 64-bit scalar to a vector. +def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>; -// Loading from memory automatically zeroing upper bits. -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - "movss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (loadf32 addr:$src))]>; - +let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. -let AddedComplexity = 20 in { def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; +// MOVSDrm zeros the high parts of the register; represent this +// with SUBREG_TO_REG. +def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; +def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; +def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; +def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; +def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; } // Store scalar value to memory. 
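As a cross-check on the refactoring, hand-resolving the new MOVSSrm
instantiation through the sse12_move_rm class above gives (a sketch; the
prefix and predicate details live in the SI base class plus the trailing XS):

  def MOVSSrm : SI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                   "movss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (loadf32 addr:$src))]>, XS;

which, modulo the SSI vs. SI + XS spelling of the prefix, is the hand-written
MOVSSrm definition this patch deletes, so the multiclass rewrite is
behavior-preserving for the scalar loads.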
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>; +def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>; + +let isAsmParserOnly = 1 in { +def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), + "movss\t{$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>, XS, VEX_4V; +def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>, XD, VEX_4V; +} // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (MOVSSmr addr:$dst, (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; +def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; -// Conversion instructions -def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), - "cvttss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (fp_to_sint FR32:$src))]>; -def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src), - "cvttss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>; -def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), - "cvtsi2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (sint_to_fp GR32:$src))]>; -def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "cvtsi2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>; - -// Match intrinsics which expect XMM operand(s). -def CVTSS2SIrr: SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), - "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>; -def CVTSS2SIrm: SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src), - "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>; - -def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "cvtss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>; -def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src), - "cvtss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse_cvtss2si - (load addr:$src)))]>; - -// Match intrinsics which expect MM and XMM operand(s). 
-def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), - "cvtps2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>; -def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src), - "cvtps2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvtps2pi - (load addr:$src)))]>; -def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), - "cvttps2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>; -def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src), - "cvttps2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvttps2pi - (load addr:$src)))]>; -let Constraints = "$src1 = $dst" in { - def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR64:$src2), - "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1, - VR64:$src2))]>; - def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2), - "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1, - (load addr:$src2)))]>; -} - -// Aliases for intrinsics -def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "cvttss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, - (int_x86_sse_cvttss2si VR128:$src))]>; -def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src), - "cvttss2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, - (int_x86_sse_cvttss2si(load addr:$src)))]>; - -let Constraints = "$src1 = $dst" in { - def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, GR32:$src2), - "cvtsi2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, - GR32:$src2))]>; - def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2), - "cvtsi2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, - (loadi32 addr:$src2)))]>; -} - -// Comparison instructions -let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { - def CMPSSrr : SSIi8<0xC2, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc), - "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>; -let mayLoad = 1 in - def CMPSSrm : SSIi8<0xC2, MRMSrcMem, - (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc), - "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>; - - // Accept explicit immediate argument form instead of comparison code. 
-let isAsmParserOnly = 1 in { - def CMPSSrr_alt : SSIi8<0xC2, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2), - "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>; -let mayLoad = 1 in - def CMPSSrm_alt : SSIi8<0xC2, MRMSrcMem, - (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2), - "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>; -} -} - -let Defs = [EFLAGS] in { -def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2), - "ucomiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86cmp FR32:$src1, FR32:$src2))]>; -def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2), - "ucomiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86cmp FR32:$src1, (loadf32 addr:$src2)))]>; - -def COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "comiss\t{$src2, $src1|$src1, $src2}", []>; -def COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "comiss\t{$src2, $src1|$src1, $src2}", []>; - -} // Defs = [EFLAGS] - -// Aliases to match intrinsics which expect XMM operand(s). -let Constraints = "$src1 = $dst" in { - def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), - (ins VR128:$src1, VR128:$src, SSECC:$cc), - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cmp_ss - VR128:$src1, - VR128:$src, imm:$cc))]>; - def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src, SSECC:$cc), - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1, - (load addr:$src), imm:$cc))]>; -} - -let Defs = [EFLAGS] in { -def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "ucomiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1), - VR128:$src2))]>; -def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2), - "ucomiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1), - (load addr:$src2)))]>; - -def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "comiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86comi (v4f32 VR128:$src1), - VR128:$src2))]>; -def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "comiss\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86comi (v4f32 VR128:$src1), - (load addr:$src2)))]>; -} // Defs = [EFLAGS] - -// Aliases of packed SSE1 instructions for scalar use. These all have names -// that start with 'Fs'. - -// Alias instructions that map fld0 to pxor for sse. -let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, - canFoldAsLoad = 1 in - // FIXME: Set encoding to pseudo! -def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasSSE1]>, TB, OpSize; - -// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are -// disregarded. +// Move Aligned/Unaligned floating point values +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + bit IsReMaterializable = 1> { let neverHasSideEffects = 1 in -def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movaps\t{$src, $dst|$dst, $src}", []>; - -// Alias instruction to load FR32 from f128mem using movaps. Upper bits are -// disregarded. 
-let canFoldAsLoad = 1, isReMaterializable = 1 in -def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), - "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; - -// Alias bitwise logical operations using SSE logical ops on packed FP values. -let Constraints = "$src1 = $dst" in { -let isCommutable = 1 in { - def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - "andps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>; - def FsORPSrr : PSI<0x56, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - "orps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>; - def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - "xorps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>; -} - -def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1, f128mem:$src2), - "andps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86fand FR32:$src1, - (memopfsf32 addr:$src2)))]>; -def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1, f128mem:$src2), - "orps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86for FR32:$src1, - (memopfsf32 addr:$src2)))]>; -def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1, f128mem:$src2), - "xorps\t{$src2, $dst|$dst, $src2}", - [(set FR32:$dst, (X86fxor FR32:$src1, - (memopfsf32 addr:$src2)))]>; - -let neverHasSideEffects = 1 in { -def FsANDNPSrr : PSI<0x55, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - "andnps\t{$src2, $dst|$dst, $src2}", []>; -let mayLoad = 1 in -def FsANDNPSrm : PSI<0x55, MRMSrcMem, - (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2), - "andnps\t{$src2, $dst|$dst, $src2}", []>; -} -} - -/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms. -/// -/// In addition, we also have a special variant of the scalar form here to -/// represent the associated intrinsic operation. This form is unlike the -/// plain scalar form, in that it takes an entire vector (instead of a scalar) -/// and leaves the top elements unmodified (therefore these cannot be commuted). -/// -/// These three forms can each be reg+reg or reg+mem, so there are a total of -/// six "instructions". -/// -let Constraints = "$src1 = $dst" in { -multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, - bit Commutable = 0> { - // Scalar operation, reg+reg. - def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> { - let isCommutable = Commutable; - } - - // Scalar operation, reg+mem. - def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1, f32mem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; - - // Vector operation, reg+reg. - def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } - - // Vector operation, reg+mem. 
- def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>; - - // Intrinsic operation, reg+reg. - def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]>; - - // Intrinsic operation, reg+mem. - def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F32Int VR128:$src1, - sse_load_f32:$src2))]>; + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>; +let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], d>; } -} - -// Arithmetic instructions -defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>; -defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>; -defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>; -defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>; -/// sse1_fp_binop_rm - Other SSE1 binops -/// -/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of -/// instructions for a full-vector intrinsic form. Operations that map -/// onto C operators don't use this form since they just use the plain -/// vector form instead of having a separate vector intrinsic form. -/// -/// This provides a total of eight "instructions". -/// -let Constraints = "$src1 = $dst" in { -multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, - Intrinsic F32Int, - Intrinsic V4F32Int, - bit Commutable = 0> { - - // Scalar operation, reg+reg. - def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> { - let isCommutable = Commutable; - } - - // Scalar operation, reg+mem. - def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1, f32mem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; - - // Vector operation, reg+reg. - def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } - - // Vector operation, reg+mem. - def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>; - - // Intrinsic operation, reg+reg. - def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } - - // Intrinsic operation, reg+mem. 
- def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F32Int VR128:$src1, - sse_load_f32:$src2))]>; - - // Vector intrinsic operation, reg+reg. - def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } - - // Vector intrinsic operation, reg+mem. - def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>; -} +let isAsmParserOnly = 1 in { +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle>, VEX; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble>, OpSize, VEX; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle>, VEX; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, 0>, OpSize, VEX; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, + "movaps", SSEPackedSingle>, VEX; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, + "movapd", SSEPackedDouble>, OpSize, VEX; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, + "movups", SSEPackedSingle>, VEX; +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, + "movupd", SSEPackedDouble, 0>, OpSize, VEX; } +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle>, TB; +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble>, TB, OpSize; +defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle>, TB; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, 0>, TB, OpSize; -defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax, - int_x86_sse_max_ss, int_x86_sse_max_ps>; -defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin, - int_x86_sse_min_ss, int_x86_sse_min_ps>; - -//===----------------------------------------------------------------------===// -// SSE packed FP Instructions - -// Move Instructions -let neverHasSideEffects = 1 in -def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movaps\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), +let isAsmParserOnly = 1 in { +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>; - + [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, VEX; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>, VEX; +def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>, VEX; +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, 
VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, VEX; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, VEX; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)]>, VEX; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; +} def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)]>; - -let neverHasSideEffects = 1 in -def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movups\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (loadv4f32 addr:$src))]>; +def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>; def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", [(store (v4f32 VR128:$src), addr:$dst)]>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>; -// Intrinsic forms of MOVUPS load and store +// Intrinsic forms of MOVUPS/D load and store +let isAsmParserOnly = 1 in { + let canFoldAsLoad = 1, isReMaterializable = 1 in + def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src), + "movups\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; + def VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; + def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; + def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; +} let canFoldAsLoad = 1, isReMaterializable = 1 in def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "movups\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; +def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; + def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>; +def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>; -let Constraints = "$src1 = $dst" in { - let AddedComplexity = 20 in { - def MOVLPSrm : PSI<0x12, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), - "movlps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (movlp VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 
addr:$src2))))))]>; - def MOVHPSrm : PSI<0x16, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), - "movhps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (movlhps VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>; - } // AddedComplexity -} // Constraints = "$src1 = $dst" - +// Move Low/High packed floating point values +multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, + PatFrag mov_frag, string base_opc, + string asm_opr> { + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(!strconcat(base_opc,"s"), asm_opr), + [(set RC:$dst, + (mov_frag RC:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], + SSEPackedSingle>, TB; + + def PDrm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f64mem:$src2), + !strconcat(!strconcat(base_opc,"d"), asm_opr), + [(set RC:$dst, (v2f64 (mov_frag RC:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + SSEPackedDouble>, TB, OpSize; +} -def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), - (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; +let isAsmParserOnly = 1, AddedComplexity = 20 in { + defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; + defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", + "\t{$src2, $dst|$dst, $src2}">; + defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", + "\t{$src2, $dst|$dst, $src2}">; +} +let isAsmParserOnly = 1 in { +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>, VEX; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +} def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)]>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>; // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
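One naming detail of sse12_mov_hilo_packed is worth spelling out: defm
concatenates its own prefix with the def names inside the multiclass, so the
non-VEX instantiations above regenerate the familiar instruction names.
Hand-substituting the MOVL case (a sketch, inheriting the Constraints and
AddedComplexity from the surrounding let):

  // defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
  //                                   "\t{$src2, $dst|$dst, $src2}">;
  // yields MOVLPSrm and MOVLPDrm; the PS record is roughly:
  def MOVLPSrm : PI<0x12, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                    "movlps\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (movlp VR128:$src1, (bc_v4f32 (v2f64 (scalar_to_vector
                                            (loadf64 addr:$src2))))))],
                    SSEPackedSingle>, TB;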
+let isAsmParserOnly = 1 in { +def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (unpckh (bc_v2f64 (v4f32 VR128:$src)), + (undef)), (iPTR 0))), addr:$dst)]>, + VEX; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (unpckh VR128:$src, (undef))), + (iPTR 0))), addr:$dst)]>, + VEX; +} def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (unpckh (bc_v2f64 (v4f32 VR128:$src)), (undef)), (iPTR 0))), addr:$dst)]>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (unpckh VR128:$src, (undef))), + (iPTR 0))), addr:$dst)]>; -let Constraints = "$src1 = $dst" in { -let AddedComplexity = 20 in { -def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - "movlhps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>; - -def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - "movhlps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>; -} // AddedComplexity -} // Constraints = "$src1 = $dst" +let isAsmParserOnly = 1, AddedComplexity = 20 in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>, + VEX_4V; + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>, + VEX_4V; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>; + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>; +} +def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; let AddedComplexity = 20 in { -def : Pat<(v4f32 (movddup VR128:$src, (undef))), - (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; -def : Pat<(v2i64 (movddup VR128:$src, (undef))), - (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; + def : Pat<(v4f32 (movddup VR128:$src, (undef))), + (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; + def : Pat<(v2i64 (movddup VR128:$src, (undef))), + (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; } +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set 
DstRC:$dst, (OpNode (ld_frag addr:$src)))]>; +} -// Arithmetic - -/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms. -/// -/// In addition, we also have a special variant of the scalar form here to -/// represent the associated intrinsic operation. This form is unlike the -/// plain scalar form, in that it takes an entire vector (instead of a -/// scalar) and leaves the top elements undefined. -/// -/// And, we have a special variant form for a full-vector intrinsic form. -/// -/// These four forms can each have a reg or a mem operand, so there are a -/// total of eight "instructions". -/// -multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, - Intrinsic F32Int, - Intrinsic V4F32Int, - bit Commutable = 0> { - // Scalar operation, reg. - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode FR32:$src))]> { - let isCommutable = Commutable; - } - - // Scalar operation, mem. - def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS, - Requires<[HasSSE1, OptForSize]>; +multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { + def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))], d>; + def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], d>; +} - // Vector operation, reg. - def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> { - let isCommutable = Commutable; - } +multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + asm, []>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), asm, []>; +} - // Vector operation, mem. - def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>; +let isAsmParserOnly = 1 in { +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}">, XS, + VEX_4V; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}">, XD, + VEX_4V; +} - // Intrinsic operation, reg. 
- def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int VR128:$src))]> { - let isCommutable = Commutable; - } +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { + def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], d>; + def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>; +} - // Intrinsic operation, mem. - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>; +} - // Vector intrinsic operation, reg - def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int VR128:$src))]> { - let isCommutable = Commutable; - } +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>; + def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>; +} - // Vector intrinsic operation, mem - def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>; +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } -// Square root. 
-defm SQRT  : sse1_fp_unop_rm<0x51, "sqrt",  fsqrt,
-                             int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+let isAsmParserOnly = 1 in {
+  defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                        f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}">, XS,
+                        VEX;
+  defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                        f128mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD,
+                        VEX;
+}
+defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                      f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}">, XS;
+defm Int_CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                      f128mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD;

-// Reciprocal approximations. Note that these typically require refinement
-// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
-                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
-defm RCP   : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
-                             int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;

-// Logical
 let Constraints = "$src1 = $dst" in {
-  let isCommutable = 1 in {
-    def ANDPSrr : PSI<0x54, MRMSrcReg,
-                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                      "andps\t{$src2, $dst|$dst, $src2}",
-                      [(set VR128:$dst, (v2i64
-                                         (and VR128:$src1, VR128:$src2)))]>;
-    def ORPSrr  : PSI<0x56, MRMSrcReg,
-                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                      "orps\t{$src2, $dst|$dst, $src2}",
-                      [(set VR128:$dst, (v2i64
-                                         (or VR128:$src1, VR128:$src2)))]>;
-    def XORPSrr : PSI<0x57, MRMSrcReg,
-                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                      "xorps\t{$src2, $dst|$dst, $src2}",
-                      [(set VR128:$dst, (v2i64
-                                         (xor VR128:$src1, VR128:$src2)))]>;
-  }
-
-  def ANDPSrm : PSI<0x54, MRMSrcMem,
-                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
-                    "andps\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
-                                       (memopv2i64 addr:$src2)))]>;
-  def ORPSrm  : PSI<0x56, MRMSrcMem,
-                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
-                    "orps\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
-                                       (memopv2i64 addr:$src2)))]>;
-  def XORPSrm : PSI<0x57, MRMSrcMem,
-                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
-                    "xorps\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
-                                       (memopv2i64 addr:$src2)))]>;
-  def ANDNPSrr : PSI<0x55, MRMSrcReg,
-                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                     "andnps\t{$src2, $dst|$dst, $src2}",
-                     [(set VR128:$dst,
-                       (v2i64 (and (xor VR128:$src1,
-                                    (bc_v2i64 (v4i32 immAllOnesV))),
-                               VR128:$src2)))]>;
-  def ANDNPSrm : PSI<0x55, MRMSrcMem,
-                     (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
-                     "andnps\t{$src2, $dst|$dst, $src2}",
-                     [(set VR128:$dst,
-                       (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
-                                    (bc_v2i64 (v4i32 immAllOnesV))),
-                               (memopv2i64 addr:$src2))))]>;
+  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
+                        "cvtsi2ss\t{$src2, $dst|$dst, $src2}">, XS;
+  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+                        "cvtsi2sd\t{$src2, $dst|$dst, $src2}">, XD;
 }
+// Instructions below don't have an AVX form.
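The same defm naming mechanics apply to the conversion intrinsics: each
sse12_cvt_sint instantiation above produces an rr and an rm record.
Hand-resolving Int_CVTSS2SI gives, as a sketch:

  def Int_CVTSS2SIrr : SI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                          "cvtss2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                            (int_x86_sse_cvtss2si VR128:$src))]>, XS;
  def Int_CVTSS2SIrm : SI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
                          "cvtss2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                            (int_x86_sse_cvtss2si (load addr:$src)))]>, XS;

which is essentially the hand-written Int_CVTSS2SIrr/Int_CVTSS2SIrm pair this
patch deletes earlier, modulo the SSI vs. SI + XS spelling of the prefix.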
+defm Int_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB; +defm Int_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; +defm Int_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB; +defm Int_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; +defm Int_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; let Constraints = "$src1 = $dst" in { - def CMPPSrri : PSIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc), - "cmp${cc}ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, - VR128:$src, imm:$cc))]>; - def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc), - "cmp${cc}ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, - (memop addr:$src), imm:$cc))]>; - - // Accept explicit immediate argument form instead of comparison code. -let isAsmParserOnly = 1 in { - def CMPPSrri_alt : PSIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2), - "cmpps\t{$src2, $src, $dst|$dst, $src, $src}", []>; - def CMPPSrmi_alt : PSIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2), - "cmpps\t{$src2, $src, $dst|$dst, $src, $src}", []>; + defm Int_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; } -} -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), - (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; - -// Shuffle and unpack instructions -let Constraints = "$src1 = $dst" in { - let isConvertibleToThreeAddress = 1 in // Convert to pshufd - def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, - VR128:$src2, i8imm:$src3), - "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>; - def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, - f128mem:$src2, i8imm:$src3), - "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v4f32 (shufp:$src3 - VR128:$src1, (memopv4f32 addr:$src2))))]>; - - let AddedComplexity = 10 in { - def UNPCKHPSrr : PSI<0x15, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "unpckhps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>; - def UNPCKHPSrm : PSI<0x15, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "unpckhps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (unpckh VR128:$src1, - (memopv4f32 addr:$src2))))]>; - - def UNPCKLPSrr : PSI<0x14, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "unpcklps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>; - def UNPCKLPSrm : PSI<0x14, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), 
- "unpcklps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>; - } // AddedComplexity -} // Constraints = "$src1 = $dst" - -// Mask creation -def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>; -def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>; - -// Prefetch intrinsic. -def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), - "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>; -def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), - "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>; -def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), - "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>; -def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), - "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; -// Non-temporal stores -def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; +/// SSE 1 Only -let AddedComplexity = 400 in { // Prefer non-temporal versions -def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; - -def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; - -def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "movnti\t{$src, $dst|$dst, $src}", - [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, - TB, Requires<[HasSSE2]>; - -def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "movnti\t{$src, $dst|$dst, $src}", - [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, - TB, Requires<[HasSSE2]>; +// Aliases for intrinsics +let isAsmParserOnly = 1, Pattern = []<dag> in { +defm Int_VCVTTSS2SI : sse12_cvt_sint_3addr<0x2C, VR128, GR32, + int_x86_sse_cvttss2si, f32mem, load, + "cvttss2si\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS; +defm Int_VCVTTSD2SI : sse12_cvt_sint_3addr<0x2C, VR128, GR32, + int_x86_sse2_cvttsd2si, f128mem, load, + "cvttss2si\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD; } - -// Load, store, and memory fence -def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, - TB, Requires<[HasSSE1]>; - -// MXCSR register -def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; -def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>; - -// Alias instructions that map zero vector to pxor / xorp* for sse. -// We set canFoldAsLoad because this can be converted to a constant-pool -// load of an all-zeros value if folding it would be beneficial. -// FIXME: Change encoding to pseudo! 
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1 in {
-def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
-def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v2f64 immAllZerosV))]>;
-let ExeDomain = SSEPackedInt in
-def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
-                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                       f32mem, load, "cvttss2si\t{$src, $dst|$dst, $src}">,
+                       XS;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                       f128mem, load, "cvttsd2si\t{$src, $dst|$dst, $src}">,
+                       XD;
+
+let isAsmParserOnly = 1, Pattern = []<dag> in {
+defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
+                             "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load,
+                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                             SSEPackedSingle>, TB, VEX;
}
-
-def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
-
-def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-          (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-
-//===---------------------------------------------------------------------===//
-// SSE2 Instructions
-//===---------------------------------------------------------------------===//
-
-// Move Instructions. Register-to-register movsd is not used for FR64
-// register copies because it's a partial register update; FsMOVAPDrr is
-// used instead. Register-to-register movsd is not modeled as an INSERT_SUBREG
-// because INSERT_SUBREG requires that the insert be implementable in terms of
-// a copy, and just mentioned, we don't use movsd for copies.
-let Constraints = "$src1 = $dst" in
-def MOVSDrr : SDI<0x10, MRMSrcReg,
-                  (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
-                  "movsd\t{$src2, $dst|$dst, $src2}",
-                  [(set (v2f64 VR128:$dst),
-                        (movl VR128:$src1, (scalar_to_vector FR64:$src2)))]>;
-
-// Extract the low 64-bit value from one vector and insert it into another.
-let AddedComplexity = 15 in
-def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2f64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
-
-// Loading from memory automatically zeroing upper bits.
-let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 20 in
-def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
-                  "movsd\t{$src, $dst|$dst, $src}",
-                  [(set FR64:$dst, (loadf64 addr:$src))]>;
-
-// MOVSDrm zeros the high parts of the register; represent this
-// with SUBREG_TO_REG.
-let AddedComplexity = 20 in {
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
-            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
-            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzload addr:$src)),
-            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+let Pattern = []<dag> in {
+defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
+                          "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load /*dummy*/,
+                          "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                          SSEPackedSingle>, TB; /* PD SSE3 form is available */
}

-// Store scalar value to memory.
-def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
-                  "movsd\t{$src, $dst|$dst, $src}",
-                  [(store FR64:$src, addr:$dst)]>;
-
-// Extract and store.
-def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
-                 addr:$dst),
-          (MOVSDmr addr:$dst,
-                   (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+/// SSE 2 Only

-// Conversion instructions
-def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
-                      "cvttsd2si\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
-def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
-                      "cvttsd2si\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+// Convert scalar double to scalar single
+let isAsmParserOnly = 1 in {
+def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR64:$src1, FR64:$src2),
+                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                       VEX_4V;
+def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+                     (ins FR64:$src1, f64mem:$src2),
+                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
+}
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))]>;
@@ -1226,35 +695,28 @@ def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
                  Requires<[HasSSE2, OptForSize]>;
-def CVTSI2SDrr  : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
-                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
-                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
-def CVTSI2SDrm  : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
-                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
-                      [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
-def CVTPD2DQrm  : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
-def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
-def CVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
-def CVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
-def CVTPS2DQrr  : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                      "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
-def CVTPS2DQrm  : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                      "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
-def 
CVTDQ2PSrr : PSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", []>; -def CVTDQ2PSrm : PSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", []>; -def COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "comisd\t{$src2, $src1|$src1, $src2}", []>; -def COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "comisd\t{$src2, $src1|$src1, $src2}", []>; - -// SSE2 instructions with XS prefix +let isAsmParserOnly = 1 in +defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, + int_x86_sse2_cvtsd2ss, f64mem, load, + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + XS, VEX_4V; +let Constraints = "$src1 = $dst" in +defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, + int_x86_sse2_cvtsd2ss, f64mem, load, + "cvtsd2ss\t{$src2, $dst|$dst, $src2}">, XS; + +// Convert scalar single to scalar double +let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR32:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, Requires<[HasAVX]>, VEX_4V; +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR32:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; +} def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fextend FR32:$src))]>, XS, @@ -1264,394 +726,51 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, Requires<[HasSSE2, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, - Requires<[HasSSE2, OptForSpeed]>; - -// Match intrinsics which expect XMM operand(s). -def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "cvtsd2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>; -def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src), - "cvtsd2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_cvtsd2si - (load addr:$src)))]>; - -// Match intrinsics which expect MM and XMM operand(s). 
-def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), - "cvtpd2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>; -def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src), - "cvtpd2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvtpd2pi - (memop addr:$src)))]>; -def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), - "cvttpd2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>; -def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src), - "cvttpd2pi\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (int_x86_sse_cvttpd2pi - (memop addr:$src)))]>; -def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), - "cvtpi2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>; -def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "cvtpi2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_cvtpi2pd - (load addr:$src)))]>; - -// Aliases for intrinsics -def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), - "cvttsd2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, - (int_x86_sse2_cvttsd2si VR128:$src))]>; -def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src), - "cvttsd2si\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_cvttsd2si - (load addr:$src)))]>; - -// Comparison instructions -let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { - def CMPSDrr : SDIi8<0xC2, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc), - "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>; -let mayLoad = 1 in - def CMPSDrm : SDIi8<0xC2, MRMSrcMem, - (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc), - "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>; - - // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1 in { - def CMPSDrr_alt : SDIi8<0xC2, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2), - "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>; -let mayLoad = 1 in - def CMPSDrm_alt : SDIi8<0xC2, MRMSrcMem, - (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2), - "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>; -} -} - -let Defs = [EFLAGS] in { -def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2), - "ucomisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86cmp FR64:$src1, FR64:$src2))]>; -def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2), - "ucomisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86cmp FR64:$src1, (loadf64 addr:$src2)))]>; -} // Defs = [EFLAGS] - -// Aliases to match intrinsics which expect XMM operand(s). 
-let Constraints = "$src1 = $dst" in { - def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), - (ins VR128:$src1, VR128:$src, SSECC:$cc), - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1, - VR128:$src, imm:$cc))]>; - def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), - (ins VR128:$src1, f64mem:$src, SSECC:$cc), - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1, - (load addr:$src), imm:$cc))]>; -} - -let Defs = [EFLAGS] in { -def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "ucomisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1), - VR128:$src2))]>; -def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2), - "ucomisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1), - (load addr:$src2)))]>; - -def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "comisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86comi (v2f64 VR128:$src1), - VR128:$src2))]>; -def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "comisd\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86comi (v2f64 VR128:$src1), - (load addr:$src2)))]>; -} // Defs = [EFLAGS] - -// Aliases of packed SSE2 instructions for scalar use. These all have names -// that start with 'Fs'. - -// Alias instructions that map fld0 to pxor for sse. -let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, - canFoldAsLoad = 1 in -def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasSSE2]>, TB, OpSize; - -// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are -// disregarded. -let neverHasSideEffects = 1 in -def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movapd\t{$src, $dst|$dst, $src}", []>; - -// Alias instruction to load FR64 from f128mem using movapd. Upper bits are -// disregarded. -let canFoldAsLoad = 1, isReMaterializable = 1 in -def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; - -// Alias bitwise logical operations using SSE logical ops on packed FP values. 
-let Constraints = "$src1 = $dst" in { -let isCommutable = 1 in { - def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst), - (ins FR64:$src1, FR64:$src2), - "andpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>; - def FsORPDrr : PDI<0x56, MRMSrcReg, (outs FR64:$dst), - (ins FR64:$src1, FR64:$src2), - "orpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>; - def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst), - (ins FR64:$src1, FR64:$src2), - "xorpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>; -} - -def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f128mem:$src2), - "andpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86fand FR64:$src1, - (memopfsf64 addr:$src2)))]>; -def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f128mem:$src2), - "orpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86for FR64:$src1, - (memopfsf64 addr:$src2)))]>; -def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f128mem:$src2), - "xorpd\t{$src2, $dst|$dst, $src2}", - [(set FR64:$dst, (X86fxor FR64:$src1, - (memopfsf64 addr:$src2)))]>; - -let neverHasSideEffects = 1 in { -def FsANDNPDrr : PDI<0x55, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - "andnpd\t{$src2, $dst|$dst, $src2}", []>; -let mayLoad = 1 in -def FsANDNPDrm : PDI<0x55, MRMSrcMem, - (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2), - "andnpd\t{$src2, $dst|$dst, $src2}", []>; -} -} - -/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms. -/// -/// In addition, we also have a special variant of the scalar form here to -/// represent the associated intrinsic operation. This form is unlike the -/// plain scalar form, in that it takes an entire vector (instead of a scalar) -/// and leaves the top elements unmodified (therefore these cannot be commuted). -/// -/// These three forms can each be reg+reg or reg+mem, so there are a total of -/// six "instructions". -/// -let Constraints = "$src1 = $dst" in { -multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int, - bit Commutable = 0> { - // Scalar operation, reg+reg. - def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> { - let isCommutable = Commutable; - } - - // Scalar operation, reg+mem. - def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f64mem:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; - - // Vector operation, reg+reg. - def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } - - // Vector operation, reg+mem. - def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>; - - // Intrinsic operation, reg+reg. - def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]>; - - // Intrinsic operation, reg+mem. 
- def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, - sse_load_f64:$src2))]>; +def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + VR128:$src2))]>, XS, VEX_4V, + Requires<[HasAVX]>; +def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + (load addr:$src2)))]>, XS, VEX_4V, + Requires<[HasAVX]>; } +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + VR128:$src2))]>, XS, + Requires<[HasSSE2]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + (load addr:$src2)))]>, XS, + Requires<[HasSSE2]>; } -// Arithmetic instructions -defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>; -defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>; -defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>; -defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>; - -/// sse2_fp_binop_rm - Other SSE2 binops -/// -/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of -/// instructions for a full-vector intrinsic form. Operations that map -/// onto C operators don't use this form since they just use the plain -/// vector form instead of having a separate vector intrinsic form. -/// -/// This provides a total of eight "instructions". -/// -let Constraints = "$src1 = $dst" in { -multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, - Intrinsic F64Int, - Intrinsic V2F64Int, - bit Commutable = 0> { - - // Scalar operation, reg+reg. - def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> { - let isCommutable = Commutable; - } - - // Scalar operation, reg+mem. - def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f64mem:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; - - // Vector operation, reg+reg. - def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } - - // Vector operation, reg+mem. - def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>; - - // Intrinsic operation, reg+reg. 
- def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } - - // Intrinsic operation, reg+mem. - def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, - sse_load_f64:$src2))]>; - - // Vector intrinsic operation, reg+reg. - def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, + Requires<[HasSSE2, OptForSpeed]>; - // Vector intrinsic operation, reg+mem. - def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (V2F64Int VR128:$src1, - (memopv2f64 addr:$src2)))]>; -} +// Convert doubleword to packed single/double fp +let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix +def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + TB, VEX, Requires<[HasAVX]>; +def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps + (bitconvert (memopv2i64 addr:$src))))]>, + TB, VEX, Requires<[HasAVX]>; } - -defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax, - int_x86_sse2_max_sd, int_x86_sse2_max_pd>; -defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin, - int_x86_sse2_min_sd, int_x86_sse2_min_pd>; - -//===---------------------------------------------------------------------===// -// SSE packed FP Instructions - -// Move Instructions -let neverHasSideEffects = 1 in -def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movapd\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>; - -def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(alignedstore (v2f64 VR128:$src), addr:$dst)]>; - -let neverHasSideEffects = 1 in -def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1 in -def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (loadv2f64 addr:$src))]>; -def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(store (v2f64 VR128:$src), addr:$dst)]>; - -// Intrinsic forms of MOVUPD load and store -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; -def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>; - -let Constraints = "$src1 = $dst" in { - let AddedComplexity = 20 
in {
-    def MOVLPDrm : PDI<0x12, MRMSrcMem,
-                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
-                       "movlpd\t{$src2, $dst|$dst, $src2}",
-                       [(set VR128:$dst,
-                         (v2f64 (movlp VR128:$src1,
-                                 (scalar_to_vector (loadf64 addr:$src2)))))]>;
-    def MOVHPDrm : PDI<0x16, MRMSrcMem,
-                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
-                       "movhpd\t{$src2, $dst|$dst, $src2}",
-                       [(set VR128:$dst,
-                         (v2f64 (movlhps VR128:$src1,
-                                 (scalar_to_vector (loadf64 addr:$src2)))))]>;
-  } // AddedComplexity
-} // Constraints = "$src1 = $dst"
-
-def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movlpd\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract (v2f64 VR128:$src),
-                                 (iPTR 0))), addr:$dst)]>;
-
-// v2f64 extract element 1 is always custom lowered to unpack high to low
-// and extract element 0 so the non-store version isn't too horrible.
-def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                   "movhpd\t{$src, $dst|$dst, $src}",
-                   [(store (f64 (vector_extract
-                                 (v2f64 (unpckh VR128:$src, (undef))),
-                                 (iPTR 0))), addr:$dst)]>;
-
-// SSE2 instructions without OpSize prefix
def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
@@ -1662,7 +781,18 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        (bitconvert (memopv2i64 addr:$src))))]>,
                      TB, Requires<[HasSSE2]>;

-// SSE2 instructions with XS prefix
+// FIXME: why is the non-intrinsic version described as SSE3?
+let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix
+def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+                     XS, VEX, Requires<[HasAVX]>;
+def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+                        (bitconvert (memopv2i64 addr:$src))))]>,
+                     XS, VEX, Requires<[HasAVX]>;
+}
def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
@@ -1673,6 +803,29 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        (bitconvert (memopv2i64 addr:$src))))]>,
                     XS, Requires<[HasSSE2]>;

+// Convert packed single/double fp to doubleword
+let isAsmParserOnly = 1 in {
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+
+let isAsmParserOnly = 1 in {
+def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>,
+                        VEX;
+def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
+                        (ins f128mem:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+                                            (memop addr:$src)))]>, VEX;
+}
def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
(int_x86_sse2_cvtps2dq VR128:$src))]>; @@ -1680,12 +833,54 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>; -// SSE2 packed instructions with XS prefix + +let isAsmParserOnly = 1 in { // SSE2 packed instructions with XD prefix +def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + XD, VEX, Requires<[HasAVX]>; +def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq + (memop addr:$src)))]>, + XD, VEX, Requires<[HasAVX]>; +} +def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + XD, Requires<[HasSSE2]>; +def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq + (memop addr:$src)))]>, + XD, Requires<[HasSSE2]>; + + +// Convert with truncation packed single/double fp to doubleword +let isAsmParserOnly = 1 in { // SSE2 packed instructions with XS prefix +def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +} def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>; def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>; + +let isAsmParserOnly = 1 in { +def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))]>, + XS, VEX, Requires<[HasAVX]>; +def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (memop addr:$src)))]>, + XS, VEX, Requires<[HasAVX]>; +} def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1697,17 +892,18 @@ def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (memop addr:$src)))]>, XS, Requires<[HasSSE2]>; -// SSE2 packed instructions with XD prefix -def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, - XD, Requires<[HasSSE2]>; -def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))]>, - XD, Requires<[HasSSE2]>; - +let isAsmParserOnly = 1 in { +def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>, + VEX; +def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memop addr:$src)))]>, VEX; +} def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs 
VR128:$dst), (ins VR128:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
@@ -1716,12 +912,31 @@ def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                             (memop addr:$src)))]>;

-// SSE2 instructions without OpSize prefix
+// Convert packed single to packed double
+let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX,
+                    Requires<[HasAVX]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX,
+                    Requires<[HasAVX]>;
+}
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;

+let isAsmParserOnly = 1 in {
+def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+                     VEX, Requires<[HasAVX]>;
+def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+                                          (load addr:$src)))]>,
+                     VEX, Requires<[HasAVX]>;
+}
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
@@ -1732,12 +947,29 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                                          (load addr:$src)))]>,
                     TB, Requires<[HasSSE2]>;

+// Convert packed double to packed single
+let isAsmParserOnly = 1 in {
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX;
+// FIXME: the memory form of this instruction should be described using
+// extra asm syntax
+}
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;

+let isAsmParserOnly = 1 in {
+def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
+                         (ins f128mem:$src),
+                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+                                            (memop addr:$src)))]>;
+}
def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
@@ -1746,269 +978,1039 @@ def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
                                            (memop addr:$src)))]>;
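// Editorial note (not part of this patch): the reason the VEX memory form is
// deferred is that a bare memory operand is ambiguous under AVX, e.g.
//
//   vcvtpd2ps (%rax), %xmm0    // 128-bit or 256-bit source?
//
// so it is expected to need size-suffixed mnemonics (cvtpd2psx / cvtpd2psy in
// some assemblers) rather than the plain asm string used above.

-// Match intrinsics which expect XMM operand(s).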
-// Aliases for intrinsics
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            string asm, string asm_alt> {
+  def rr : SIi8<0xC2, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc),
+                asm, []>;
+  let mayLoad = 1 in
+  def rm : SIi8<0xC2, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc),
+                asm, []>;
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1 in {
+    def rr_alt : SIi8<0xC2, MRMSrcReg,
+                  (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2),
+                  asm_alt, []>;
+    let mayLoad = 1 in
+    def rm_alt : SIi8<0xC2, MRMSrcMem,
+                  (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2),
+                  asm_alt, []>;
+  }
+}
+
+let neverHasSideEffects = 1, isAsmParserOnly = 1 in {
+  defm VCMPSS  : sse12_cmp_scalar<FR32, f32mem,
+                  "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                  "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
+                  XS, VEX_4V;
+  defm VCMPSD  : sse12_cmp_scalar<FR64, f64mem,
+                  "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                  "cmpsd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
+                  XD, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+  defm CMPSS  : sse12_cmp_scalar<FR32, f32mem,
+                    "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                    "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS;
+  defm CMPSD  : sse12_cmp_scalar<FR64, f64mem,
+                    "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                    "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD;
+}
+
+multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
+                                Intrinsic Int, string asm> {
+  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src, SSECC:$cc), asm,
+                      [(set VR128:$dst, (Int VR128:$src1,
+                                             VR128:$src, imm:$cc))]>;
+  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, x86memop:$src, SSECC:$cc), asm,
+                      [(set VR128:$dst, (Int VR128:$src1,
+                                             (load addr:$src), imm:$cc))]>;
+}
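// Illustrative expansion (editorial sketch, not part of this patch): for the
// SS instantiation below, and under its "$src1 = $dst" constraint,
// sse12_cmp_scalar_int yields, e.g., the register form
//
//   def Int_CMPSSrr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
//                          (ins VR128:$src1, VR128:$src, SSECC:$cc),
//                          "cmp${cc}ss\t{$src, $dst|$dst, $src}",
//                          [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
//                                             VR128:$src, imm:$cc))]>, XS;
//
// i.e. the whole XMM register flows through the node, so the untouched upper
// elements are preserved, matching the intrinsic's semantics.

+// Aliases to match intrinsics which expect XMM operand(s).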
+let isAsmParserOnly = 1 in { + defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; + defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; +} let Constraints = "$src1 = $dst" in { -def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, GR32:$src2), - "cvtsi2sd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1, - GR32:$src2))]>; -def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2), - "cvtsi2sd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1, - (loadi32 addr:$src2)))]>; -def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "cvtsd2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, - VR128:$src2))]>; -def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), - "cvtsd2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, - (load addr:$src2)))]>; -def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "cvtss2sd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - VR128:$src2))]>, XS, - Requires<[HasSSE2]>; -def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), - "cvtss2sd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - (load addr:$src2)))]>, XS, - Requires<[HasSSE2]>; + defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}">, XD; +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr, Domain d> { + def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], d>; + def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))], d>; +} + +let Defs = [EFLAGS] in { + let isAsmParserOnly = 1 in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, VEX; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, OpSize, VEX; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, VEX; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, OpSize, VEX; + } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, 
"comisd", SSEPackedDouble>, OpSize, VEX; + } + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, TB; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, TB, OpSize; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; + } + + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, TB; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, TB, OpSize; + + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compared packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, + Intrinsic Int, string asm, string asm_alt, + Domain d> { + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>; + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), + asm_alt, [], d>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2), + asm_alt, [], d>; + } +} + +let isAsmParserOnly = 1 in { + defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; + defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; +} +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $dst|$dst, $src}", + "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}", + SSEPackedSingle>, TB; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $dst|$dst, $src}", + "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", + SSEPackedDouble>, TB, OpSize; +} + +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Shuffle Instructions +//===----------------------------------------------------------------------===// + +/// sse12_shuffle - sse 1 & 2 shuffle instructions 
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+                         ValueType vt, string asm, PatFrag mem_frag,
+                         Domain d, bit IsConvertibleToThreeAddress = 0> {
+  def rmi : PIi8<0xC6, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, f128mem:$src2, i8imm:$src3), asm,
+                   [(set VR128:$dst, (vt (shufp:$src3
+                            VR128:$src1, (mem_frag addr:$src2))))], d>;
+  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+  def rri : PIi8<0xC6, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, i8imm:$src3), asm,
+                   [(set VR128:$dst,
+                            (vt (shufp:$src3 VR128:$src1, VR128:$src2)))], d>;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+             "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+             memopv4f32, SSEPackedSingle>, VEX_4V;
+  defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+             "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+             memopv2f64, SSEPackedDouble>, OpSize, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
+                    TB;
+  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv2f64, SSEPackedDouble>, TB, OpSize;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))], d>;
+    def rm : PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1,
+                                       (mem_frag addr:$src2))))], d>;
+}
+
+let AddedComplexity = 10 in {
+  let isAsmParserOnly = 1 in {
+    defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
+          VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedSingle>, VEX_4V;
+    defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
+          VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedDouble>, OpSize, VEX_4V;
+    defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
+          VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedSingle>, VEX_4V;
+    defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
+          VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedDouble>, OpSize, VEX_4V;
+
+    defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
+          VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedSingle>, VEX_4V;
+    defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
+          VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedDouble>, OpSize, VEX_4V;
+    defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
+          VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedSingle>, VEX_4V;
+    defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
+          VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          SSEPackedDouble>, OpSize, VEX_4V;
+  }
+
+  let Constraints = "$src1 = $dst" in {
+    defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
+          VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+          SSEPackedSingle>, TB;
+    defm UNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
+          VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+          SSEPackedDouble>, TB, OpSize;
+    defm UNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
+          VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+          SSEPackedSingle>, TB;
+    defm UNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
+          VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+          SSEPackedDouble>, TB, OpSize;
+  } // Constraints = "$src1 = $dst"
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 sign mask extraction
+multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
+                                Domain d> {
+  def rr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set GR32:$dst, (Int RC:$src))], d>;
+}
+
+// Mask creation
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+                                     SSEPackedSingle>, TB;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+                                     SSEPackedDouble>, TB, OpSize;
+
+let isAsmParserOnly = 1 in {
+  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+                                        "movmskps", SSEPackedSingle>, VEX;
+  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+                                        "movmskpd", SSEPackedDouble>, OpSize,
+                                        VEX;
+  // FIXME: merge with multiclass above when the intrinsics come.
+  def VMOVMSKPSYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
+  def VMOVMSKPDYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
+             VEX;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions
+//===----------------------------------------------------------------------===//
+
+// Aliases of packed SSE1 & SSE2 instructions for scalar use. These all have
+// names that start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in {
+  // FIXME: Set encoding to pseudo!
+def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+                 [(set FR32:$dst, fp32imm0)]>,
+                 Requires<[HasSSE1]>, TB, OpSize;
+def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+                 [(set FR64:$dst, fpimm0)]>,
+                 Requires<[HasSSE2]>, TB, OpSize;
}

-// Arithmetic

+// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
+// bits are disregarded.
+let neverHasSideEffects = 1 in {
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                     "movaps\t{$src, $dst|$dst, $src}", []>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                     "movapd\t{$src, $dst|$dst, $src}", []>;
+}
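// Editorial note (not part of this patch): FsMOVAPSrr/FsMOVAPDrr give the
// register allocator a full-width reg-to-reg copy for scalar values, e.g.
//
//   movaps %xmm1, %xmm0
//
// Copying all 128 bits is safe here because an FR32/FR64 value only ever
// lives in the low element, so whatever lands in the upper bits is ignored.

+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded.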
+let canFoldAsLoad = 1, isReMaterializable = 1 in { +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; +def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops +/// +multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let isAsmParserOnly = 1 in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32, SSEPackedSingle>, TB; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64, SSEPackedDouble>, TB, OpSize; + } +} + +// Alias bitwise logical operations using SSE logical ops on packed FP values. +let mayLoad = 0 in { + defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>; + defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>; + defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>; +} + +let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>; + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// +multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode, int HasPat = 0, + list<list<dag>> Pattern = []> { + let isAsmParserOnly = 1, Pattern = []<dag> in { + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + !if(HasPat, Pattern[0], // rr + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, + VR128:$src2)))]), + !if(HasPat, Pattern[2], // rm + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))]), 0>, + VEX_4V; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + !if(HasPat, Pattern[1], // rr + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 + VR128:$src2))))]), + !if(HasPat, Pattern[3], // rm + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))]), 0>, + OpSize, VEX_4V; + } + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + !if(HasPat, Pattern[0], // rr + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, + VR128:$src2)))]), + !if(HasPat, Pattern[2], // rm + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))])>, TB; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + !if(HasPat, Pattern[1], // rr + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 + VR128:$src2))))]), + !if(HasPat, Pattern[3], // rm + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + 
(memopv2i64 addr:$src2)))])>, + TB, OpSize; + } +} + +/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms +/// +let isAsmParserOnly = 1 in { +multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { + defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V; + + defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V; +} +} + +// AVX 256-bit packed logical ops forms +defm VAND : sse12_fp_packed_logical_y<0x54, "and">; +defm VOR : sse12_fp_packed_logical_y<0x56, "or">; +defm VXOR : sse12_fp_packed_logical_y<0x57, "xor">; +let isCommutable = 0 in + defm VANDN : sse12_fp_packed_logical_y<0x55, "andn">; + +defm AND : sse12_fp_packed_logical<0x54, "and", and>; +defm OR : sse12_fp_packed_logical<0x56, "or", or>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", undef /* dummy */, 1, [ + // single r+r + [(set VR128:$dst, (v2i64 (and (xor VR128:$src1, + (bc_v2i64 (v4i32 immAllOnesV))), + VR128:$src2)))], + // double r+r + [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), + (bc_v2i64 (v2f64 VR128:$src2))))], + // single r+m + [(set VR128:$dst, (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)), + (bc_v2i64 (v4i32 immAllOnesV))), + (memopv2i64 addr:$src2))))], + // double r+m + [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), + (memopv2i64 addr:$src2)))]]>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). +/// +/// These three forms can each be reg+reg or reg+mem. 
+/// +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, Is2Addr>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, Is2Addr>, XD; +} + +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + let mayLoad = 0 in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize; + } +} + +multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let mayLoad = 0 in { + defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, + v8f32, f256mem, memopv8f32, SSEPackedSingle, 0>, TB; + defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, + v4f64, f256mem, memopv4f64, SSEPackedDouble, 0>, TB, OpSize; + } +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, Is2Addr>, XD; +} + +multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr, + bit Is2Addr = 1> { + defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ps"), "", "_ps", f128mem, memopv4f32, + SSEPackedSingle, Is2Addr>, TB; + + defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "pd"), "2", "_pd", f128mem, memopv2f64, + SSEPackedDouble, Is2Addr>, TB, OpSize; +} + +// Binary Arithmetic instructions +let isAsmParserOnly = 1 in { + defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; + defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; + + let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; + } +} + +let Constraints = "$src1 = $dst" in { + defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>, + basic_sse12_fp_binop_p<0x58, "add", fadd>, + basic_sse12_fp_binop_s_int<0x58, "add">; + defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>, + basic_sse12_fp_binop_p<0x59, "mul", fmul>, + basic_sse12_fp_binop_s_int<0x59, "mul">; + + let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub>, + basic_sse12_fp_binop_s_int<0x5C, "sub">; + defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>, + 
basic_sse12_fp_binop_p<0x5E, "div", fdiv>, + basic_sse12_fp_binop_s_int<0x5E, "div">; + defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_s_int<0x5F, "max">, + basic_sse12_fp_binop_p_int<0x5F, "max">; + defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_s_int<0x5D, "min">, + basic_sse12_fp_binop_p_int<0x5D, "min">; + } +} + +/// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the /// plain scalar form, in that it takes an entire vector (instead of a /// scalar) and leaves the top elements undefined. /// /// And, we have a special variant form for a full-vector intrinsic form. -/// -/// These four forms can each have a reg or a mem operand, so there are a -/// total of eight "instructions". -/// -multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, - Intrinsic F64Int, - Intrinsic V2F64Int, - bit Commutable = 0> { - // Scalar operation, reg. + +/// sse1_fp_unop_s - SSE1 unops in scalar form. +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int> { + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]>; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS, + Requires<[HasSSE1, OptForSize]>; + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int VR128:$src))]>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; +} + +/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form. +multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int> { + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, XS, Requires<[HasAVX, OptForSize]>; + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +/// sse1_fp_unop_p - SSE1 unops in packed form. 
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>; +} + +/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form. +multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>; + def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>; +} + +/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V4F32Int> { + def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))]>; + def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>; +} + + +/// sse2_fp_unop_s - SSE2 unops in scalar form. +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int> { def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode FR64:$src))]> { - let isCommutable = Commutable; - } - - // Scalar operation, mem. - def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), + [(set FR64:$dst, (OpNode FR64:$src))]>; + // See the comments in sse1_fp_unop_s for why this is OptForSize. + def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode (load addr:$src)))]>; + [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD, + Requires<[HasSSE2, OptForSize]>; + def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int VR128:$src))]>; + def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; +} + +/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. +multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int> { + def SDr : VSDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDm : VSDI<opc, MRMSrcMem, (outs FR64:$dst), + (ins FR64:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDr_Int : VSDI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def SDm_Int : VSDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, sdmem:$src2), + !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; +} - // Vector operation, reg. +/// sse2_fp_unop_p - SSE2 unops in vector forms. 
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode> { def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> { - let isCommutable = Commutable; - } - - // Vector operation, mem. + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>; def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>; +} - // Intrinsic operation, reg. - def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int VR128:$src))]> { - let isCommutable = Commutable; - } - - // Intrinsic operation, mem. - def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; +/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms. +multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>; + def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>; +} - // Vector intrinsic operation, reg +/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms. +multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V2F64Int> { def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V2F64Int VR128:$src))]> { - let isCommutable = Commutable; - } - - // Vector intrinsic operation, mem + [(set VR128:$dst, (V2F64Int VR128:$src))]>; def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + // Square root. + defm VSQRT : sse1_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, + sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, + VEX_4V; + + defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + VEX; + + // Reciprocal approximations. Note that these typically require refinement + // in order to obtain suitable precision. + defm VRSQRT : sse1_fp_unop_s_avx<0x52, "rsqrt", X86frsqrt, + int_x86_sse_rsqrt_ss>, VEX_4V; + defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>, + sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, VEX; + + defm VRCP : sse1_fp_unop_s_avx<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, + VEX_4V; + defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>, + sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, VEX; +} + // Square root. 
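Before the square-root and reciprocal defs that follow: the "requires refinement" comment attached to the reciprocal approximations (above for AVX, repeated below for the SSE forms) is the usual caveat that RSQRTPS and RCPPS return roughly 12-bit estimates, and one Newton-Raphson step recovers close to full single precision. A minimal sketch with standard SSE intrinsics (the helper name is made up; not part of this patch):

#include <xmmintrin.h>  // SSE1
#include <cstdio>

// One Newton-Raphson step on the RSQRTPS estimate:
//   x' = x * (1.5 - 0.5 * a * x * x)
static __m128 rsqrt_refined(__m128 a) {
  __m128 x   = _mm_rsqrt_ps(a);                  // ~12-bit estimate
  __m128 axx = _mm_mul_ps(_mm_mul_ps(a, x), x);  // a * x * x
  return _mm_mul_ps(x, _mm_sub_ps(_mm_set1_ps(1.5f),
                                  _mm_mul_ps(_mm_set1_ps(0.5f), axx)));
}

int main() {
  float out[4];
  _mm_storeu_ps(out, rsqrt_refined(_mm_set1_ps(4.0f)));
  printf("%f\n", out[0]);  // ~0.5
}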
-defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt, - int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>; +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt>, + sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt>, + sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, + sse1_fp_unop_p<0x53, "rcp", X86frcp>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>; // There is no f64 version of the reciprocal approximation instructions. -// Logical -let Constraints = "$src1 = $dst" in { - let isCommutable = 1 in { - def ANDPDrr : PDI<0x54, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "andpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (and (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; - def ORPDrr : PDI<0x56, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "orpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (or (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; - def XORPDrr : PDI<0x57, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "xorpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (xor (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))]>; - } +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// - def ANDPDrm : PDI<0x54, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "andpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (and (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))]>; - def ORPDrm : PDI<0x56, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "orpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (or (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))]>; - def XORPDrm : PDI<0x57, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "xorpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (xor (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))]>; - def ANDNPDrr : PDI<0x55, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "andnpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (bc_v2i64 (v2f64 VR128:$src2))))]>; - def ANDNPDrm : PDI<0x55, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2), - "andnpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (memopv2i64 addr:$src2)))]>; +let isAsmParserOnly = 1 in { + def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; + def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; + + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), 
+ "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; + + let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + } } -let Constraints = "$src1 = $dst" in { - def CMPPDrri : PDIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc), - "cmp${cc}pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1, - VR128:$src, imm:$cc))]>; - def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc), - "cmp${cc}pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1, - (memop addr:$src), imm:$cc))]>; +def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; +def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; + +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; + +def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + 
"movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; + +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; - // Accept explicit immediate argument form instead of comparison code. -let isAsmParserOnly = 1 in { - def CMPPDrri_alt : PDIi8<0xC2, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2), - "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>; - def CMPPDrmi_alt : PDIi8<0xC2, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2), - "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>; } +def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, + TB, Requires<[HasSSE2]>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Misc Instructions (No AVX form) +//===----------------------------------------------------------------------===// + +// Prefetch intrinsic. +def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>; +def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>; +def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>; +def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; + +// Load, store, and memory fence +def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, + TB, Requires<[HasSSE1]>; + +// Alias instructions that map zero vector to pxor / xorp* for sse. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. +// FIXME: Change encoding to pseudo! 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1 in { +def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v2f64 immAllZerosV))]>; +let ExeDomain = SSEPackedInt in +def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllZerosV))]>; } -def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; -// Shuffle and unpack instructions -let Constraints = "$src1 = $dst" in { - def SHUFPDrri : PDIi8<0xC6, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>; - def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, - f128mem:$src2, i8imm:$src3), - "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v2f64 (shufp:$src3 - VR128:$src1, (memopv2f64 addr:$src2))))]>; +def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; - let AddedComplexity = 10 in { - def UNPCKHPDrr : PDI<0x15, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "unpckhpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>; - def UNPCKHPDrm : PDI<0x15, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "unpckhpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2f64 (unpckh VR128:$src1, - (memopv2f64 addr:$src2))))]>; - - def UNPCKLPDrr : PDI<0x14, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "unpcklpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>; - def UNPCKLPDrm : PDI<0x14, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "unpcklpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>; - } // AddedComplexity -} // Constraints = "$src1 = $dst" +def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store XCSR register +//===----------------------------------------------------------------------===// +let isAsmParserOnly = 1 in { + def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; + def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; +} + +def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; +def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>; //===---------------------------------------------------------------------===// -// SSE integer instructions -let ExeDomain = SSEPackedInt in { +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let isAsmParserOnly = 1 in { + 
let neverHasSideEffects = 1 in + def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; + + let canFoldAsLoad = 1, mayLoad = 1 in { + def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>, + VEX; + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, + XS, VEX, Requires<[HasAVX]>; + } + + let mayStore = 1 in { + def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>, VEX; + def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, + XS, VEX, Requires<[HasAVX]>; + } +} -// Move Instructions let neverHasSideEffects = 1 in def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", []>; -let canFoldAsLoad = 1, mayLoad = 1 in + +let canFoldAsLoad = 1, mayLoad = 1 in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; -let mayStore = 1 in -def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", - [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; -let canFoldAsLoad = 1, mayLoad = 1 in def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, XS, Requires<[HasSSE2]>; -let mayStore = 1 in +} + +let mayStore = 1 in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, XS, Requires<[HasSSE2]>; +} // Intrinsic forms of MOVDQU load and store +let isAsmParserOnly = 1 in { +let canFoldAsLoad = 1 in +def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, + XS, VEX, Requires<[HasAVX]>; +def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + XS, VEX, Requires<[HasAVX]>; +} + let canFoldAsLoad = 1 in def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", @@ -2019,55 +2021,72 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, Requires<[HasSSE2]>; -let Constraints = "$src1 = $dst" in { +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions multiclass 
PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, - bit Commutable = 0> { + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv2i64 - addr:$src2))))]>; + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, + (bitconvert (memopv2i64 addr:$src2))))]>; } multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, - string OpcodeStr, - Intrinsic IntId, Intrinsic IntId2> { + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>; def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), - (ins VR128:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>; + (ins VR128:$src1, i32i8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>; } /// PDI_binop_rm - Simple SSE2 binary operator. 
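Each of these binop multiclasses expands to an rr form, with both operands in registers, and an rm form whose second operand is folded from memory. Roughly, at the intrinsics level (standard SSE2 intrinsics; the function names are made up for illustration):

#include <emmintrin.h>
#include <cstdio>

__m128i paddw_rr(__m128i a, __m128i b) {
  return _mm_add_epi16(a, b);                  // both operands in registers
}
__m128i paddw_rm(__m128i a, const __m128i *p) {
  // The load feeds straight into the add, so instruction selection can
  // fold it into the memory form of PADDW.
  return _mm_add_epi16(a, _mm_load_si128(p));
}

int main() {
  __m128i one = _mm_set1_epi16(1), two = _mm_set1_epi16(2);
  short out[8];
  _mm_storeu_si128((__m128i *)out, paddw_rr(one, two));
  printf("%d\n", (int)out[0]);                 // prints 3
}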
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, bit Commutable = 0> { + ValueType OpVT, bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>; def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)))))]>; } @@ -2077,64 +2096,177 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, /// to collapse (bitconvert VT to VT) into its operand. /// multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit Commutable = 0> { + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> { - let isCommutable = Commutable; - } + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]>; def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, - (memopv2i64 addr:$src2)))]>; + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>; } -} // Constraints = "$src1 = $dst" } // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic -defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>; -defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>; -defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>; -defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; - -defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>; -defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>; -defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>; -defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; +defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; +defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; +defm VPADDQ : PDI_binop_rm_v2i64<0xD4, "vpaddq", add, 1, 0>, VEX_4V; +defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, 1, 0>, VEX_4V; +defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, 0, 0>, VEX_4V; +defm 
VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, 0, 0>, VEX_4V; +defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, 0, 0>, VEX_4V; +defm VPSUBQ : PDI_binop_rm_v2i64<0xFB, "vpsubq", sub, 0, 0>, VEX_4V; + +// Intrinsic forms +defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, 0, 0>, + VEX_4V; +defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, 0, 0>, + VEX_4V; +defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, 0, 0>, + VEX_4V; +defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, 0, 0>, + VEX_4V; +defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, 1, 0>, + VEX_4V; +defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, 1, 0>, + VEX_4V; +defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, 1, 0>, + VEX_4V; +defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, 1, 0>, + VEX_4V; +defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, 1, 0>, + VEX_4V; +defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, 1, 0>, + VEX_4V; +defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq, 1, 0>, + VEX_4V; +defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, 1, 0>, + VEX_4V; +defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, 1, 0>, + VEX_4V; +defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, 1, 0>, + VEX_4V; +defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, 1, 0>, + VEX_4V; +defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, 1, 0>, + VEX_4V; +defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, 1, 0>, + VEX_4V; +defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, 1, 0>, + VEX_4V; +defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, 1, 0>, + VEX_4V; +} +let Constraints = "$src1 = $dst" in { +defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>; +defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>; +defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>; +defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; +defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>; defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>; defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>; defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>; defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>; +// Intrinsic forms defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>; defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>; defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>; defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>; - -defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>; - +defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>; +defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>; +defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>; +defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>; defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>; -defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>; +defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, 1>; defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>; - defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>; +defm 
PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>; +defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>; +defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>; +defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>; +defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>; +defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>; +defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; + +} // Constraints = "$src1 = $dst" -defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>; -defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>; +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", + int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, + VEX_4V; +defm VPSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld", + int_x86_sse2_psll_d, int_x86_sse2_pslli_d, 0>, + VEX_4V; +defm VPSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq", + int_x86_sse2_psll_q, int_x86_sse2_pslli_q, 0>, + VEX_4V; + +defm VPSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw", + int_x86_sse2_psrl_w, int_x86_sse2_psrli_w, 0>, + VEX_4V; +defm VPSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld", + int_x86_sse2_psrl_d, int_x86_sse2_psrli_d, 0>, + VEX_4V; +defm VPSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq", + int_x86_sse2_psrl_q, int_x86_sse2_psrli_q, 0>, + VEX_4V; + +defm VPSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw", + int_x86_sse2_psra_w, int_x86_sse2_psrai_w, 0>, + VEX_4V; +defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad", + int_x86_sse2_psra_d, int_x86_sse2_psrai_d, 0>, + VEX_4V; + +defm VPAND : PDI_binop_rm_v2i64<0xDB, "vpand", and, 1, 0>, VEX_4V; +defm VPOR : PDI_binop_rm_v2i64<0xEB, "vpor" , or, 1, 0>, VEX_4V; +defm VPXOR : PDI_binop_rm_v2i64<0xEF, "vpxor", xor, 1, 0>, VEX_4V; -defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>; -defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>; -defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>; -defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>; -defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; +let ExeDomain = SSEPackedInt in { + let neverHasSideEffects = 1 in { + // 128-bit logical shifts. + def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. 
+ } + def VPANDNrr : PDI<0xDF, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + VR128:$src2)))]>, VEX_4V; + def VPANDNrm : PDI<0xDF, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + (memopv2i64 addr:$src2))))]>, + VEX_4V; +} +} +let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w>; defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", @@ -2154,17 +2286,34 @@ defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; -// 128-bit logical shifts. -let Constraints = "$src1 = $dst", neverHasSideEffects = 1, - ExeDomain = SSEPackedInt in { - def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "pslldq\t{$src2, $dst|$dst, $src2}", []>; - def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "psrldq\t{$src2, $dst|$dst, $src2}", []>; - // PSRADQri doesn't exist in SSE[1-3]. +defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; +defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or, 1>; +defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; + +let ExeDomain = SSEPackedInt in { + let neverHasSideEffects = 1 in { + // 128-bit logical shifts. + def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", []>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", []>; + // PSRADQri doesn't exist in SSE[1-3]. 
+ } + def PANDNrr : PDI<0xDF, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "pandn\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + VR128:$src2)))]>; + + def PANDNrm : PDI<0xDF, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "pandn\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + (memopv2i64 addr:$src2))))]>; } +} // Constraints = "$src1 = $dst" let Predicates = [HasSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), @@ -2185,32 +2334,33 @@ let Predicates = [HasSSE2] in { (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; } -// Logical -defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; -defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>; -defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; - -let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in { - def PANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), - VR128:$src2)))]>; +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// - def PANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), - (memopv2i64 addr:$src2))))]>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, + 0>, VEX_4V; + defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, + 0>, VEX_4V; + defm VPCMPEQD : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_sse2_pcmpeq_d, 1, + 0>, VEX_4V; + defm VPCMPGTB : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_sse2_pcmpgt_b, 0, + 0>, VEX_4V; + defm VPCMPGTW : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_sse2_pcmpgt_w, 0, + 0>, VEX_4V; + defm VPCMPGTD : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0, + 0>, VEX_4V; } -// SSE2 Integer comparison -defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>; -defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>; -defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>; -defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; -defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; -defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; +let Constraints = "$src1 = $dst" in { + defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b, 1>; + defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w, 1>; + defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d, 1>; + defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; + defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; + defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; +} // Constraints = "$src1 = $dst" def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), (PCMPEQBrr VR128:$src1, VR128:$src2)>; @@ -2238,94 +2388,147 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), (PCMPGTDrm VR128:$src1, addr:$src2)>; +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Pack Instructions 
+//===---------------------------------------------------------------------===// -// Pack instructions +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, + 0, 0>, VEX_4V; +defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, + 0, 0>, VEX_4V; +defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128, + 0, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +} // Constraints = "$src1 = $dst" + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions +//===---------------------------------------------------------------------===// let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, PatFrag pshuf_frag, + PatFrag bc_frag> { +def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (vt (pshuf_frag:$src2 VR128:$src1, + (undef))))]>; +def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (vt (pshuf_frag:$src2 + (bc_frag (memopv2i64 addr:$src1)), + (undef))))]>; +} +} // ExeDomain = SSEPackedInt -// Shuffle and unpack instructions -let AddedComplexity = 5 in { -def PSHUFDri : PDIi8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), - "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v4i32 (pshufd:$src2 - VR128:$src1, (undef))))]>; -def PSHUFDmi : PDIi8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), - "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v4i32 (pshufd:$src2 - (bc_v4i32 (memopv2i64 addr:$src1)), - (undef))))]>; -} - -// SSE2 with ImmT == Imm8 and XS prefix. -def PSHUFHWri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), - "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1, - (undef))))]>, - XS, Requires<[HasSSE2]>; -def PSHUFHWmi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), - "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v8i16 (pshufhw:$src2 - (bc_v8i16 (memopv2i64 addr:$src1)), - (undef))))]>, - XS, Requires<[HasSSE2]>; - -// SSE2 with ImmT == Imm8 and XD prefix. -def PSHUFLWri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), - "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1, - (undef))))]>, - XD, Requires<[HasSSE2]>; -def PSHUFLWmi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), - "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v8i16 (pshuflw:$src2 - (bc_v8i16 (memopv2i64 addr:$src1)), - (undef))))]>, - XD, Requires<[HasSSE2]>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + let AddedComplexity = 5 in + defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, + VEX; + // SSE2 with ImmT == Imm8 and XS prefix. 
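Before the pshufhw/pshuflw variants, it may help to spell out the imm8 operand these shuffles take: four 2-bit fields, each selecting a source element for one result lane. A sketch with standard SSE2 intrinsics (illustration only, not part of this patch):

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128i v = _mm_setr_epi32(10, 11, 12, 13);
  // _MM_SHUFFLE(3,2,1,0) is the identity; (0,1,2,3) reverses the dwords.
  __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 13 12 11 10
}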
+ defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, pshufhw, bc_v8i16>, XS, + VEX; -let Constraints = "$src1 = $dst" in { - def PUNPCKLBWrr : PDI<0x60, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpcklbw\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v16i8 (unpckl VR128:$src1, VR128:$src2)))]>; - def PUNPCKLBWrm : PDI<0x60, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpcklbw\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckl VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2))))]>; - def PUNPCKLWDrr : PDI<0x61, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpcklwd\t{$src2, $dst|$dst, $src2}", + // SSE2 with ImmT == Imm8 and XD prefix. + defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD, + VEX; +} + +let Predicates = [HasSSE2] in { + let AddedComplexity = 5 in + defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize; + + // SSE2 with ImmT == Imm8 and XS prefix. + defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, pshufhw, bc_v8i16>, XS; + + // SSE2 with ImmT == Imm8 and XD prefix. + defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + PatFrag unp_frag, PatFrag bc_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (vt (unp_frag VR128:$src1, VR128:$src2)))]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (unp_frag VR128:$src1, + (bc_frag (memopv2i64 + addr:$src2))))]>; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8, + 0>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16, + 0>, VEX_4V; + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, unpckl, bc_v4i32, + 0>, VEX_4V; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. 
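+  /// A hypothetical instantiation such as
+  ///   sse2_unpack<0x6C, "vpunpcklqdq", v2i64, unpckl, bc_v2i64, 0>
+  /// would give the memory form the pattern
+  ///   (unpckl VR128:$src1, (bc_v2i64 (memopv2i64 addr:$src2)))
+  /// and the v2i64-to-v2i64 bitconvert is never folded away, so the pattern
+  /// would not match; the qdq forms below are therefore written by hand.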
+ def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v8i16 (unpckl VR128:$src1, VR128:$src2)))]>; - def PUNPCKLWDrm : PDI<0x61, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpcklwd\t{$src2, $dst|$dst, $src2}", + (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>, VEX_4V; + def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (unpckl VR128:$src1, - (bc_v8i16 (memopv2i64 addr:$src2))))]>; - def PUNPCKLDQrr : PDI<0x62, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckldq\t{$src2, $dst|$dst, $src2}", + (v2i64 (unpckl VR128:$src1, + (memopv2i64 addr:$src2))))]>, VEX_4V; + + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, unpckh, bc_v16i8, + 0>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, unpckh, bc_v8i16, + 0>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, unpckh, bc_v4i32, + 0>, VEX_4V; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. + def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v4i32 (unpckl VR128:$src1, VR128:$src2)))]>; - def PUNPCKLDQrm : PDI<0x62, MRMSrcMem, + (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>, VEX_4V; + def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckldq\t{$src2, $dst|$dst, $src2}", + "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (unpckl VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2))))]>; + (v2i64 (unpckh VR128:$src1, + (memopv2i64 addr:$src2))))]>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, unpckl, bc_v16i8>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, unpckl, bc_v8i16>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, unpckl, bc_v4i32>; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. 
def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "punpcklqdq\t{$src2, $dst|$dst, $src2}", @@ -2338,39 +2541,12 @@ let Constraints = "$src1 = $dst" in { (v2i64 (unpckl VR128:$src1, (memopv2i64 addr:$src2))))]>; - def PUNPCKHBWrr : PDI<0x68, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckhbw\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v16i8 (unpckh VR128:$src1, VR128:$src2)))]>; - def PUNPCKHBWrm : PDI<0x68, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckhbw\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckh VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2))))]>; - def PUNPCKHWDrr : PDI<0x69, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckhwd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v8i16 (unpckh VR128:$src1, VR128:$src2)))]>; - def PUNPCKHWDrm : PDI<0x69, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckhwd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckh VR128:$src1, - (bc_v8i16 (memopv2i64 addr:$src2))))]>; - def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckhdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v4i32 (unpckh VR128:$src1, VR128:$src2)))]>; - def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckhdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (unpckh VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2))))]>; + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, unpckh, bc_v16i8>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, unpckh, bc_v8i16>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, unpckh, bc_v4i32>; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. 
def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "punpckhqdq\t{$src2, $dst|$dst, $src2}",
@@ -2384,102 +2560,117 @@ let Constraints = "$src1 = $dst" in {
                         (memopv2i64 addr:$src2))))]>;
}

-// Extract / Insert
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+  def rri : Ii8<0xC4, MRMSrcReg,
+       (outs VR128:$dst), (ins VR128:$src1,
+        GR32:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+  def rmi : Ii8<0xC4, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+                    imm:$src3))]>;
+}
+
+// Extract
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                                imm:$src2))]>, OpSize, VEX;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                imm:$src2))]>;
-let Constraints = "$src1 = $dst" in {
-  def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
-                       (outs VR128:$dst), (ins VR128:$src1,
-                        GR32:$src2, i32i8imm:$src3),
-                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                       [(set VR128:$dst,
-                         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
-  def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
-                       (outs VR128:$dst), (ins VR128:$src1,
-                        i16mem:$src2, i32i8imm:$src3),
-                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                       [(set VR128:$dst,
-                         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
-                                    imm:$src3))]>;
-}
-// Mask creation
-def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
-                     [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+// Insert
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
+  defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V;
-// Conditional store
-let Uses = [EDI] in
-def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
-                     "maskmovdqu\t{$mask, $src|$src, $mask}",
-                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
-
-let Uses = [RDI] in
-def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
-                     "maskmovdqu\t{$mask, $src|$src, $mask}",
-                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+let Constraints = "$src1 = $dst" in
+  defm PINSRW : sse2_pinsrw, TB, OpSize;
} // ExeDomain = SSEPackedInt

-// Non-temporal stores
-def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                        "movntpd\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
-let ExeDomain = SSEPackedInt in
-def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                        "movntdq\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
-def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins 
i32mem:$dst, GR32:$src), - "movnti\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, - TB, Requires<[HasSSE2]>; +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// -let AddedComplexity = 400 in { // Prefer non-temporal versions -def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; +let ExeDomain = SSEPackedInt in { -let ExeDomain = SSEPackedInt in -def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; -} +let isAsmParserOnly = 1 in +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; -// Flush cache -def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), - "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, - TB, Requires<[HasSSE2]>; +} // ExeDomain = SSEPackedInt -// Load, store, and memory fence -def LFENCE : I<0xAE, MRM_E8, (outs), (ins), - "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>; -def MFENCE : I<0xAE, MRM_F0, (outs), (ins), - "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// -// Pause. This "instruction" is encoded as "rep; nop", so even though it -// was introduced with SSE2, it's backward compatible. -def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP; +let ExeDomain = SSEPackedInt in { -//TODO: custom lower this so as to never even generate the noop -def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), - (i8 0)), (NOOP)>; -def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; -def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; -def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), - (i8 1)), (MFENCE)>; +let isAsmParserOnly = 1 in { +let Uses = [EDI] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, VEX; +let Uses = [RDI] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; +} -// Alias instructions that map zero vector to pxor / xorp* for sse. -// We set canFoldAsLoad because this can be converted to a constant-pool -// load of an all-ones value if folding it would be beneficial. -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt in - // FIXME: Change encoding to pseudo. 
- def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>; +let Uses = [EDI] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; +let Uses = [RDI] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +let isAsmParserOnly = 1 in { +def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>, VEX; +def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + VEX; +} def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2489,6 +2680,18 @@ def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>; + +// Move Int Doubleword to Single Scalar +let isAsmParserOnly = 1 in { +def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; + +def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, + VEX; +} def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>; @@ -2497,20 +2700,18 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; -// SSE2 instructions with XS prefix -def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, - Requires<[HasSSE2]>; -def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))), addr:$dst)]>; - -def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), - (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; - +// Move Packed Doubleword Int to Packed Double Int +let isAsmParserOnly = 1 in { +def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))]>, VEX; +def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +} def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2520,6 +2721,15 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, 
VR128:$src), [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)]>; +// Move Scalar Single to Double Int +let isAsmParserOnly = 1 in { +def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; +def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; +} def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>; @@ -2527,25 +2737,38 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; -// Store / copy lower 64-bits of a XMM register. -def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "movq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; - // movd / movq to XMM register zero-extends +let AddedComplexity = 15, isAsmParserOnly = 1 in { +def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector GR32:$src)))))]>, + VEX; +def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only + [(set VR128:$dst, (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector GR64:$src)))))]>, + VEX, VEX_W; +} let AddedComplexity = 15 in { def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))))]>; -// This is X86-64 only. 
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))))]>; } let AddedComplexity = 20 in { +let isAsmParserOnly = 1 in +def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86vzmovl (v4i32 (scalar_to_vector + (loadi32 addr:$src))))))]>, + VEX; def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2558,13 +2781,63 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (MOVZDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), (MOVZDI2PDIrm addr:$src)>; +} +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +// Move Quadword Int to Packed Quadword Int +let isAsmParserOnly = 1 in +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[HasAVX]>; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix + +// Move Packed Quadword Int to Quadword Int +let isAsmParserOnly = 1 in +def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + +// Store / copy lower 64-bits of a XMM register. +let isAsmParserOnly = 1 in +def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; +def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; + +let AddedComplexity = 20, isAsmParserOnly = 1 in +def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))]>, + XS, VEX, Requires<[HasAVX]>; + +let AddedComplexity = 20 in { def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector - (loadi64 addr:$src))))))]>, XS, - Requires<[HasSSE2]>; + (loadi64 addr:$src))))))]>, + XS, Requires<[HasSSE2]>; def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZQI2PQIrm addr:$src)>; @@ -2575,12 +2848,23 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
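// For example, with xmm2 = <a, b>, "movq xmm1, xmm2" leaves xmm1 = <a, 0>;
// this implicit zero-extension is what the X86vzmovl patterns below rely on.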
+let isAsmParserOnly = 1, AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+                      XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      XS, Requires<[HasSSE2]>;
+let AddedComplexity = 20, isAsmParserOnly = 1 in
+def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))]>,
+                      XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
@@ -2592,49 +2876,136 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
                                             (MOVZPQILo2PQIrm addr:$src)>;
}

+// Instructions to match in the assembler
+let isAsmParserOnly = 1 in {
+// These instructions are in fact aliases of movd with 64-bit operands
+def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                      "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+}
+
// Instructions for the disassembler
// xr = XMM register
// xm = mem64
+let isAsmParserOnly = 1 in
+def VMOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "movq\t{$src, $dst|$dst, $src}", []>, XS;

//===---------------------------------------------------------------------===//
-// SSE3 Instructions
+// SSE2 - Misc Instructions
//===---------------------------------------------------------------------===//

-// Move Instructions
-def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                      "movshdup\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (movshdup
-                                                VR128:$src, (undef))))]>;
-def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                      "movshdup\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (movshdup
-                                         (memopv4f32 addr:$src), (undef)))]>;
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+              TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
+
+//TODO: custom lower this so as to never even generate the noop
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
+           (i8 0)), (NOOP)>;
+def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
+           (i8 1)), (MFENCE)>;
+
+// Alias instruction that maps an all-ones vector to pcmpeqd.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial. 
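+// (The 0x76 encoding below is pcmpeqd with the destination compared against
+// itself; every lane trivially compares equal, so all result bits are set.)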
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in + // FIXME: Change encoding to pseudo. + def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; -def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movsldup\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v4f32 (movsldup +//===---------------------------------------------------------------------===// +// SSE3 - Conversion Instructions +//===---------------------------------------------------------------------===// + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; +def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; +def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; +def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; + +//===---------------------------------------------------------------------===// +// SSE3 - Move Instructions +//===---------------------------------------------------------------------===// + +// Replicate Single FP +multiclass sse3_replicate_sfp<bits<8> op, PatFrag rep_frag, string OpcodeStr> { +def rr : S3SI<op, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (rep_frag VR128:$src, (undef))))]>; -def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movsldup\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (movsldup +def rm : S3SI<op, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (rep_frag (memopv4f32 addr:$src), (undef)))]>; +} -def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movddup\t{$src, $dst|$dst, $src}", - [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>; -def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "movddup\t{$src, $dst|$dst, $src}", +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; +defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; +} +defm MOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "movshdup">; +defm MOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "movsldup">; + +// Replicate Double FP +multiclass sse3_replicate_dfp<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)), (undef))))]>; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VMOVDDUP : 
sse3_replicate_dfp<"vmovddup">, VEX; +defm MOVDDUP : sse3_replicate_dfp<"movddup">; + +// Move Unaligned Integer +let isAsmParserOnly = 1 in + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))), (undef)), (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>; +// Several Move patterns let AddedComplexity = 5 in { def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>; @@ -2646,52 +3017,98 @@ def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)), (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>; } -// Arithmetic -let Constraints = "$src1 = $dst" in { - def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "addsubps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1, - VR128:$src2))]>; - def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "addsubps\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1, - (memop addr:$src2)))]>; - def ADDSUBPDrr : S3I<0xD0, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "addsubpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1, - VR128:$src2))]>; - def ADDSUBPDrm : S3I<0xD0, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - "addsubpd\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1, - (memop addr:$src2)))]>; +// vector_shuffle v1, <undef> <1, 1, 3, 3> +let AddedComplexity = 15 in +def : Pat<(v4i32 (movshdup VR128:$src, (undef))), + (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>; +let AddedComplexity = 20 in +def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))), + (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>; + +// vector_shuffle v1, <undef> <0, 0, 2, 2> +let AddedComplexity = 15 in + def : Pat<(v4i32 (movsldup VR128:$src, (undef))), + (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>; +let AddedComplexity = 20 in + def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))), + (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>; + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src2))]>; + def rm : I<0xD0, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (Int VR128:$src1, + (memop addr:$src2)))]>; + } -def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "lddqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; +let isAsmParserOnly = 1, Predicates = [HasAVX], + ExeDomain = 
SSEPackedDouble in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", 0>, XD, + VEX_4V; + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", 0>, OpSize, + VEX_4V; +} +let Constraints = "$src1 = $dst", Predicates = [HasSSE3], + ExeDomain = SSEPackedDouble in { + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps">, XD; + defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd">, TB, OpSize; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// // Horizontal ops -class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> +class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>; -class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> +class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), + !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>; -class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> +class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>; -class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> +class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + def VHADDPSrr : S3D_Intrr<0x7C, "vhaddps", int_x86_sse3_hadd_ps, 0>, VEX_4V; + def VHADDPSrm : S3D_Intrm<0x7C, "vhaddps", int_x86_sse3_hadd_ps, 0>, VEX_4V; + def VHADDPDrr : S3_Intrr <0x7C, "vhaddpd", int_x86_sse3_hadd_pd, 0>, VEX_4V; + def VHADDPDrm : S3_Intrm <0x7C, "vhaddpd", int_x86_sse3_hadd_pd, 0>, VEX_4V; + def VHSUBPSrr : S3D_Intrr<0x7D, "vhsubps", int_x86_sse3_hsub_ps, 0>, VEX_4V; + def VHSUBPSrm : S3D_Intrm<0x7D, "vhsubps", int_x86_sse3_hsub_ps, 0>, VEX_4V; + def VHSUBPDrr : S3_Intrr <0x7D, "vhsubpd", int_x86_sse3_hsub_pd, 0>, VEX_4V; + def VHSUBPDrm : S3_Intrm <0x7D, "vhsubpd", int_x86_sse3_hsub_pd, 0>, VEX_4V; +} + let Constraints = "$src1 = $dst" in { def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>; def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>; @@ -2703,35 +3120,14 @@ let Constraints = "$src1 = $dst" in { def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; } -// Thread synchronization -def MONITOR : I<0x01, MRM_C8, (outs), (ins), "monitor", - [(int_x86_sse3_monitor EAX, 
ECX, EDX)]>,TB, Requires<[HasSSE3]>; -def MWAIT : I<0x01, MRM_C9, (outs), (ins), "mwait", - [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; - -// vector_shuffle v1, <undef> <1, 1, 3, 3> -let AddedComplexity = 15 in -def : Pat<(v4i32 (movshdup VR128:$src, (undef))), - (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>; -let AddedComplexity = 20 in -def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))), - (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>; - -// vector_shuffle v1, <undef> <0, 0, 2, 2> -let AddedComplexity = 15 in - def : Pat<(v4i32 (movsldup VR128:$src, (undef))), - (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>; -let AddedComplexity = 20 in - def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))), - (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>; - //===---------------------------------------------------------------------===// -// SSSE3 Instructions +// SSSE3 - Packed Absolute Instructions //===---------------------------------------------------------------------===// -/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8. -multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128> { +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, + PatFrag mem_frag64, PatFrag mem_frag128, + Intrinsic IntId64, Intrinsic IntId128> { def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 VR64:$src))]>; @@ -2739,7 +3135,7 @@ multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr, def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, - (IntId64 (bitconvert (memopv8i8 addr:$src))))]>; + (IntId64 (bitconvert (mem_frag64 addr:$src))))]>; def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -2752,240 +3148,203 @@ multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv16i8 addr:$src))))]>, OpSize; + (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16. 
-multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, (IntId64 VR64:$src))]>; - - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins i64mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, - (IntId64 - (bitconvert (memopv4i16 addr:$src))))]>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv8i8, memopv16i8, + int_x86_ssse3_pabs_b, + int_x86_ssse3_pabs_b_128>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv4i16, memopv8i16, + int_x86_ssse3_pabs_w, + int_x86_ssse3_pabs_w_128>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv2i32, memopv4i32, + int_x86_ssse3_pabs_d, + int_x86_ssse3_pabs_d_128>, VEX; +} - def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, - OpSize; +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv8i8, memopv16i8, + int_x86_ssse3_pabs_b, + int_x86_ssse3_pabs_b_128>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv4i16, memopv8i16, + int_x86_ssse3_pabs_w, + int_x86_ssse3_pabs_w_128>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv2i32, memopv4i32, + int_x86_ssse3_pabs_d, + int_x86_ssse3_pabs_d_128>; - def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId128 - (bitconvert (memopv8i16 addr:$src))))]>, OpSize; -} +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// -/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32. -multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128> { +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
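+/// With Is2Addr = 1 the asm string keeps the tied two-address form, e.g.
+///   phaddw {$src2, $dst|$dst, $src2}
+/// while Is2Addr = 0, used by the AVX forms below, spells out all three
+/// operands:
+///   vphaddw {$src2, $src1, $dst|$dst, $src1, $src2}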
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + PatFrag mem_frag64, PatFrag mem_frag128, + Intrinsic IntId64, Intrinsic IntId128, + bit Is2Addr = 1> { + let isCommutable = 1 in def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, (IntId64 VR64:$src))]>; - + (ins VR64:$src1, VR64:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins i64mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, - (IntId64 - (bitconvert (memopv2i32 addr:$src))))]>; + (ins VR64:$src1, i64mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopv8i8 addr:$src2))))]>; + let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, - OpSize; - + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId128 - (bitconvert (memopv4i32 addr:$src))))]>, OpSize; + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb", - int_x86_ssse3_pabs_b, - int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw", - int_x86_ssse3_pabs_w, - int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd", - int_x86_ssse3_pabs_d, - int_x86_ssse3_pabs_d_128>; - -/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8. 
-let Constraints = "$src1 = $dst" in { - multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128, - bit Commutable = 0> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> { - let isCommutable = Commutable; - } - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, - (IntId64 VR64:$src1, - (bitconvert (memopv8i8 addr:$src2))))]>; - - def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; - } +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv4i16, memopv8i16, + int_x86_ssse3_phadd_w, + int_x86_ssse3_phadd_w_128, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv2i32, memopv4i32, + int_x86_ssse3_phadd_d, + int_x86_ssse3_phadd_d_128, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv4i16, memopv8i16, + int_x86_ssse3_phadd_sw, + int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv4i16, memopv8i16, + int_x86_ssse3_phsub_w, + int_x86_ssse3_phsub_w_128, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv2i32, memopv4i32, + int_x86_ssse3_phsub_d, + int_x86_ssse3_phsub_d_128, 0>, VEX_4V; + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv4i16, memopv8i16, + int_x86_ssse3_phsub_sw, + int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv8i8, memopv16i8, + int_x86_ssse3_pmadd_ub_sw, + int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv8i8, memopv16i8, + int_x86_ssse3_pshuf_b, + int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv8i8, memopv16i8, + int_x86_ssse3_psign_b, + int_x86_ssse3_psign_b_128, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv4i16, memopv8i16, + int_x86_ssse3_psign_w, + int_x86_ssse3_psign_w_128, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv2i32, memopv4i32, + int_x86_ssse3_psign_d, + int_x86_ssse3_psign_d_128, 0>, VEX_4V; } - -/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16. 
-let Constraints = "$src1 = $dst" in { - multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128, - bit Commutable = 0> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> { - let isCommutable = Commutable; - } - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, - (IntId64 VR64:$src1, - (bitconvert (memopv4i16 addr:$src2))))]>; - - def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv8i16 addr:$src2))))]>, OpSize; - } +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv4i16, memopv8i16, + int_x86_ssse3_pmul_hr_sw, + int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; } -/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32. -let Constraints = "$src1 = $dst" in { - multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, Intrinsic IntId128, - bit Commutable = 0> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> { - let isCommutable = Commutable; - } - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, - (IntId64 VR64:$src1, - (bitconvert (memopv2i32 addr:$src2))))]>; - - def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv4i32 addr:$src2))))]>, OpSize; - } +// None of these have i8 immediate fields. 
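+// (ImmT = NoImm below overrides the format's immediate tag so the encoder
+// treats these opcodes as carrying no trailing immediate byte.)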
+let ImmT = NoImm, Constraints = "$src1 = $dst" in { +let isCommutable = 0 in { + defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv4i16, memopv8i16, + int_x86_ssse3_phadd_w, + int_x86_ssse3_phadd_w_128>; + defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv2i32, memopv4i32, + int_x86_ssse3_phadd_d, + int_x86_ssse3_phadd_d_128>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv4i16, memopv8i16, + int_x86_ssse3_phadd_sw, + int_x86_ssse3_phadd_sw_128>; + defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv4i16, memopv8i16, + int_x86_ssse3_phsub_w, + int_x86_ssse3_phsub_w_128>; + defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv2i32, memopv4i32, + int_x86_ssse3_phsub_d, + int_x86_ssse3_phsub_d_128>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv4i16, memopv8i16, + int_x86_ssse3_phsub_sw, + int_x86_ssse3_phsub_sw_128>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv8i8, memopv16i8, + int_x86_ssse3_pmadd_ub_sw, + int_x86_ssse3_pmadd_ub_sw_128>; + defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv8i8, memopv16i8, + int_x86_ssse3_pshuf_b, + int_x86_ssse3_pshuf_b_128>; + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv8i8, memopv16i8, + int_x86_ssse3_psign_b, + int_x86_ssse3_psign_b_128>; + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv4i16, memopv8i16, + int_x86_ssse3_psign_w, + int_x86_ssse3_psign_w_128>; + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv2i32, memopv4i32, + int_x86_ssse3_psign_d, + int_x86_ssse3_psign_d_128>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv4i16, memopv8i16, + int_x86_ssse3_pmul_hr_sw, + int_x86_ssse3_pmul_hr_sw_128>; } -let ImmT = NoImm in { // None of these have i8 immediate fields. -defm PHADDW : SS3I_binop_rm_int_16<0x01, "phaddw", - int_x86_ssse3_phadd_w, - int_x86_ssse3_phadd_w_128>; -defm PHADDD : SS3I_binop_rm_int_32<0x02, "phaddd", - int_x86_ssse3_phadd_d, - int_x86_ssse3_phadd_d_128>; -defm PHADDSW : SS3I_binop_rm_int_16<0x03, "phaddsw", - int_x86_ssse3_phadd_sw, - int_x86_ssse3_phadd_sw_128>; -defm PHSUBW : SS3I_binop_rm_int_16<0x05, "phsubw", - int_x86_ssse3_phsub_w, - int_x86_ssse3_phsub_w_128>; -defm PHSUBD : SS3I_binop_rm_int_32<0x06, "phsubd", - int_x86_ssse3_phsub_d, - int_x86_ssse3_phsub_d_128>; -defm PHSUBSW : SS3I_binop_rm_int_16<0x07, "phsubsw", - int_x86_ssse3_phsub_sw, - int_x86_ssse3_phsub_sw_128>; -defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw, - int_x86_ssse3_pmadd_ub_sw_128>; -defm PMULHRSW : SS3I_binop_rm_int_16<0x0B, "pmulhrsw", - int_x86_ssse3_pmul_hr_sw, - int_x86_ssse3_pmul_hr_sw_128, 1>; - -defm PSHUFB : SS3I_binop_rm_int_8 <0x00, "pshufb", - int_x86_ssse3_pshuf_b, - int_x86_ssse3_pshuf_b_128>; -defm PSIGNB : SS3I_binop_rm_int_8 <0x08, "psignb", - int_x86_ssse3_psign_b, - int_x86_ssse3_psign_b_128>; -defm PSIGNW : SS3I_binop_rm_int_16<0x09, "psignw", - int_x86_ssse3_psign_w, - int_x86_ssse3_psign_w_128>; -defm PSIGND : SS3I_binop_rm_int_32<0x0A, "psignd", - int_x86_ssse3_psign_d, - int_x86_ssse3_psign_d_128>; -} - -// palignr patterns. 
-let Constraints = "$src1 = $dst" in { - def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2, i8imm:$src3), - "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; - def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2, i8imm:$src3), - "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; - - def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, OpSize; - def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, OpSize; +def : Pat<(X86pshufb VR128:$src, VR128:$mask), + (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>; +def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), + (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>; + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Align Instruction Patterns +//===---------------------------------------------------------------------===// + +multiclass sse3_palign<string asm, bit Is2Addr = 1> { + def R64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>; + def R64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>; + + def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPALIGN : sse3_palign<"vpalignr", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PALIGN : sse3_palign<"palignr">; + let AddedComplexity = 5 in { def : Pat<(v1i64 (palign:$src3 VR64:$src1, VR64:$src2)), @@ -2996,10 +3355,6 @@ def : Pat<(v2i32 (palign:$src3 VR64:$src1, VR64:$src2)), (PALIGNR64rr VR64:$src2, VR64:$src1, (SHUFFLE_get_palign_imm VR64:$src3))>, Requires<[HasSSSE3]>; -def : Pat<(v2f32 (palign:$src3 VR64:$src1, VR64:$src2)), - (PALIGNR64rr VR64:$src2, VR64:$src1, - (SHUFFLE_get_palign_imm VR64:$src3))>, - Requires<[HasSSSE3]>; def : Pat<(v4i16 (palign:$src3 VR64:$src1, VR64:$src2)), (PALIGNR64rr VR64:$src2, VR64:$src1, (SHUFFLE_get_palign_imm VR64:$src3))>, @@ -3027,10 +3382,15 @@ def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)), Requires<[HasSSSE3]>; } -def : Pat<(X86pshufb VR128:$src, VR128:$mask), - (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>; -def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), - (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>; +//===---------------------------------------------------------------------===// +// SSSE3 Misc Instructions 
+//===---------------------------------------------------------------------===// + +// Thread synchronization +def MONITOR : I<0x01, MRM_C8, (outs), (ins), "monitor", + [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>; +def MWAIT : I<0x01, MRM_C9, (outs), (ins), "mwait", + [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; //===---------------------------------------------------------------------===// // Non-Instruction Patterns @@ -3311,287 +3671,9 @@ def : Pat<(store (v16i8 VR128:$src), addr:$dst), (MOVUPSmr addr:$dst, VR128:$src)>; //===----------------------------------------------------------------------===// -// SSE4.1 Instructions +// SSE4.1 - Packed Move with Sign/Zero Extend //===----------------------------------------------------------------------===// -multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, - string OpcodeStr, - Intrinsic V4F32Int, - Intrinsic V2F64Int> { - // Intrinsic operation, reg. - // Vector intrinsic operation, reg - def PSr_Int : SS4AIi8<opcps, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>, - OpSize; - - // Vector intrinsic operation, mem - def PSm_Int : Ii8<opcps, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, - (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>, - TA, OpSize, - Requires<[HasSSE41]>; - - // Vector intrinsic operation, reg - def PDr_Int : SS4AIi8<opcpd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>, - OpSize; - - // Vector intrinsic operation, mem - def PDm_Int : SS4AIi8<opcpd, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, - (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>, - OpSize; -} - -let Constraints = "$src1 = $dst" in { -multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int> { - // Intrinsic operation, reg. - def SSr_Int : SS4AIi8<opcss, MRMSrcReg, - (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, - OpSize; - - // Intrinsic operation, mem. - def SSm_Int : SS4AIi8<opcss, MRMSrcMem, - (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, - OpSize; - - // Intrinsic operation, reg. - def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, - (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, - OpSize; - - // Intrinsic operation, mem. 
- def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, - (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, - OpSize; -} -} - -// FP round - roundss, roundps, roundsd, roundpd -defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", - int_x86_sse41_round_ps, int_x86_sse41_round_pd>; -defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", - int_x86_sse41_round_ss, int_x86_sse41_round_sd>; - -// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. -multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128> { - def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize; - def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId128 - (bitconvert (memopv8i16 addr:$src))))]>, OpSize; -} - -defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw>; - -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator -let Constraints = "$src1 = $dst" in { - multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Commutable = 0> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; - } -} - -defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq", - int_x86_sse41_pcmpeqq, 1>; -defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", - int_x86_sse41_packusdw, 0>; -defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", - int_x86_sse41_pminsb, 1>; -defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", - int_x86_sse41_pminsd, 1>; -defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", - int_x86_sse41_pminud, 1>; -defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw", - int_x86_sse41_pminuw, 1>; -defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb", - int_x86_sse41_pmaxsb, 1>; -defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd", - int_x86_sse41_pmaxsd, 1>; -defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", - int_x86_sse41_pmaxud, 1>; -defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw", - int_x86_sse41_pmaxuw, 1>; - -defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>; - -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), - (PCMPEQQrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), - (PCMPEQQrm VR128:$src1, addr:$src2)>; - -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator -let Constraints = "$src1 = $dst" in { - multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT, - SDNode OpNode, Intrinsic IntId128, - bit Commutable = 0> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode (OpVT VR128:$src1), - VR128:$src2))]>, OpSize { - let isCommutable = Commutable; - } - def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, 
VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize; - def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, (memop addr:$src2)))]>, - OpSize; - } -} - -/// SS48I_binop_rm - Simple SSE41 binary operator. -let Constraints = "$src1 = $dst" in { -multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, bit Commutable = 0> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, - OpSize { - let isCommutable = Commutable; - } - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (OpNode VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2))))]>, - OpSize; -} -} - -defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>; - -/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate -let Constraints = "$src1 = $dst" in { - multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Commutable = 0> { - def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>, - OpSize { - let isCommutable = Commutable; - } - def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>, - OpSize; - } -} - -defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", - int_x86_sse41_blendps, 0>; -defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", - int_x86_sse41_blendpd, 0>; -defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", - int_x86_sse41_pblendw, 0>; -defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", - int_x86_sse41_dpps, 1>; -defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", - int_x86_sse41_dppd, 1>; -defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", - int_x86_sse41_mpsadbw, 0>; - - -/// SS41I_ternary_int - SSE 4.1 ternary operator -let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { - def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, - OpSize; - - def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), - [(set VR128:$dst, - (IntId VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; - } -} - -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; -defm 
PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; - - multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -3604,6 +3686,21 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, + VEX; +defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, + VEX; +defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>, + VEX; +defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>, + VEX; +defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>, + VEX; +defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, + VEX; +} + defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>; defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>; @@ -3655,6 +3752,17 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, + VEX; +defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, + VEX; +defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>, + VEX; +defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, + VEX; +} + defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; @@ -3685,6 +3793,12 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, + VEX; +defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, + VEX; +} defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; @@ -3699,6 +3813,9 @@ def : Pat<(int_x86_sse41_pmovzxbq (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>; +//===----------------------------------------------------------------------===// +// SSE4.1 - Extract Instructions +//===----------------------------------------------------------------------===// /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { @@ -3718,6 +3835,9 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -3733,6 +3853,9 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPEXTRW : SS41I_extract16<0x15, 
"vpextrw">, VEX; + defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -3752,8 +3875,31 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; + defm PEXTRD : SS41I_extract32<0x16, "pextrd">; +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR64:$dst, + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v2i64 VR128:$src1), imm:$src2), + addr:$dst)]>, OpSize, REX_W; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; + +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory /// destination @@ -3773,6 +3919,8 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; // Also match an EXTRACTPS store when the store is done as f32 instead of i32. @@ -3782,78 +3930,530 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, Requires<[HasSSE41]>; -let Constraints = "$src1 = $dst" in { - multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; - def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), - imm:$src3))]>, OpSize; - } +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, OpSize; } -defm PINSRB : SS41I_insert8<0x20, "pinsrb">; +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" 
in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, OpSize; +} -let Constraints = "$src1 = $dst" in { - multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - OpSize; - def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), - imm:$src3)))]>, OpSize; - } +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, OpSize; } -defm PINSRD : SS41I_insert32<0x22, "pinsrd">; +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; +let Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; // insertps has a few different modes, there's the first two here below which // are optimized inserts that won't zero arbitrary elements in the destination // vector. The next one matches the intrinsic and could zero arbitrary elements // in the target vector. 
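A side note on the insertps modes just described: all of them are selected through the imm8 operand. The following is a minimal standalone C++ sketch, not part of this patch, of how that immediate is assembled per the Intel SDM layout (the helper name is hypothetical): bits [7:6] pick the source element (count_s), bits [5:4] the destination element (count_d), and bits [3:0] form the zero mask (zmask).

#include <cstdint>
#include <cstdio>

// Hypothetical helper (not part of the .td file): build the INSERTPS imm8.
// Bits [7:6] = count_s (source element), bits [5:4] = count_d (destination
// element), bits [3:0] = zmask (destination elements forced to zero).
static uint8_t insertpsImm(unsigned CountS, unsigned CountD, unsigned ZMask) {
  return uint8_t(((CountS & 3) << 6) | ((CountD & 3) << 4) | (ZMask & 0xf));
}

int main() {
  // Take element 2 of the source, write it to element 1, zero element 3.
  std::printf("0x%02x\n", insertpsImm(2, 1, 0x8)); // prints 0x98
}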
-let Constraints = "$src1 = $dst" in { - multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>, +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>, OpSize; - def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, - (X86insrtps VR128:$src1, - (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, OpSize; - } + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insrtps VR128:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, OpSize; } -defm INSERTPS : SS41I_insertf32<0x21, "insertps">; +let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps">; +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>; +//===----------------------------------------------------------------------===// +// SSE4.1 - Round Instructions +//===----------------------------------------------------------------------===// + +multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, + string OpcodeStr, + Intrinsic V4F32Int, + Intrinsic V2F64Int> { + // Intrinsic operation, reg. + // Vector intrinsic operation, reg + def PSr_Int : SS4AIi8<opcps, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>, + OpSize; + + // Vector intrinsic operation, mem + def PSm_Int : Ii8<opcps, MRMSrcMem, + (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>, + TA, OpSize, + Requires<[HasSSE41]>; + + // Vector intrinsic operation, reg + def PDr_Int : SS4AIi8<opcpd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>, + OpSize; + + // Vector intrinsic operation, mem + def PDm_Int : SS4AIi8<opcpd, MRMSrcMem, + (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>, + OpSize; +} + +multiclass sse41_fp_unop_rm_avx<bits<8> opcps, bits<8> opcpd, + string OpcodeStr> { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr : SS4AIi8<opcps, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; + + // Vector intrinsic operation, mem + def PSm : Ii8<opcps, MRMSrcMem, + (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TA, OpSize, Requires<[HasSSE41]>; + + // Vector intrinsic operation, reg + def PDr : SS4AIi8<opcpd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; + + // Vector intrinsic operation, mem + def PDm : SS4AIi8<opcpd, MRMSrcMem, + (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +} + +multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { + // Intrinsic operation, reg. + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SSm_Int : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, reg. + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + OpSize; +} + +multiclass sse41_fp_binop_rm_avx<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { + // Intrinsic operation, reg. + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, mem. + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, mem. 
+ def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; +} + +// FP round - roundss, roundps, roundsd, roundpd +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + // Intrinsic form + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", + int_x86_sse41_round_ps, int_x86_sse41_round_pd>, + VEX; + defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, int_x86_sse41_round_sd, + 0>, VEX_4V; + // Instructions for the assembler + defm VROUND : sse41_fp_unop_rm_avx<0x08, 0x09, "vround">, VEX; + defm VROUND : sse41_fp_binop_rm_avx<0x0A, 0x0B, "vround">, VEX_4V; +} + +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", + int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", + int_x86_sse41_round_ss, int_x86_sse41_round_sd>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Misc Instructions +//===----------------------------------------------------------------------===// + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. +multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + Intrinsic IntId128> { + def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize; + def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (memopv8i16 addr:$src))))]>, OpSize; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", + int_x86_sse41_phminposuw>, VEX; +defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", + int_x86_sse41_phminposuw>; + +/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, + 0>, VEX_4V; + defm VPCMPEQQ : SS41I_binop_rm_int<0x29, "vpcmpeqq", int_x86_sse41_pcmpeqq, + 0>, VEX_4V; + defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb, + 0>, VEX_4V; + defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd, + 0>, VEX_4V; + defm VPMINUD : SS41I_binop_rm_int<0x3B, "vpminud", int_x86_sse41_pminud, + 0>, VEX_4V; + defm VPMINUW : SS41I_binop_rm_int<0x3A, "vpminuw", int_x86_sse41_pminuw, + 0>, VEX_4V; + defm VPMAXSB : SS41I_binop_rm_int<0x3C, "vpmaxsb", int_x86_sse41_pmaxsb, + 0>, VEX_4V; + defm VPMAXSD : 
SS41I_binop_rm_int<0x3D, "vpmaxsd", int_x86_sse41_pmaxsd, + 0>, VEX_4V; + defm VPMAXUD : SS41I_binop_rm_int<0x3F, "vpmaxud", int_x86_sse41_pmaxud, + 0>, VEX_4V; + defm VPMAXUW : SS41I_binop_rm_int<0x3E, "vpmaxuw", int_x86_sse41_pmaxuw, + 0>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, + 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in + defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; + defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq", int_x86_sse41_pcmpeqq>; + defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>; + defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>; + defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>; + defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw", int_x86_sse41_pminuw>; + defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb", int_x86_sse41_pmaxsb>; + defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd", int_x86_sse41_pmaxsd>; + defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", int_x86_sse41_pmaxud>; + defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw>; + defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; +} + +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), + (PCMPEQQrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), + (PCMPEQQrm VR128:$src1, addr:$src2)>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, + OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2))))]>, + OpSize; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>, + OpSize; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + let isCommutable = 0 in { + defm VBLENDPS : 
SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+                                      0>, VEX_4V;
+  defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+                                      0>, VEX_4V;
+  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
+                                      0>, VEX_4V;
+  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+                                      0>, VEX_4V;
+  }
+  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+                                   0>, VEX_4V;
+  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+                                   0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in {
+  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps>;
+  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd>;
+  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw>;
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw>;
+  }
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps>;
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+  multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr> {
+    def rr : I<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+
+    def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+  }
+}
+
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd">;
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps">;
+defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb">;
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+                    OpSize;
+
+    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                    [(set VR128:$dst,
+                      (IntId VR128:$src1,
+                       (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
+  }
+}
+
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, + OpSize, VEX; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, + OpSize, VEX; +} + let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "ptest \t{$src2, $src1|$src1, $src2}", @@ -3865,43 +4465,207 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), OpSize; } +let isAsmParserOnly = 1, Predicates = [HasAVX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + OpSize, VEX; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, OpSize; - //===----------------------------------------------------------------------===// -// SSE4.2 Instructions +// SSE4.2 - Compare Instructions //===----------------------------------------------------------------------===// /// SS42I_binop_rm_int - Simple SSE 4.2 binary operator -let Constraints = "$src1 = $dst" in { - multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Commutable = 0> { - def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; - } +multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; +let isAsmParserOnly = 1, Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, + 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), (PCMPGTQrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), (PCMPGTQrm VR128:$src1, addr:$src2)>; +//===----------------------------------------------------------------------===// +// SSE4.2 - String/text Processing Instructions +//===----------------------------------------------------------------------===// + +// 
Packed Compare Implicit Length Strings, Return Mask +let Defs = [EFLAGS], usesCustomInserter = 1 in { + def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "#PCMPISTRM128rr PSEUDO!", + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, + imm:$src3))]>, OpSize; + def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "#PCMPISTRM128rm PSEUDO!", + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 + VR128:$src1, (load addr:$src2), imm:$src3))]>, OpSize; +} + +let Defs = [XMM0, EFLAGS], isAsmParserOnly = 1, + Predicates = [HasAVX] in { + def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; + def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; +} + +let Defs = [XMM0, EFLAGS] in { + def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; + def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; +} + +// Packed Compare Explicit Length Strings, Return Mask +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + "#PCMPESTRM128rr PSEUDO!", + [(set VR128:$dst, + (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize; + + def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + "#PCMPESTRM128rm PSEUDO!", + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>, + OpSize; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX], + Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { + def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; + def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { + def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; + def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; +} + +// Packed Compare Implicit Length Strings, Return Index +let Defs = [ECX, EFLAGS] in { + multiclass SS42AI_pcmpistri<Intrinsic IntId128, string asm = "pcmpistri"> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)), + (implicit EFLAGS)]>, OpSize; + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), 
imm:$src3)), + (implicit EFLAGS)]>, OpSize; + } +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, + VEX; +defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, + VEX; +defm VPCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128, "vpcmpistri">, + VEX; +defm VPCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128, "vpcmpistri">, + VEX; +defm VPCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128, "vpcmpistri">, + VEX; +defm VPCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128, "vpcmpistri">, + VEX; +} + +defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>; +defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>; +defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>; +defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>; +defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>; +defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>; + +// Packed Compare Explicit Length Strings, Return Index +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { + multiclass SS42AI_pcmpestri<Intrinsic IntId128, string asm = "pcmpestri"> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)), + (implicit EFLAGS)]>, OpSize; + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + [(set ECX, + (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)), + (implicit EFLAGS)]>, OpSize; + } +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, + VEX; +defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, + VEX; +defm VPCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128, "vpcmpestri">, + VEX; +defm VPCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128, "vpcmpestri">, + VEX; +defm VPCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128, "vpcmpestri">, + VEX; +defm VPCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128, "vpcmpestri">, + VEX; +} + +defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>; +defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>; +defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>; +defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>; +defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>; +defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + // crc intrinsic instruction // This set of instructions are only rm, the only difference is the size // of r and m. @@ -3969,133 +4733,52 @@ let Constraints = "$src1 = $dst" in { REX_W; } -// String/text processing instructions. 
-let Defs = [EFLAGS], usesCustomInserter = 1 in { -def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "#PCMPISTRM128rr PSEUDO!", - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, - imm:$src3))]>, OpSize; -def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "#PCMPISTRM128rm PSEUDO!", - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2), - imm:$src3))]>, OpSize; -} - -let Defs = [XMM0, EFLAGS] in { -def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; -def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; -} +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { -def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "#PCMPESTRM128rr PSEUDO!", - [(set VR128:$dst, - (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize; - -def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "#PCMPESTRM128rm PSEUDO!", - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>, - OpSize; +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { -def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; -def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; +// Perform One Round of an AES Encryption/Decryption Flow +let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, 0>, VEX_4V; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, 0>, VEX_4V; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec, 0>, VEX_4V; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, 0>, VEX_4V; } -let Defs = [ECX, EFLAGS] in { - multiclass SS42AI_pcmpistri<Intrinsic IntId128> { - def rr : SS42AI<0x63, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}", - [(set ECX, (IntId128 
VR128:$src1, VR128:$src2, imm:$src3)), - (implicit EFLAGS)]>, OpSize; - def rm : SS42AI<0x63, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}", - [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)), - (implicit EFLAGS)]>, OpSize; - } -} - -defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>; -defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>; -defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>; -defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>; -defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>; -defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>; - -let Defs = [ECX, EFLAGS] in { -let Uses = [EAX, EDX] in { - multiclass SS42AI_pcmpestri<Intrinsic IntId128> { - def rr : SS42AI<0x61, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}", - [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)), - (implicit EFLAGS)]>, OpSize; - def rm : SS42AI<0x61, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}", - [(set ECX, - (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)), - (implicit EFLAGS)]>, OpSize; - } -} -} - -defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>; -defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>; -defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>; -defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>; -defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>; -defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; - -//===----------------------------------------------------------------------===// -// AES-NI Instructions -//===----------------------------------------------------------------------===// - let Constraints = "$src1 = $dst" in { - multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Commutable = 0> { - def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize { - let isCommutable = Commutable; - } - def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; - } + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; } -defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc>; -defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast>; -defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec>; -defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast>; - def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), (AESENCrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), @@ -4113,13 +4796,27 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), def : Pat<(v2i64 (int_x86_aesni_aesdeclast 
VR128:$src1, (memop addr:$src2))), (AESDECLASTrm VR128:$src1, addr:$src2)>; +// Perform the AES InvMixColumn Transformation +let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, + OpSize, VEX; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, + OpSize, VEX; +} def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>, OpSize; - def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", @@ -4127,6 +4824,22 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, OpSize; +// AES Round Key Generation Assist +let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize, VEX; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize, VEX; +} def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index a9681e6..633ddd4 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -30,7 +30,7 @@ class X86MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; bool Is64BitMode; public: - X86MCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit) + X86MCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit) : TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx) { Is64BitMode = is64Bit; } @@ -38,17 +38,18 @@ public: ~X86MCCodeEmitter() {} unsigned getNumFixupKinds() const { - return 4; + return 5; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo Infos[] = { { "reloc_pcrel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, { "reloc_pcrel_1byte", 0, 1 * 8, MCFixupKindInfo::FKF_IsPCRel }, + { "reloc_pcrel_2byte", 0, 2 * 8, MCFixupKindInfo::FKF_IsPCRel }, { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel } }; - + if (Kind < FirstTargetFixupKind) return MCCodeEmitter::getFixupKindInfo(Kind); @@ -56,16 +57,38 @@ public: "Invalid kind!"); return Infos[Kind - FirstTargetFixupKind]; } - + static unsigned GetX86RegNum(const MCOperand &MO) { return X86RegisterInfo::getX86RegNum(MO.getReg()); } - + + // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range + // 0-7 and the difference between the 2 groups is given by the REX prefix. 
+  // In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+  // in 1's complement form, example:
+  //
+  //    ModRM field => XMM9 => 1
+  //    VEX.VVVV    => XMM9 => ~9
+  //
+  // See table 4-35 of Intel AVX Programming Reference for details.
+  static unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                              unsigned OpNum) {
+    unsigned SrcReg = MI.getOperand(OpNum).getReg();
+    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    if ((SrcReg >= X86::XMM8 && SrcReg <= X86::XMM15) ||
+        (SrcReg >= X86::YMM8 && SrcReg <= X86::YMM15))
+      SrcRegNum += 8;
+
+    // The registers represented through VEX_VVVV should
+    // be encoded in 1's complement form.
+    return (~SrcRegNum) & 0xf;
+  }
+
  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
    OS << (char)C;
    ++CurByte;
  }
-
+
  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
                    raw_ostream &OS) const {
    // Output the constant in little endian byte order.
@@ -75,38 +98,49 @@ public:
    }
  }

-  void EmitImmediate(const MCOperand &Disp,
+  void EmitImmediate(const MCOperand &Disp,
                     unsigned ImmSize, MCFixupKind FixupKind,
                     unsigned &CurByte, raw_ostream &OS,
                     SmallVectorImpl<MCFixup> &Fixups,
                     int ImmOffset = 0) const;
-
+
  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
                                        unsigned RM) {
    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
    return RM | (RegOpcode << 3) | (Mod << 6);
  }
-
+
  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
                        unsigned &CurByte, raw_ostream &OS) const {
    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
  }
-
+
  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
                   unsigned &CurByte, raw_ostream &OS) const {
    // SIB byte is in the same format as the ModRMByte.
    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
  }
-
-
+
+
  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
-                       unsigned RegOpcodeField,
-                       unsigned TSFlags, unsigned &CurByte, raw_ostream &OS,
+                       unsigned RegOpcodeField,
+                       uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
                        SmallVectorImpl<MCFixup> &Fixups) const;
-
+
  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                         SmallVectorImpl<MCFixup> &Fixups) const;
-
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const TargetInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                 int MemOperand, const MCInst &MI,
+                                 raw_ostream &OS) const;
+
+  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const TargetInstrDesc &Desc,
+                        raw_ostream &OS) const;
};

} // end anonymous namespace
@@ -124,24 +158,23 @@ MCCodeEmitter *llvm::createX86_64MCCodeEmitter(const Target &,
  return new X86MCCodeEmitter(TM, Ctx, true);
}
-
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
static bool isDisp8(int Value) {
  return Value == (signed char)Value;
}

/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
/// in an instruction with the specified TSFlags.
-static MCFixupKind getImmFixupKind(unsigned TSFlags) {
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
  unsigned Size = X86II::getSizeOfImm(TSFlags);
  bool isPCRel = X86II::isImmPCRel(TSFlags);
-
+
  switch (Size) {
  default: assert(0 && "Unknown immediate size");
  case 1: return isPCRel ?
MCFixupKind(X86::reloc_pcrel_1byte) : FK_Data_1; + case 2: return isPCRel ? MCFixupKind(X86::reloc_pcrel_2byte) : FK_Data_2; case 4: return isPCRel ? MCFixupKind(X86::reloc_pcrel_4byte) : FK_Data_4; - case 2: assert(!isPCRel); return FK_Data_2; case 8: assert(!isPCRel); return FK_Data_8; } } @@ -162,29 +195,30 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, // If we have an immoffset, add it to the expression. const MCExpr *Expr = DispOp.getExpr(); - + // If the fixup is pc-relative, we need to bias the value to be relative to // the start of the field, not the end of the field. if (FixupKind == MCFixupKind(X86::reloc_pcrel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) ImmOffset -= 4; + if (FixupKind == MCFixupKind(X86::reloc_pcrel_2byte)) + ImmOffset -= 2; if (FixupKind == MCFixupKind(X86::reloc_pcrel_1byte)) ImmOffset -= 1; - + if (ImmOffset) Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx), Ctx); - + // Emit a symbolic constant as a fixup and 4 zeros. Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind)); EmitConstant(0, Size, CurByte, OS); } - void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, - unsigned TSFlags, unsigned &CurByte, + uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const{ const MCOperand &Disp = MI.getOperand(Op+3); @@ -192,43 +226,43 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+1); const MCOperand &IndexReg = MI.getOperand(Op+2); unsigned BaseReg = Base.getReg(); - + // Handle %rip relative addressing. if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode - assert(IndexReg.getReg() == 0 && Is64BitMode && - "Invalid rip-relative address"); + assert(Is64BitMode && "Rip-relative addressing requires 64-bit mode"); + assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); - + unsigned FixupKind = X86::reloc_riprel_4byte; - + // movq loads are handled with a special relocation form which allows the // linker to eliminate some loads for GOT references which end up in the // same linkage unit. if (MI.getOpcode() == X86::MOV64rm || MI.getOpcode() == X86::MOV64rm_TC) FixupKind = X86::reloc_riprel_4byte_movq_load; - + // rip-relative addressing is actually relative to the *next* instruction. // Since an immediate can follow the mod/rm byte for an instruction, this // means that we need to bias the immediate field of the instruction with // the size of the immediate field. If we have this case, add it into the // expression to emit. int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; - + EmitImmediate(Disp, 4, MCFixupKind(FixupKind), CurByte, OS, Fixups, -ImmSize); return; } - + unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; - + // Determine whether a SIB byte is needed. - // If no BaseReg, issue a RIP relative instruction only if the MCE can + // If no BaseReg, issue a RIP relative instruction only if the MCE can // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table // 2-7) and absolute references. if (// The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && + IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is // present. 
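
(An aside for readers tracing EmitMemModRMByte, which continues in the next hunk: the mod field of the ModRM byte selects the displacement size. The sketch below is illustrative only and not part of the patch; the helper names are invented, and it assumes a base register without the special-case encodings, such as EBP/ESP, that the patch handles separately. It simply mirrors the patch's own ModRMByte packing and isDisp8 test.)

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Pack an x86 ModRM byte: mod in bits 7-6, reg/opcode in bits 5-3, r/m in 2-0.
static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM fields out of range");
  return (uint8_t)(RM | (RegOpcode << 3) | (Mod << 6));
}

// Same test the emitter uses: does the value survive sign-extension from 8 bits?
static bool fitsDisp8(int Value) { return Value == (signed char)Value; }

int main() {
  // mod=0: [reg] with no displacement; mod=1: [reg+disp8]; mod=2: [reg+disp32].
  for (int Disp : {0, 0x12, 0x12345}) {
    unsigned Mod = (Disp == 0) ? 0 : (fitsDisp8(Disp) ? 1 : 2);
    printf("disp=0x%x -> ModRM=0x%02x, trailing disp bytes=%d\n", Disp,
           modRMByte(Mod, /*RegOpcode=*/0, /*RM=EAX*/0),
           Mod == 0 ? 0 : (Mod == 1 ? 1 : 4));
  }
  return 0;
}
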
@@ -242,7 +276,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups); return; } - + // If the base is not EBP/ESP and there is no displacement, use simple // indirect register encoding, this handles addresses like [EAX]. The // encoding for [EBP] with no displacement means [disp32] so we handle it @@ -251,24 +285,24 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); return; } - + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (Disp.isImm() && isDisp8(Disp.getImm())) { EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); return; } - + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups); return; } - + // We need a SIB byte, so start by outputting the ModR/M byte first assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); - + bool ForceDisp32 = false; bool ForceDisp8 = false; if (BaseReg == 0) { @@ -294,13 +328,13 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Emit the normal disp32 encoding. EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); } - + // Calculate what the SS field value should be... static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 }; unsigned SS = SSTable[Scale.getImm()]; - + if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel + // Handle the SIB byte for the case where there is no base, see Intel // Manual 2A, table 2-7. The displacement has already been output. unsigned IndexRegNo; if (IndexReg.getReg()) @@ -316,7 +350,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, IndexRegNo = 4; // For example [ESP+1*<noreg>+4] EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS); } - + // Do we need to output a displacement? if (ForceDisp8) EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); @@ -324,26 +358,216 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups); } +/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix +/// called VEX. +void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const TargetInstrDesc &Desc, + raw_ostream &OS) const { + bool HasVEX_4V = false; + if (TSFlags & X86II::VEX_4V) + HasVEX_4V = true; + + // VEX_R: opcode extension equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used as the index in the SIB byte.
+ // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // + unsigned char VEX_5M = 0x1; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. + unsigned char VEX_4V = 0xf; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // Encode the operand size opcode prefix as needed. + if (TSFlags & X86II::OpSize) + VEX_PP = 0x01; + + if (TSFlags & X86II::VEX_W) + VEX_W = 1; + + switch (TSFlags & X86II::Op0Mask) { + default: assert(0 && "Invalid prefix!"); + case X86II::T8: // 0F 38 + VEX_5M = 0x2; + break; + case X86II::TA: // 0F 3A + VEX_5M = 0x3; + break; + case X86II::TF: // F2 0F 38 + VEX_PP = 0x3; + VEX_5M = 0x2; + break; + case X86II::XS: // F3 0F + VEX_PP = 0x2; + break; + case X86II::XD: // F2 0F + VEX_PP = 0x3; + break; + case X86II::TB: // Bypass: Not used by VEX + case 0: + break; // No prefix! + } + + // Set the vector length to 256-bit if YMM0-YMM15 is used + for (unsigned i = 0; i != MI.getNumOperands(); ++i) { + if (!MI.getOperand(i).isReg()) + continue; + unsigned SrcReg = MI.getOperand(i).getReg(); + if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) + VEX_L = 1; + } + + unsigned NumOps = MI.getNumOperands(); + unsigned CurOp = 0; + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: + NumOps = CurOp = X86::AddrNumOperands; + case X86II::MRMSrcMem: + case X86II::MRMSrcReg: + if (MI.getNumOperands() > CurOp && MI.getOperand(CurOp).isReg() && + X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + + // CurOp and NumOps are equal when VEX_R represents a register used + // to index a memory destination (which is the last operand) + CurOp = (CurOp == NumOps) ? 
0 : CurOp+1; + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + CurOp++; + } + + // If the last register should be encoded in the immediate field, + // do not use any bits of the VEX prefix for this register; ignore it. + if (TSFlags & X86II::VEX_I8IMM) + NumOps--; + + for (; CurOp != NumOps; ++CurOp) { + const MCOperand &MO = MI.getOperand(CurOp); + if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) + VEX_B = 0x0; + if (!VEX_B && MO.isReg() && + ((TSFlags & X86II::FormMask) == X86II::MRMSrcMem) && + X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) + VEX_X = 0x0; + } + break; + default: // MRMDestReg, MRM0r-MRM7r + if (MI.getOperand(CurOp).isReg() && + X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + + CurOp++; + for (; CurOp != NumOps; ++CurOp) { + const MCOperand &MO = MI.getOperand(CurOp); + if (MO.isReg() && !HasVEX_4V && + X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + } + break; + assert(0 && "Not implemented!"); + } + + // Emit segment override opcode prefix as needed. + EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); + + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } + + // 3 byte VEX prefix + EmitByte(0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); +} + /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 /// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand /// size, and 3) use of X86-64 extended registers. -static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags, +static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const TargetInstrDesc &Desc) { - // Pseudo instructions never have a rex byte. - if ((TSFlags & X86II::FormMask) == X86II::Pseudo) - return 0; - unsigned REX = 0; if (TSFlags & X86II::REX_W) - REX |= 1 << 3; - + REX |= 1 << 3; // set REX.W + if (MI.getNumOperands() == 0) return REX; - + unsigned NumOps = MI.getNumOperands(); // FIXME: MCInst should explicitize the two-addrness. bool isTwoAddr = NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1; - + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. unsigned i = isTwoAddr ? 1 : 0; for (; i != NumOps; ++i) { @@ -353,34 +577,34 @@ static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags, if (!X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) continue; // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything // that returns non-zero. - REX |= 0x40; + REX |= 0x40; // REX fixed encoding prefix break; } - + switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); case X86II::MRMSrcReg: if (MI.getOperand(0).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg())) - REX |= 1 << 2; + REX |= 1 << 2; // set REX.R i = isTwoAddr ?
2 : 1; for (; i != NumOps; ++i) { const MCOperand &MO = MI.getOperand(i); if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) - REX |= 1 << 0; + REX |= 1 << 0; // set REX.B } break; case X86II::MRMSrcMem: { if (MI.getOperand(0).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg())) - REX |= 1 << 2; + REX |= 1 << 2; // set REX.R unsigned Bit = 0; i = isTwoAddr ? 2 : 1; for (; i != NumOps; ++i) { const MCOperand &MO = MI.getOperand(i); if (MO.isReg()) { if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) - REX |= 1 << Bit; + REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1) Bit++; } } @@ -391,17 +615,17 @@ static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags, case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: case X86II::MRMDestMem: { - unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands); + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); i = isTwoAddr ? 1 : 0; if (NumOps > e && MI.getOperand(e).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e).getReg())) - REX |= 1 << 2; + REX |= 1 << 2; // set REX.R unsigned Bit = 0; for (; i != e; ++i) { const MCOperand &MO = MI.getOperand(i); if (MO.isReg()) { if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) - REX |= 1 << Bit; + REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1) Bit++; } } @@ -410,39 +634,40 @@ static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags, default: if (MI.getOperand(0).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg())) - REX |= 1 << 0; + REX |= 1 << 0; // set REX.B i = isTwoAddr ? 2 : 1; for (unsigned e = NumOps; i != e; ++i) { const MCOperand &MO = MI.getOperand(i); if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg())) - REX |= 1 << 2; + REX |= 1 << 2; // set REX.R } break; } return REX; } -void X86MCCodeEmitter:: -EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups) const { - unsigned Opcode = MI.getOpcode(); - const TargetInstrDesc &Desc = TII.get(Opcode); - unsigned TSFlags = Desc.TSFlags; - - // Keep track of the current byte being emitted. - unsigned CurByte = 0; - - // FIXME: We should emit the prefixes in exactly the same order as GAS does, - // in order to provide diffability. - - // Emit the lock opcode prefix as needed. - if (TSFlags & X86II::LOCK) - EmitByte(0xF0, CurByte, OS); - - // Emit segment override opcode prefix as needed. +/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed +void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags, + unsigned &CurByte, int MemOperand, + const MCInst &MI, + raw_ostream &OS) const { switch (TSFlags & X86II::SegOvrMask) { default: assert(0 && "Invalid segment!"); - case 0: break; // No segment override! + case 0: + // No segment override, check for explicit one on memory operand. + if (MemOperand != -1) { // If the instruction has a memory operand. 
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { + default: assert(0 && "Unknown segment register!"); + case 0: break; + case X86::CS: EmitByte(0x2E, CurByte, OS); break; + case X86::SS: EmitByte(0x36, CurByte, OS); break; + case X86::DS: EmitByte(0x3E, CurByte, OS); break; + case X86::ES: EmitByte(0x26, CurByte, OS); break; + case X86::FS: EmitByte(0x64, CurByte, OS); break; + case X86::GS: EmitByte(0x65, CurByte, OS); break; + } + } + break; case X86II::FS: EmitByte(0x64, CurByte, OS); break; @@ -450,19 +675,36 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(0x65, CurByte, OS); break; } - +} + +/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode. +/// +/// MemOperand is the operand # of the start of a memory operand if present. If +/// not present, it is -1. +void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const TargetInstrDesc &Desc, + raw_ostream &OS) const { + + // Emit the lock opcode prefix as needed. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + + // Emit segment override opcode prefix as needed. + EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); + // Emit the repeat opcode prefix as needed. if ((TSFlags & X86II::Op0Mask) == X86II::REP) EmitByte(0xF3, CurByte, OS); - + // Emit the operand size opcode prefix as needed. if (TSFlags & X86II::OpSize) EmitByte(0x66, CurByte, OS); - + // Emit the address size opcode prefix as needed. if (TSFlags & X86II::AdSize) EmitByte(0x67, CurByte, OS); - + bool Need0FPrefix = false; switch (TSFlags & X86II::Op0Mask) { default: assert(0 && "Invalid prefix!"); @@ -494,18 +736,18 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::DE: EmitByte(0xDE, CurByte, OS); break; case X86II::DF: EmitByte(0xDF, CurByte, OS); break; } - + // Handle REX prefix. // FIXME: Can this come before F2 etc to simplify emission? if (Is64BitMode) { if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc)) EmitByte(0x40 | REX, CurByte, OS); } - + // 0x0F escape code must be emitted just before the opcode. if (Need0FPrefix) EmitByte(0x0F, CurByte, OS); - + // FIXME: Pull this up into previous switch if REX can be moved earlier. switch (TSFlags & X86II::Op0Mask) { case X86II::TF: // F2 0F 38 @@ -516,8 +758,21 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(0x3A, CurByte, OS); break; } - +} + +void X86MCCodeEmitter:: +EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = TII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + + // Pseudo instructions don't get encoded. + if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + return; + // If this is a two-address instruction, skip one of the register operands. + // FIXME: This should be handled during MCInst lowering. unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1) @@ -525,56 +780,85 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0) // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 --NumOps; - + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + // Is this instruction encoded using the AVX VEX prefix? + bool HasVEXPrefix = false; + + // Does it use the VEX.VVVV field?
+ bool HasVEX_4V = false; + + if (TSFlags & X86II::VEX) + HasVEXPrefix = true; + if (TSFlags & X86II::VEX_4V) + HasVEX_4V = true; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + if (!HasVEXPrefix) + EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + else + EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); + unsigned SrcRegNum = 0; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!"); default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; assert(0 && "Unknown FormMask value in X86MCCodeEmitter!"); - case X86II::Pseudo: return; // Pseudo instructions encode to nothing. + case X86II::Pseudo: + assert(0 && "Pseudo instruction shouldn't be emitted"); case X86II::RawFrm: EmitByte(BaseOpcode, CurByte, OS); break; - + case X86II::AddRegFrm: EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); break; - + case X86II::MRMDestReg: EmitByte(BaseOpcode, CurByte, OS); EmitRegModRMByte(MI.getOperand(CurOp), GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS); CurOp += 2; break; - + case X86II::MRMDestMem: EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, CurOp, - GetX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)), + GetX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)), TSFlags, CurByte, OS, Fixups); - CurOp += X86AddrNumOperands + 1; + CurOp += X86::AddrNumOperands + 1; break; - + case X86II::MRMSrcReg: EmitByte(BaseOpcode, CurByte, OS); - EmitRegModRMByte(MI.getOperand(CurOp+1), GetX86RegNum(MI.getOperand(CurOp)), - CurByte, OS); - CurOp += 2; + SrcRegNum = CurOp + 1; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + CurOp = SrcRegNum + 1; break; - + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + EmitByte(BaseOpcode, CurByte, OS); - // FIXME: Maybe lea should have its own form? This is a horrible hack. - int AddrOperands; - if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || - Opcode == X86::LEA16r || Opcode == X86::LEA32r) - AddrOperands = X86AddrNumOperands - 1; // No segment register - else - AddrOperands = X86AddrNumOperands; - - EmitMemModRMByte(MI, CurOp+1, GetX86RegNum(MI.getOperand(CurOp)), + EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), TSFlags, CurByte, OS, Fixups); CurOp += AddrOperands + 1; break; @@ -584,6 +868,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). 
+ CurOp++; EmitByte(BaseOpcode, CurByte, OS); EmitRegModRMByte(MI.getOperand(CurOp++), (TSFlags & X86II::FormMask)-X86II::MRM0r, @@ -596,7 +882,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m, TSFlags, CurByte, OS, Fixups); - CurOp += X86AddrNumOperands; + CurOp += X86::AddrNumOperands; break; case X86II::MRM_C1: EmitByte(BaseOpcode, CurByte, OS); @@ -639,14 +925,27 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(0xF9, CurByte, OS); break; } - + // If there is a remaining operand, it must be a trailing immediate. Emit it // according to the right size for the instruction. - if (CurOp != NumOps) - EmitImmediate(MI.getOperand(CurOp++), - X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); - + if (CurOp != NumOps) { + // The last source register of a 4-operand instruction in AVX is encoded + // in bits[7:4] of an immediate byte, and bits[3:0] are ignored. + if (TSFlags & X86II::VEX_I8IMM) { + const MCOperand &MO = MI.getOperand(CurOp++); + bool IsExtReg = + X86InstrInfo::isX86_64ExtendedReg(MO.getReg()); + unsigned RegNum = (IsExtReg ? (1 << 7) : 0); + RegNum |= GetX86RegNum(MO) << 4; + EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS, + Fixups); + } else + EmitImmediate(MI.getOperand(CurOp++), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + } + + #ifndef NDEBUG // FIXME: Verify. if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 98975ea..5f31e00 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -127,21 +127,29 @@ unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7: return RegNo-X86::ST0; - case X86::XMM0: case X86::XMM8: case X86::MM0: + case X86::XMM0: case X86::XMM8: + case X86::YMM0: case X86::YMM8: case X86::MM0: return 0; - case X86::XMM1: case X86::XMM9: case X86::MM1: + case X86::XMM1: case X86::XMM9: + case X86::YMM1: case X86::YMM9: case X86::MM1: return 1; - case X86::XMM2: case X86::XMM10: case X86::MM2: + case X86::XMM2: case X86::XMM10: + case X86::YMM2: case X86::YMM10: case X86::MM2: return 2; - case X86::XMM3: case X86::XMM11: case X86::MM3: + case X86::XMM3: case X86::XMM11: + case X86::YMM3: case X86::YMM11: case X86::MM3: return 3; - case X86::XMM4: case X86::XMM12: case X86::MM4: + case X86::XMM4: case X86::XMM12: + case X86::YMM4: case X86::YMM12: case X86::MM4: return 4; - case X86::XMM5: case X86::XMM13: case X86::MM5: + case X86::XMM5: case X86::XMM13: + case X86::YMM5: case X86::YMM13: case X86::MM5: return 5; - case X86::XMM6: case X86::XMM14: case X86::MM6: + case X86::XMM6: case X86::XMM14: + case X86::YMM6: case X86::YMM14: case X86::MM6: return 6; - case X86::XMM7: case X86::XMM15: case X86::MM7: + case X86::XMM7: case X86::XMM15: + case X86::YMM7: case X86::YMM15: case X86::MM7: return 7; case X86::ES: @@ -157,6 +165,34 @@ unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { case X86::GS: return 5; + case X86::CR0: + return 0; + case X86::CR1: + return 1; + case X86::CR2: + return 2; + case X86::CR3: + return 3; + case X86::CR4: + return 4; + + case X86::DR0: + return 0; + case X86::DR1: + return 1; + case X86::DR2: + return 2; + case X86::DR3: + return 3; + case X86::DR4: + return 4; + case X86::DR5: + return 5; + case X86::DR6: + return 6; + case
X86::DR7: + return 7; + default: assert(isVirtualRegister(RegNo) && "Unknown physical register!"); llvm_unreachable("Register allocator hasn't allocated reg correctly yet!"); @@ -357,56 +393,6 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } -const TargetRegisterClass* const* -X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - bool callsEHReturn = false; - if (MF) - callsEHReturn = MF->getMMI().callsEHReturn(); - - static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = { - &X86::GR32RegClass, &X86::GR32RegClass, - &X86::GR32RegClass, &X86::GR32RegClass, 0 - }; - static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = { - &X86::GR32RegClass, &X86::GR32RegClass, - &X86::GR32RegClass, &X86::GR32RegClass, - &X86::GR32RegClass, &X86::GR32RegClass, 0 - }; - static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = { - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, 0 - }; - static const TargetRegisterClass * const CalleeSavedRegClasses64EHRet[] = { - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, 0 - }; - static const TargetRegisterClass * const CalleeSavedRegClassesWin64[] = { - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::GR64RegClass, &X86::GR64RegClass, - &X86::VR128RegClass, &X86::VR128RegClass, - &X86::VR128RegClass, &X86::VR128RegClass, - &X86::VR128RegClass, &X86::VR128RegClass, - &X86::VR128RegClass, &X86::VR128RegClass, - &X86::VR128RegClass, &X86::VR128RegClass, 0 - }; - - if (Is64Bit) { - if (IsWin64) - return CalleeSavedRegClassesWin64; - else - return (callsEHReturn ? - CalleeSavedRegClasses64EHRet : CalleeSavedRegClasses64Bit); - } else { - return (callsEHReturn ? - CalleeSavedRegClasses32EHRet : CalleeSavedRegClasses32Bit); - } -} - BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); // Set the stack-pointer register and its aliases as reserved. @@ -696,8 +682,7 @@ X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, - (-1U*SlotSize)+TailCallReturnAddrDelta, - true, false); + (-1U*SlotSize)+TailCallReturnAddrDelta, true); } if (hasFP(MF)) { @@ -710,7 +695,7 @@ X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, -(int)SlotSize + TFI.getOffsetOfLocalArea() + TailCallReturnAddrDelta, - true, false); + true); assert(FrameIdx == MFI->getObjectIndexBegin() && "Slot for EBP register must be last in order to be found!"); FrameIdx = 0; @@ -1240,8 +1225,8 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF, if (CSSize) { unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r; MachineInstr *MI = - addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); + addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); MBB.insert(MBBI, MI); } else { BuildMI(MBB, MBBI, DL, @@ -1301,9 +1286,11 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF, for (unsigned i = 0; i != 5; ++i) MIB.addOperand(MBBI->getOperand(i)); } else if (RetOpcode == X86::TCRETURNri64) { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64), JumpTarget.getReg()); + BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). 
+ addReg(JumpTarget.getReg(), RegState::Kill); } else { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr), JumpTarget.getReg()); + BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); } MachineInstr *NewMI = prior(MBBI); diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index d0b82e2..d852bcd 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -105,12 +105,6 @@ public: /// callee-save registers on this target. const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; - /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred - /// register classes to spill each callee-saved register with. The order and - /// length of this list match the getCalleeSavedRegs() list. - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and /// should be considered unavailable at all times, e.g. SP, RA. This is used by diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 91cfaa9..9f0382e 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -147,7 +147,7 @@ let Namespace = "X86" in { def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>; def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>; def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>; - + // Pseudo Floating Point registers def FP0 : Register<"fp0">; def FP1 : Register<"fp1">; @@ -155,7 +155,7 @@ let Namespace = "X86" in { def FP3 : Register<"fp3">; def FP4 : Register<"fp4">; def FP5 : Register<"fp5">; - def FP6 : Register<"fp6">; + def FP6 : Register<"fp6">; // XMM Registers, used by the various SSE instruction set extensions. // The sub_ss and sub_sd subregs are the same registers with another regclass. @@ -357,7 +357,7 @@ def GR16 : RegisterClass<"X86", [i16], 16, }]; } -def GR32 : RegisterClass<"X86", [i32], 32, +def GR32 : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; @@ -412,7 +412,7 @@ def GR32 : RegisterClass<"X86", [i32], 32, // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. -def GR64 : RegisterClass<"X86", [i64], 64, +def GR64 : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, RBX, R14, R15, R12, R13, RBP, RSP, RIP]> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), @@ -446,7 +446,7 @@ def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]> { } // Debug registers. -def DEBUG_REG : RegisterClass<"X86", [i32], 32, +def DEBUG_REG : RegisterClass<"X86", [i32], 32, [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]> { } @@ -780,14 +780,14 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, } // Generic vector registers: VR64 and VR128. 
-def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64, [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>; def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; - + let MethodProtos = [{ iterator allocation_order_end(const MachineFunction &MF) const; }]; @@ -803,11 +803,27 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, } }]; } -def VR256 : RegisterClass<"X86", [ v8i32, v4i64, v8f32, v4f64],256, + +def VR256 : RegisterClass<"X86", [v8i32, v4i64, v8f32, v4f64], 256, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15]> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; + + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + VR256Class::iterator + VR256Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode. + else + return end(); + } + }]; } // Status flags registers. diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 09a2685..4a10be5 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -53,9 +53,12 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDLLImportLinkage()) return X86II::MO_DLLIMPORT; - // Materializable GVs (in JIT lazy compilation mode) do not require an - // extra load from stub. - bool isDecl = GV->isDeclaration() && !GV->isMaterializable(); + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; // X86-64 in PIC mode. if (isPICStyleRIPRel()) { @@ -293,12 +296,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , IsBTMemSlow(false) , IsUAMemFast(false) , HasVectorUAMem(false) - , DarwinVers(0) , stackAlignment(8) // FIXME: this is a known good value for Yonah. How about others? , MaxInlineSizeThreshold(128) - , Is64Bit(is64Bit) - , TargetType(isELF) { // Default to ELF unless otherwise specified. + , TargetTriple(TT) + , Is64Bit(is64Bit) { // default to hard float ABI if (FloatABIType == FloatABI::Default) @@ -328,47 +330,40 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, HasCMov = true; } - DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); assert((!Is64Bit || HasX86_64) && "64-bit code requested on a subtarget that doesn't support it!"); - // Set the boolean corresponding to the current target triple, or the default - // if one cannot be determined, to true. - if (TT.length() > 5) { - size_t Pos; - if ((Pos = TT.find("-darwin")) != std::string::npos) { - TargetType = isDarwin; - - // Compute the darwin version number. - if (isdigit(TT[Pos+7])) - DarwinVers = atoi(&TT[Pos+7]); - else - DarwinVers = 8; // Minimum supported darwin is Tiger. 
- } else if (TT.find("linux") != std::string::npos) { - // Linux doesn't imply ELF, but we don't currently support anything else. - TargetType = isELF; - } else if (TT.find("cygwin") != std::string::npos) { - TargetType = isCygwin; - } else if (TT.find("mingw") != std::string::npos) { - TargetType = isMingw; - } else if (TT.find("win32") != std::string::npos) { - TargetType = isWindows; - } else if (TT.find("windows") != std::string::npos) { - TargetType = isWindows; - } else if (TT.find("-cl") != std::string::npos) { - TargetType = isDarwin; - DarwinVers = 9; - } - } - // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64 // bit targets. - if (TargetType == isDarwin || Is64Bit) + if (isTargetDarwin() || Is64Bit) stackAlignment = 16; if (StackAlignment) stackAlignment = StackAlignment; } + +/// IsCalleePop - Determines whether the callee is required to pop its +/// own arguments. Callee pop is necessary to support tail calls. +bool X86Subtarget::IsCalleePop(bool IsVarArg, + CallingConv::ID CallingConv) const { + if (IsVarArg) + return false; + + switch (CallingConv) { + default: + return false; + case CallingConv::X86_StdCall: + return !is64Bit(); + case CallingConv::X86_FastCall: + return !is64Bit(); + case CallingConv::X86_ThisCall: + return !is64Bit(); + case CallingConv::Fast: + return GuaranteedTailCallOpt; + case CallingConv::GHC: + return GuaranteedTailCallOpt; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 646af91..486dbc4 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,7 +14,9 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtarget.h" +#include "llvm/CallingConv.h" #include <string> namespace llvm { @@ -88,10 +90,6 @@ protected: /// operands. This may require setting a feature bit in the processor. bool HasVectorUAMem; - /// DarwinVers - Nonzero if this is a darwin platform: the numeric - /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc. - unsigned char DarwinVers; // Is any darwin-x86 platform. - /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -99,6 +97,9 @@ protected: /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// unsigned MaxInlineSizeThreshold; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; private: /// Is64Bit - True if the processor supports 64-bit instructions and @@ -106,9 +107,6 @@ private: bool Is64Bit; public: - enum { - isELF, isCygwin, isDarwin, isWindows, isMingw - } TargetType; /// This constructor initializes the data members to match that /// of the specified triple. @@ -157,24 +155,31 @@ public: bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } - bool isTargetDarwin() const { return TargetType == isDarwin; } - bool isTargetELF() const { return TargetType == isELF; } + bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + + // ELF is a reasonably sane default and the only other X86 targets we + // support are Darwin and Windows. Just use "not those". 
+ bool isTargetELF() const { + return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing(); + } + bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } - bool isTargetWindows() const { return TargetType == isWindows; } - bool isTargetMingw() const { return TargetType == isMingw; } - bool isTargetCygwin() const { return TargetType == isCygwin; } + bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } + bool isTargetMingw() const { + return TargetTriple.getOS() == Triple::MinGW32 || + TargetTriple.getOS() == Triple::MinGW64; } + bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } bool isTargetCygMing() const { - return TargetType == isMingw || TargetType == isCygwin; + return isTargetMingw() || isTargetCygwin(); } - + /// isTargetCOFF - Return true if this is any COFF/Windows target variant. bool isTargetCOFF() const { - return TargetType == isMingw || TargetType == isCygwin || - TargetType == isWindows; + return isTargetMingw() || isTargetCygwin() || isTargetWindows(); } bool isTargetWin64() const { - return Is64Bit && (TargetType == isMingw || TargetType == isWindows); + return Is64Bit && (isTargetMingw() || isTargetWindows()); } std::string getDataLayout() const { @@ -208,7 +213,10 @@ public: /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, /// 10 = Snow Leopard, etc. - unsigned getDarwinVers() const { return DarwinVers; } + unsigned getDarwinVers() const { + if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber(); + return 0; + } /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel @@ -237,6 +245,9 @@ public: /// indicating the number of scheduling cycles of backscheduling that /// should be attempted. unsigned getSpecialAddressLatency() const; + + /// IsCalleePop - Test whether a function should pop its own arguments. + bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f2c5058..df00d3f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -173,14 +173,18 @@ bool X86TargetMachine::addInstSelector(PassManagerBase &PM, // Install an instruction selector. PM.add(createX86ISelDag(*this, OptLevel)); - // Install a pass to insert x87 FP_REG_KILL instructions, as needed. - PM.add(createX87FPRegKillInserterPass()); + // For 32-bit, prepend instructions to set the "global base reg" for PIC. + if (!Subtarget.is64Bit()) + PM.add(createGlobalBaseRegPass()); return false; } bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + // Install a pass to insert x87 FP_REG_KILL instructions, as needed. + PM.add(createX87FPRegKillInserterPass()); + PM.add(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. 
} diff --git a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp index c100c59..6656bdc 100644 --- a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp @@ -138,7 +138,6 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // FALL THROUGH case GlobalValue::InternalLinkage: case GlobalValue::PrivateLinkage: - case GlobalValue::LinkerPrivateLinkage: break; case GlobalValue::DLLImportLinkage: llvm_unreachable("DLLImport linkage is not supported by this target!"); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index b230572..abe7b2f 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -245,7 +245,7 @@ SDValue XCoreTargetLowering:: LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32); // If it's a debug information descriptor, don't mess with it. if (DAG.isVerifiedDebugInfoDesc(Op)) return GA; @@ -269,7 +269,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const DebugLoc dl = Op.getDebugLoc(); // transform to label + getid() * size const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar) { // If GV is an alias then use the aliasee to determine size @@ -454,12 +454,12 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LD->getAlignment() == 2) { int SVOffset = LD->getSrcValueOffset(); - SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain, + SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, MVT::i32, dl, Chain, BasePtr, LD->getSrcValue(), SVOffset, MVT::i16, LD->isVolatile(), LD->isNonTemporal(), 2); SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr, DAG.getConstant(2, MVT::i32)); - SDValue High = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::i32, Chain, + SDValue High = DAG.getExtLoad(ISD::EXTLOAD, MVT::i32, dl, Chain, HighAddr, LD->getSrcValue(), SVOffset + 2, MVT::i16, LD->isVolatile(), LD->isNonTemporal(), 2); @@ -812,6 +812,7 @@ XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -826,7 +827,7 @@ XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee, case CallingConv::Fast: case CallingConv::C: return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall, - Outs, Ins, dl, DAG, InVals); + Outs, OutVals, Ins, dl, DAG, InVals); } } @@ -839,6 +840,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -866,7 +868,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Walk the register/memloc assignments, inserting copies/loads. 
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[i].Val; + SDValue Arg = OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -919,7 +921,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32); @@ -1072,7 +1074,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // Create the frame index object for this incoming parameter... int FI = MFI->CreateFixedObject(ObjSize, LRSaveSize + VA.getLocMemOffset(), - true, false); + true); // Create the SelectionDAG nodes corresponding to a load //from this parameter @@ -1097,7 +1099,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // address for (unsigned i = array_lengthof(ArgRegs) - 1; i >= FirstVAReg; --i) { // Create a stack slot - int FI = MFI->CreateFixedObject(4, offset, true, false); + int FI = MFI->CreateFixedObject(4, offset, true); if (i == FirstVAReg) { XFI->setVarArgsFrameIndex(FI); } @@ -1120,7 +1122,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // This will point to the next argument passed via stack. XFI->setVarArgsFrameIndex( MFI->CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(), - true, false)); + true)); } } @@ -1133,19 +1135,19 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, bool XCoreTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<EVT> &OutTys, - const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, - SelectionDAG &DAG) const { + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); - return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_XCore); + RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC_XCore); } SDValue XCoreTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of @@ -1175,7 +1177,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, assert(VA.isRegLoc() && "Can only return in registers!"); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - Outs[i].Val, Flag); + OutVals[i], Flag); // guarantee that all emitted copies are // stuck together, avoiding something bad @@ -1221,23 +1223,22 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - BuildMI(BB, dl, TII.get(XCore::BRFT_lru6)) - .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. 
- for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII.get(XCore::BRFT_lru6)) + .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -1250,11 +1251,12 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII.get(XCore::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII.get(XCore::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -1379,7 +1381,6 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Mul0, Mul1, Addend0, Addend1; if (N->getValueType(0) == MVT::i32 && isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, true)) { - SDValue Zero = DAG.getConstant(0, MVT::i32); SDValue Ignored = DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(MVT::i32, MVT::i32), Mul0, Mul1, Addend0, Addend1); diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index d8d2a3a..febc198 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -120,6 +120,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -178,6 +179,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; @@ -186,13 +188,13 @@ namespace llvm { LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<EVT> &OutTys, - const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, - SelectionDAG &DAG) const; + const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, + LLVMContext &Context) const; }; } diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index 5260258..dd90ea9 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -299,9 +299,8 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, unsigned XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> 
&Cond)const{ - // FIXME there should probably be a DebugLoc argument here - DebugLoc dl; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL)const{ // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 2 || Cond.size() == 0) && @@ -310,11 +309,11 @@ XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, if (FBB == 0) { // One way branch. if (Cond.empty()) { // Unconditional branch - BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(TBB); + BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB); } else { // Conditional branch. unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm()); - BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg()) + BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()) .addMBB(TBB); } return 1; @@ -323,9 +322,9 @@ XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, // Two-way Conditional branch. assert(Cond.size() == 2 && "Unexpected number of components!"); unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm()); - BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg()) + BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()) .addMBB(TBB); - BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(FBB); + BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(FBB); return 2; } @@ -357,37 +356,31 @@ XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -bool XCoreInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - - if (DestRC == SrcRC) { - if (DestRC == XCore::GRRegsRegisterClass) { - BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg) - .addReg(SrcReg) - .addImm(0); - return true; - } else { - return false; - } +void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GRDest = XCore::GRRegsRegClass.contains(DestReg); + bool GRSrc = XCore::GRRegsRegClass.contains(SrcReg); + + if (GRDest && GRSrc) { + BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + return; } - if (SrcRC == XCore::RRegsRegisterClass && SrcReg == XCore::SP && - DestRC == XCore::GRRegsRegisterClass) { - BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg) - .addImm(0); - return true; + if (GRDest && SrcReg == XCore::SP) { + BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0); + return; } - if (DestRC == XCore::RRegsRegisterClass && DestReg == XCore::SP && - SrcRC == XCore::GRRegsRegisterClass) { + + if (DestReg == XCore::SP && GRSrc) { BuildMI(MBB, I, DL, get(XCore::SETSP_1r)) - .addReg(SrcReg); - return true; + .addReg(SrcReg, getKillRegState(KillSrc)); + return; } - return false; + llvm_unreachable("Impossible reg-to-reg copy"); } void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -438,8 +431,10 @@ bool XCoreInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Add the callee-saved register as live-in. It's killed at the spill. 
MBB.addLiveIn(it->getReg()); - storeRegToStackSlot(MBB, MI, it->getReg(), true, - it->getFrameIdx(), it->getRegClass(), &RI); + unsigned Reg = it->getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + storeRegToStackSlot(MBB, MI, Reg, true, + it->getFrameIdx(), RC, &RI); if (emitFrameMoves) { MCSymbol *SaveLabel = MF->getContext().CreateTempSymbol(); BuildMI(MBB, MI, DL, get(XCore::DBG_LABEL)).addSym(SaveLabel); @@ -460,10 +455,11 @@ bool XCoreInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, --BeforeI; for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin(); it != CSI.end(); ++it) { - + unsigned Reg = it->getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); loadRegFromStackSlot(MBB, MI, it->getReg(), it->getFrameIdx(), - it->getRegClass(), &RI); + RC, &RI); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert multiple diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h index 9035ea9..e5b0171 100644 --- a/lib/Target/XCore/XCoreInstrInfo.h +++ b/lib/Target/XCore/XCoreInstrInfo.h @@ -58,17 +58,16 @@ public: bool AllowModify) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index dd3cbc1..19b9b1f 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -733,7 +733,7 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), // TODO setd, eet, eef, getts, setpt, outct, inct, chkct, outt, intt, out, // in, outshr, inshr, testct, testwct, tinitpc, tinitdp, tinitsp, tinitcp, // tsetmr, sext (reg), zext (reg) -let isTwoAddress = 1 in { +let Constraints = "$src1 = $dst" in { let neverHasSideEffects = 1 in def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), "sext $dst, $src2", diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index 0cfb358..2a88342 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -82,18 +82,6 @@ const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) return CalleeSavedRegs; } -const TargetRegisterClass* const* -XCoreRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass, - XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass, - XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass, - XCore::GRRegsRegisterClass, XCore::RRegsRegisterClass, - 0 - }; - return CalleeSavedRegClasses; -} - BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector 
Reserved(getNumRegs()); Reserved.set(XCore::CP); @@ -320,7 +308,7 @@ XCoreRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, int FrameIdx; if (! isVarArg) { // A fixed offset of 0 allows us to save / restore LR using entsp / retsp. - FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0, true, false); + FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0, true); } else { FrameIdx = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 5bdd059..66132ba 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -44,9 +44,6 @@ public: const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* getCalleeSavedRegClasses( - const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; bool requiresRegisterScavenging(const MachineFunction &MF) const; diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index 37d7a00..abfa514 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -28,7 +28,7 @@ namespace { Hello() : FunctionPass(&ID) {} virtual bool runOnFunction(Function &F) { - HelloCounter++; + ++HelloCounter; errs() << "Hello: "; errs().write_escaped(F.getName()) << '\n'; return false; @@ -46,7 +46,7 @@ namespace { Hello2() : FunctionPass(&ID) {} virtual bool runOnFunction(Function &F) { - HelloCounter++; + ++HelloCounter; errs() << "Hello: "; errs().write_escaped(F.getName()) << '\n'; return false; diff --git a/lib/Transforms/Hello/Hello.exports b/lib/Transforms/Hello/Hello.exports new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/lib/Transforms/Hello/Hello.exports diff --git a/lib/Transforms/Hello/Makefile b/lib/Transforms/Hello/Makefile index c5e75d4..f1e3148 100644 --- a/lib/Transforms/Hello/Makefile +++ b/lib/Transforms/Hello/Makefile @@ -12,5 +12,13 @@ LIBRARYNAME = LLVMHello LOADABLE_MODULE = 1 USEDLIBS = +# If we don't need RTTI or EH, there's no reason to export anything +# from the hello plugin. +ifneq ($(REQUIRES_RTTI), 1) +ifneq ($(REQUIRES_EH), 1) +EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/Hello.exports +endif +endif + include $(LEVEL)/Makefile.common diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 89f213e..28ea079 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -360,19 +360,20 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const { IndicesVector Operands; for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end(); UI != E; ++UI) { + User *U = *UI; Operands.clear(); - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { if (LI->isVolatile()) return false; // Don't hack volatile loads Loads.push_back(LI); // Direct loads are equivalent to a GEP with a zero index and then a load. Operands.push_back(0); - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) { + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { if (GEP->use_empty()) { // Dead GEP's cause trouble later. Just remove them if we run into // them. 
getAnalysis<AliasAnalysis>().deleteValue(GEP); GEP->eraseFromParent(); - // TODO: This runs the above loop over and over again for dead GEPS + // TODO: This runs the above loop over and over again for dead GEPs // Couldn't we just do increment the UI iterator earlier and erase the // use? return isSafeToPromoteArgument(Arg, isByVal); @@ -452,12 +453,14 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const { // Now check every path from the entry block to the load for transparency. // To do this, we perform a depth first search on the inverse CFG from the // loading block. - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> > - I = idf_ext_begin(*PI, TranspBlocks), - E = idf_ext_end(*PI, TranspBlocks); I != E; ++I) + I = idf_ext_begin(P, TranspBlocks), + E = idf_ext_end(P, TranspBlocks); I != E; ++I) if (AA.canBasicBlockModify(**I, Arg, LoadSize)) return false; + } } // If the path from the entry of the function to each load is free of diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 692e47d..475eee8 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -120,9 +120,14 @@ namespace { typedef SmallVector<RetOrArg, 5> UseVector; + protected: + // DAH uses this to specify a different ID. + explicit DAE(void *ID) : ModulePass(ID) {} + public: static char ID; // Pass identification, replacement for typeid DAE() : ModulePass(&ID) {} + bool runOnModule(Module &M); virtual bool ShouldHackArguments() const { return false; } @@ -155,6 +160,8 @@ namespace { /// by bugpoint. struct DAH : public DAE { static char ID; + DAH() : DAE(&ID) {} + virtual bool ShouldHackArguments() const { return true; } }; } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index b429213..735a1c4 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -160,13 +160,12 @@ static bool SafeToDestroyConstant(const Constant *C) { static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, SmallPtrSet<const PHINode*, 16> &PHIUsers) { for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; - ++UI) - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(*UI)) { + ++UI) { + const User *U = *UI; + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { GS.HasNonInstructionUser = true; - if (AnalyzeGlobal(CE, GS, PHIUsers)) return true; - - } else if (const Instruction *I = dyn_cast<Instruction>(*UI)) { + } else if (const Instruction *I = dyn_cast<Instruction>(U)) { if (!GS.HasMultipleAccessingFunctions) { const Function *F = I->getParent()->getParent(); if (GS.AccessingFunction == 0) @@ -221,18 +220,21 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, if (AnalyzeGlobal(I, GS, PHIUsers)) return true; GS.HasPHIUser = true; } else if (isa<CmpInst>(I)) { + // Nothing to analyse. 
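The `User *U = *UI;` hoists in the ArgumentPromotion and GlobalOpt hunks above recur throughout this commit (IPConstantPropagation, LowerSetJmp, StructRetPromotion, and the InstCombine files below): the use iterator is dereferenced exactly once into a named User, and every subsequent dyn_cast classifies that User rather than the iterator. The bare idiom, as a sketch:

#include "llvm/Instructions.h"
using namespace llvm;

static void classifyUses(Value *V) {
  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
       UI != E; ++UI) {
    User *U = *UI;                              // dereference exactly once
    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
      (void)LI;                                 // ... a load of V ...
    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
      (void)SI;                                 // ... a store involving V ...
    }
  }
}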
} else if (isa<MemTransferInst>(I)) { - if (I->getOperand(1) == V) + const MemTransferInst *MTI = cast<MemTransferInst>(I); + if (MTI->getArgOperand(0) == V) GS.StoredType = GlobalStatus::isStored; - if (I->getOperand(2) == V) + if (MTI->getArgOperand(1) == V) GS.isLoaded = true; } else if (isa<MemSetInst>(I)) { - assert(I->getOperand(1) == V && "Memset only takes one pointer!"); + assert(cast<MemSetInst>(I)->getArgOperand(0) == V && + "Memset only takes one pointer!"); GS.StoredType = GlobalStatus::isStored; } else { return true; // Any other non-load instruction might take address! } - } else if (const Constant *C = dyn_cast<Constant>(*UI)) { + } else if (const Constant *C = dyn_cast<Constant>(U)) { GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. if (!SafeToDestroyConstant(C)) @@ -242,6 +244,7 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, // Otherwise must be some other user. return true; } + } return false; } @@ -1304,7 +1307,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, const Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), - NElems, + NElems, 0, CI->getName() + ".f" + Twine(FieldNo)); FieldMallocs.push_back(NMI); new StoreInst(NMI, NGV, CI); @@ -1323,8 +1326,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // if (F2) { free(F2); F2 = 0; } // } // The malloc can also fail if its argument is too large. - Constant *ConstantZero = ConstantInt::get(CI->getOperand(1)->getType(), 0); - Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getOperand(1), + Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0); + Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0), ConstantZero, "isneg"); for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) { Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i], @@ -1511,10 +1514,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is an allocation of a fixed size array of structs, analyze as a // variable size array. malloc [100 x struct],1 -> malloc struct, 100 - if (NElems == ConstantInt::get(CI->getOperand(1)->getType(), 1)) + if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1)) if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy)) AllocTy = AT->getElementType(); - + const StructType *AllocSTy = dyn_cast<StructType>(AllocTy); if (!AllocSTy) return false; @@ -1533,7 +1536,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements, - CI->getName()); + 0, CI->getName()); Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); @@ -1597,13 +1600,15 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GVElType->isFloatingPointTy() || GVElType->isPointerTy() || GVElType->isVectorTy()) return false; - + // Walk the use list of the global seeing if all the uses are load or store. // If there is anything else, bail out. 
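The getOperand(N) → getArgOperand(N-1) rewrites here, and throughout the InstCombine hunks further down, are one mechanical change: at the point of this commit a CallInst's operand list holds the callee followed by its arguments, so argument i lives at operand i+1. getArgOperand() hides that layout, and CallInst::ArgOffset (used later in the isFoldable hunk) is the same offset spelled as a named constant. A sketch of the equivalence, assuming that operand layout:

#include "llvm/Instructions.h"
#include <cassert>
using namespace llvm;

void inspectFirstArg(CallInst *CI) {
  Value *OldWay = CI->getOperand(1);      // layout-dependent (old style)
  Value *NewWay = CI->getArgOperand(0);   // argument 0, wherever it lives
  assert(OldWay == NewWay && "same operand reached either way");
}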
- for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I) - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){ + User *U = *I; + if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; - + } + DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV); // Create the new global, initializing it to false. @@ -1641,7 +1646,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // bool. Instruction *StoredVal = cast<Instruction>(SI->getOperand(0)); - // If we're already replaced the input, StoredVal will be a cast or + // If we've already replaced the input, StoredVal will be a cast or // select instruction. If not, it will be a load of the original // global. if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { @@ -2260,8 +2265,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal, getVal(Values, CI->getOperand(0)), CI->getType()); } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) { - InstResult = - ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)), + InstResult = ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)), getVal(Values, SI->getOperand(1)), getVal(Values, SI->getOperand(2))); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) { @@ -2302,7 +2306,8 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal, if (!Callee) return false; // Cannot resolve. SmallVector<Constant*, 8> Formals; - for (User::op_iterator i = CI->op_begin() + 1, e = CI->op_end(); + CallSite CS(CI); + for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) Formals.push_back(getVal(Values, *i)); diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp index df2456f..e4db235 100644 --- a/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -85,15 +85,16 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { unsigned NumNonconstant = 0; for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { + User *U = *UI; // Ignore blockaddress uses. - if (isa<BlockAddress>(*UI)) continue; + if (isa<BlockAddress>(U)) continue; // Used by a non-instruction, or not the callee of a function, do not // transform. - if (!isa<CallInst>(*UI) && !isa<InvokeInst>(*UI)) + if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) return false; - CallSite CS = CallSite::get(cast<Instruction>(*UI)); + CallSite CS = CallSite::get(cast<Instruction>(U)); if (!CS.isCallee(UI)) return false; diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index b785bb0..027a220 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -468,7 +468,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // move a call site to a function in this SCC before the // 'FirstCallInSCC' barrier. 
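The GlobalOpt evaluator hunk above swaps a raw `op_begin() + 1` for CallSite's arg_begin()/arg_end(), and LowerSetJmp below makes the same move; CallSite wraps either a CallInst or an InvokeInst and exposes only the argument range, so the callee operand can no longer be walked by accident. A hedged sketch:

#include "llvm/Support/CallSite.h"
using namespace llvm;

static void visitCallArgs(Instruction *I) {
  CallSite CS(I);              // wraps either a CallInst or an InvokeInst
  for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
       AI != AE; ++AI) {
    Value *Arg = *AI;          // arguments only; the callee is excluded
    (void)Arg;                 // ... inspect Arg here ...
  }
}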
if (SCC.isSingular()) { - std::swap(CallSites[CSi], CallSites.back()); + CallSites[CSi] = CallSites.back(); CallSites.pop_back(); } else { CallSites.erase(CallSites.begin()+CSi); diff --git a/lib/Transforms/IPO/LowerSetJmp.cpp b/lib/Transforms/IPO/LowerSetJmp.cpp index 4d61e83..76cfef8 100644 --- a/lib/Transforms/IPO/LowerSetJmp.cpp +++ b/lib/Transforms/IPO/LowerSetJmp.cpp @@ -42,6 +42,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/Support/InstVisitor.h" #include "llvm/Transforms/Utils/Local.h" @@ -262,8 +263,8 @@ void LowerSetJmp::TransformLongJmpCall(CallInst* Inst) // char*. It returns "void", so it doesn't need to replace any of // Inst's uses and doesn't get a name. CastInst* CI = - new BitCastInst(Inst->getOperand(1), SBPTy, "LJBuf", Inst); - Value *Args[] = { CI, Inst->getOperand(2) }; + new BitCastInst(Inst->getArgOperand(0), SBPTy, "LJBuf", Inst); + Value *Args[] = { CI, Inst->getArgOperand(1) }; CallInst::Create(ThrowLongJmp, Args, Args + 2, "", Inst); SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()]; @@ -378,7 +379,7 @@ void LowerSetJmp::TransformSetJmpCall(CallInst* Inst) const Type* SBPTy = Type::getInt8PtrTy(Inst->getContext()); CastInst* BufPtr = - new BitCastInst(Inst->getOperand(1), SBPTy, "SBJmpBuf", Inst); + new BitCastInst(Inst->getArgOperand(0), SBPTy, "SBJmpBuf", Inst); Value *Args[] = { GetSetJmpMap(Func), BufPtr, ConstantInt::get(Type::getInt32Ty(Inst->getContext()), SetJmpIDMap[Func]++) @@ -405,12 +406,14 @@ void LowerSetJmp::TransformSetJmpCall(CallInst* Inst) // Loop over all of the uses of instruction. If any of them are after the // call, "spill" the value to the stack. for (Value::use_iterator UI = II->use_begin(), E = II->use_end(); - UI != E; ++UI) - if (cast<Instruction>(*UI)->getParent() != ABlock || - InstrsAfterCall.count(cast<Instruction>(*UI))) { + UI != E; ++UI) { + User *U = *UI; + if (cast<Instruction>(U)->getParent() != ABlock || + InstrsAfterCall.count(cast<Instruction>(U))) { DemoteRegToStack(*II); break; } + } InstrsAfterCall.clear(); // Change the setjmp call into a branch statement. We'll remove the @@ -473,7 +476,8 @@ void LowerSetJmp::visitCallInst(CallInst& CI) // Construct the new "invoke" instruction. TerminatorInst* Term = OldBB->getTerminator(); - std::vector<Value*> Params(CI.op_begin() + 1, CI.op_end()); + CallSite CS(&CI); + std::vector<Value*> Params(CS.arg_begin(), CS.arg_end()); InvokeInst* II = InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func], Params.begin(), Params.end(), CI.getName(), Term); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 622a9b5..55d5e2a 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -146,7 +146,7 @@ static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { switch(Ty1->getTypeID()) { default: llvm_unreachable("Unknown type!"); - // Fall through in Release-Asserts mode. + // Fall through in Release mode. case Type::IntegerTyID: case Type::OpaqueTyID: // Ty1 == Ty2 would have returned true earlier. 
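The Inliner hunk above is a micro-simplification of the swap-and-pop idiom: when the element being removed is dead anyway, assigning the last element over it and popping is enough, and neither variant preserves order. In isolation:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

template <typename T>
static void unorderedErase(SmallVectorImpl<T> &V, unsigned Idx) {
  V[Idx] = V.back(); // overwrite the dead slot; no swap needed
  V.pop_back();      // O(1), but element order is not preserved
}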
@@ -535,6 +535,7 @@ static LinkageCategory categorize(const Function *F) { case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: case GlobalValue::ExternalWeakLinkage: + case GlobalValue::LinkerPrivateWeakLinkage: return ExternalWeak; case GlobalValue::ExternalLinkage: diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 07525ea..6b9814c 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -66,13 +66,13 @@ Function* PartialInliner::unswitchFunction(Function* F) { return 0; // Clone the function, so that we can hack away on it. - DenseMap<const Value*, Value*> ValueMap; - Function* duplicateFunction = CloneFunction(F, ValueMap); + ValueMap<const Value*, Value*> VMap; + Function* duplicateFunction = CloneFunction(F, VMap); duplicateFunction->setLinkage(GlobalValue::InternalLinkage); F->getParent()->getFunctionList().push_back(duplicateFunction); - BasicBlock* newEntryBlock = cast<BasicBlock>(ValueMap[entryBlock]); - BasicBlock* newReturnBlock = cast<BasicBlock>(ValueMap[returnBlock]); - BasicBlock* newNonReturnBlock = cast<BasicBlock>(ValueMap[nonReturnBlock]); + BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]); + BasicBlock* newReturnBlock = cast<BasicBlock>(VMap[returnBlock]); + BasicBlock* newNonReturnBlock = cast<BasicBlock>(VMap[nonReturnBlock]); // Go ahead and update all uses to the duplicate, so that we can just // use the inliner functionality when we're done hacking. diff --git a/lib/Transforms/IPO/PartialSpecialization.cpp b/lib/Transforms/IPO/PartialSpecialization.cpp index 084b94e..58e1448 100644 --- a/lib/Transforms/IPO/PartialSpecialization.cpp +++ b/lib/Transforms/IPO/PartialSpecialization.cpp @@ -32,6 +32,10 @@ using namespace llvm; STATISTIC(numSpecialized, "Number of specialized functions created"); +STATISTIC(numReplaced, "Number of callers replaced by specialization"); + +// Maximum number of arguments markable as interesting +static const int MaxInterests = 6; // Call must be used at least occasionally static const int CallsMin = 5; @@ -40,8 +44,9 @@ static const int CallsMin = 5; static const double ConstValPercent = .1; namespace { + typedef SmallVector<int, MaxInterests> InterestingArgVector; class PartSpec : public ModulePass { - void scanForInterest(Function&, SmallVector<int, 6>&); + void scanForInterest(Function&, InterestingArgVector&); int scanDistribution(Function&, int, std::map<Constant*, int>&); public : static char ID; // Pass identification, replacement for typeid @@ -59,13 +64,15 @@ X("partialspecialization", "Partial Specialization"); // a call to the specialized function. 
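PartialInlining's switch from DenseMap to ValueMap here (mirrored in PartialSpecialization) tracks a change in CloneFunction's signature: ValueMap installs callbacks so its keys stay coherent if a mapped Value is RAUW'd or deleted while the map is live. A sketch of the cloning pattern, assuming the post-change signature shown in the hunk:

#include "llvm/ADT/ValueMap.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

static Function *cloneForHacking(Function *F) {
  // Unlike DenseMap, ValueMap notices RAUW/deletion of its keys, so the
  // old->new mapping survives mutation of the clone.
  ValueMap<const Value*, Value*> VMap;
  Function *Clone = CloneFunction(F, VMap);        // fills VMap as it copies
  Clone->setLinkage(GlobalValue::InternalLinkage);
  F->getParent()->getFunctionList().push_back(Clone); // not auto-inserted
  return Clone;
}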
Returns the specialized function static Function* SpecializeFunction(Function* F, - DenseMap<const Value*, Value*>& replacements) { + ValueMap<const Value*, Value*>& replacements) { // arg numbers of deleted arguments - DenseSet<unsigned> deleted; - for (DenseMap<const Value*, Value*>::iterator + DenseMap<unsigned, const Argument*> deleted; + for (ValueMap<const Value*, Value*>::iterator repb = replacements.begin(), repe = replacements.end(); - repb != repe; ++repb) - deleted.insert(cast<Argument>(repb->first)->getArgNo()); + repb != repe; ++repb) { + Argument const *arg = cast<const Argument>(repb->first); + deleted[arg->getArgNo()] = arg; + } Function* NF = CloneFunction(F, replacements); NF->setLinkage(GlobalValue::InternalLinkage); @@ -80,9 +87,23 @@ SpecializeFunction(Function* F, if (CS.getCalledFunction() == F) { SmallVector<Value*, 6> args; - for (unsigned x = 0; x < CS.arg_size(); ++x) - if (!deleted.count(x)) - args.push_back(CS.getArgument(x)); + // Assemble the non-specialized arguments for the updated callsite. + // In the process, make sure that the specialized arguments are + // constant and match the specialization. If that's not the case, + // this callsite needs to call the original or some other + // specialization; don't change it here. + CallSite::arg_iterator as = CS.arg_begin(), ae = CS.arg_end(); + for (CallSite::arg_iterator ai = as; ai != ae; ++ai) { + DenseMap<unsigned, const Argument*>::iterator delit = deleted.find( + std::distance(as, ai)); + if (delit == deleted.end()) + args.push_back(cast<Value>(ai)); + else { + Constant *ci = dyn_cast<Constant>(ai); + if (!(ci && ci == replacements[delit->second])) + goto next_use; + } + } Value* NCall; if (CallInst *CI = dyn_cast<CallInst>(i)) { NCall = CallInst::Create(NF, args.begin(), args.end(), @@ -99,8 +120,11 @@ SpecializeFunction(Function* F, } CS.getInstruction()->replaceAllUsesWith(NCall); CS.getInstruction()->eraseFromParent(); + ++numReplaced; } } + next_use: + ; } return NF; } @@ -111,7 +135,7 @@ bool PartSpec::runOnModule(Module &M) { for (Module::iterator I = M.begin(); I != M.end(); ++I) { Function &F = *I; if (F.isDeclaration() || F.mayBeOverridden()) continue; - SmallVector<int, 6> interestingArgs; + InterestingArgVector interestingArgs; scanForInterest(F, interestingArgs); // Find the first interesting Argument that we can specialize on @@ -126,7 +150,7 @@ bool PartSpec::runOnModule(Module &M) { ee = distribution.end(); ii != ee; ++ii) if (total > ii->second && ii->first && ii->second > total * ConstValPercent) { - DenseMap<const Value*, Value*> m; + ValueMap<const Value*, Value*> m; Function::arg_iterator arg = F.arg_begin(); for (int y = 0; y < interestingArgs[x]; ++y) ++arg; @@ -143,7 +167,7 @@ bool PartSpec::runOnModule(Module &M) { /// scanForInterest - This function decides which arguments would be worth /// specializing on. 
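The rewritten call-site walk above indexes each argument by position with std::distance(as, ai), looks that position up in the deleted-argument map, and bails to the next_use label on any mismatch so the call site keeps calling the original. The indexing idiom by itself, as a sketch:

#include "llvm/Support/CallSite.h"
#include <iterator>
using namespace llvm;

static void walkArgsWithIndex(CallSite CS) {
  CallSite::arg_iterator as = CS.arg_begin(), ae = CS.arg_end();
  for (CallSite::arg_iterator ai = as; ai != ae; ++ai) {
    unsigned ArgNo = (unsigned)std::distance(as, ai); // position of *ai
    (void)ArgNo; // look ArgNo up in a map of specialized argument slots
  }
}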
-void PartSpec::scanForInterest(Function& F, SmallVector<int, 6>& args) { +void PartSpec::scanForInterest(Function& F, InterestingArgVector& args) { for(Function::arg_iterator ii = F.arg_begin(), ee = F.arg_end(); ii != ee; ++ii) { for(Value::use_iterator ui = ii->use_begin(), ue = ii->use_end(); diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 6bc8e66..12e8db8 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -73,6 +73,19 @@ namespace { AU.setPreservesAll(); } }; + + class StripDeadDebugInfo : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + explicit StripDeadDebugInfo() + : ModulePass(&ID) {} + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; } char StripSymbols::ID = 0; @@ -99,6 +112,14 @@ ModulePass *llvm::createStripDebugDeclarePass() { return new StripDebugDeclare(); } +char StripDeadDebugInfo::ID = 0; +static RegisterPass<StripDeadDebugInfo> +A("strip-dead-debug-info", "Strip debug info for unused symbols"); + +ModulePass *llvm::createStripDeadDebugInfoPass() { + return new StripDeadDebugInfo(); +} + /// OnlyUsedBy - Return true if V is only used by Usr. static bool OnlyUsedBy(Value *V, Value *Usr) { for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { @@ -223,27 +244,27 @@ static bool StripDebugInfo(Module &M) { Changed = true; } - NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv"); - if (NMD) { - Changed = true; - NMD->eraseFromParent(); - } - - NMD = M.getNamedMetadata("llvm.dbg.lv"); - if (NMD) { - Changed = true; - NMD->eraseFromParent(); + for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), + NME = M.named_metadata_end(); NMI != NME;) { + NamedMDNode *NMD = NMI; + ++NMI; + if (NMD->getName().startswith("llvm.dbg.")) { + NMD->eraseFromParent(); + Changed = true; + } } - + unsigned MDDbgKind = M.getMDKindID("dbg"); - for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) + for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; ++FI) for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; - ++BI) + ++BI) { + Changed = true; // FIXME: Only set if there was debug metadata. BI->setMetadata(MDDbgKind, 0); + } - return true; + return Changed; } bool StripSymbols::runOnModule(Module &M) { @@ -266,8 +287,8 @@ bool StripDebugDeclare::runOnModule(Module &M) { if (Declare) { while (!Declare->use_empty()) { CallInst *CI = cast<CallInst>(Declare->use_back()); - Value *Arg1 = CI->getOperand(1); - Value *Arg2 = CI->getOperand(2); + Value *Arg1 = CI->getArgOperand(0); + Value *Arg2 = CI->getArgOperand(1); assert(CI->use_empty() && "llvm.dbg intrinsic should have void result"); CI->eraseFromParent(); if (Arg1->use_empty()) { @@ -295,3 +316,83 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } + +/// getRealLinkageName - If special LLVM prefix that is used to inform the asm +/// printer to not emit usual symbol prefix before the symbol name is used then +/// return linkage name after skipping this special LLVM prefix. 
+static StringRef getRealLinkageName(StringRef LinkageName) { + char One = '\1'; + if (LinkageName.startswith(StringRef(&One, 1))) + return LinkageName.substr(1); + return LinkageName; +} + +bool StripDeadDebugInfo::runOnModule(Module &M) { + bool Changed = false; + + // Debugging information is encoded in LLVM IR using metadata. This is designed + // in such a way that debug info for symbols is preserved even if the symbols are + // optimized away by the optimizer. This special pass removes debug info for + // such symbols. + + // llvm.dbg.gv keeps track of debug info for global variables. + if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) { + SmallVector<MDNode *, 8> MDs; + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + if (DIGlobalVariable(NMD->getOperand(i)).Verify()) + MDs.push_back(NMD->getOperand(i)); + else + Changed = true; + NMD->eraseFromParent(); + NMD = NULL; + + for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), + E = MDs.end(); I != E; ++I) { + if (M.getGlobalVariable(DIGlobalVariable(*I).getGlobal()->getName(), + true)) { + if (!NMD) + NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); + NMD->addOperand(*I); + } + else + Changed = true; + } + } + + // llvm.dbg.sp keeps track of debug info for subprograms. + if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) { + SmallVector<MDNode *, 8> MDs; + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + if (DISubprogram(NMD->getOperand(i)).Verify()) + MDs.push_back(NMD->getOperand(i)); + else + Changed = true; + NMD->eraseFromParent(); + NMD = NULL; + + for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), + E = MDs.end(); I != E; ++I) { + bool FnIsLive = false; + if (Function *F = DISubprogram(*I).getFunction()) + if (M.getFunction(F->getName())) + FnIsLive = true; + if (FnIsLive) { + if (!NMD) + NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); + NMD->addOperand(*I); + } else { + // Remove the llvm.dbg.lv.fnname named mdnode which may have been used + // to hold debug info for the dead function's local variables. + StringRef FName = DISubprogram(*I).getLinkageName(); + if (FName.empty()) + FName = DISubprogram(*I).getName(); + if (NamedMDNode *LVNMD = + M.getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(FName)))) + LVNMD->eraseFromParent(); + } + } + } + + return Changed; +} diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp index 473e83c..a74686f 100644 --- a/lib/Transforms/IPO/StructRetPromotion.cpp +++ b/lib/Transforms/IPO/StructRetPromotion.cpp @@ -107,12 +107,12 @@ CallGraphNode *SRETPromotion::PromoteReturn(CallGraphNode *CGN) { // Check if it is ok to perform this promotion. if (isSafeToUpdateAllCallers(F) == false) { DEBUG(dbgs() << "SretPromotion: Not all callers can be updated\n"); - NumRejectedSRETUses++; + ++NumRejectedSRETUses; return 0; } DEBUG(dbgs() << "SretPromotion: sret argument will be promoted\n"); - NumSRET++; + ++NumSRET; // [1] Replace use of sret parameter AllocaInst *TheAlloca = new AllocaInst(STy, NULL, "mrv", F->getEntryBlock().begin()); @@ -171,16 +171,16 @@ bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) { // Check FirstArg's users. for (Value::use_iterator ArgI = FirstArg->use_begin(), ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) { - + User *U = *ArgI; // If FirstArg user is a CallInst that does not correspond to current // call site then this function F is not suitable for sret promotion. 
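The new StripDeadDebugInfo pass above is wired up with the era's standard boilerplate: a static char ID whose address identifies the pass, a static RegisterPass object that provides the -strip-dead-debug-info command-line name, and a factory function. The skeleton in isolation (pass name hypothetical):

#include "llvm/Pass.h"
#include "llvm/Module.h"
using namespace llvm;

namespace {
  struct ExamplePass : public ModulePass {
    static char ID;                 // identity is the *address* of ID
    ExamplePass() : ModulePass(&ID) {}
    virtual bool runOnModule(Module &M) { return false; }
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesAll();         // this pass invalidates no analyses
    }
  };
}

char ExamplePass::ID = 0;
static RegisterPass<ExamplePass> X("example-pass", "An example module pass");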
- if (CallInst *CI = dyn_cast<CallInst>(ArgI)) { + if (CallInst *CI = dyn_cast<CallInst>(U)) { if (CI != Call) return false; } // If FirstArg user is a GEP whose all users are not LoadInst then // this function F is not suitable for sret promotion. - else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(ArgI)) { + else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { // TODO : Use dom info and insert PHINodes to collect get results // from multiple call sites for this GEP. if (GEP->getParent() != Call->getParent()) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index c7b04a4..24e0528 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -178,7 +178,8 @@ public: Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); - Instruction *visitFree(Instruction &FI); + Instruction *visitMalloc(Instruction &FI); + Instruction *visitFree(CallInst &FI); Instruction *visitLoadInst(LoadInst &LI); Instruction *visitStoreInst(StoreInst &SI); Instruction *visitBranchInst(BranchInst &BI); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 8586054..3f4a857 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1584,6 +1584,19 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if ((match(A, m_Not(m_Specific(B))) && match(D, m_Not(m_Specific(C))))) return BinaryOperator::CreateXor(C, B); + + // ((A|B)&1)|(B&-2) -> (A&1) | B + if (match(A, m_Or(m_Value(V1), m_Specific(B))) || + match(A, m_Or(m_Specific(B), m_Value(V1)))) { + Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C); + if (Ret) return Ret; + } + // (B&-2)|((A|B)&1) -> (A&1) | B + if (match(B, m_Or(m_Specific(A), m_Value(V1))) || + match(B, m_Or(m_Value(V1), m_Specific(A)))) { + Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D); + if (Ret) return Ret; + } } // (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts. 
@@ -1599,19 +1612,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } - // ((A|B)&1)|(B&-2) -> (A&1) | B - if (match(Op0, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) || - match(Op0, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) { - Instruction *Ret = FoldOrWithConstants(I, Op1, A, B, C); - if (Ret) return Ret; - } - // (B&-2)|((A|B)&1) -> (A&1) | B - if (match(Op1, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) || - match(Op1, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) { - Instruction *Ret = FoldOrWithConstants(I, Op0, A, B, C); - if (Ret) return Ret; - } - // (~A | ~B) == (~(A & B)) - De Morgan's Law if (Value *Op0NotVal = dyn_castNotVal(Op0)) if (Value *Op1NotVal = dyn_castNotVal(Op1)) diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 38e7b6e..85251a8 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -112,8 +112,8 @@ unsigned InstCombiner::GetOrEnforceKnownAlignment(Value *V, } Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { - unsigned DstAlign = GetOrEnforceKnownAlignment(MI->getOperand(1)); - unsigned SrcAlign = GetOrEnforceKnownAlignment(MI->getOperand(2)); + unsigned DstAlign = GetOrEnforceKnownAlignment(MI->getArgOperand(0)); + unsigned SrcAlign = GetOrEnforceKnownAlignment(MI->getArgOperand(1)); unsigned MinAlign = std::min(DstAlign, SrcAlign); unsigned CopyAlign = MI->getAlignment(); @@ -125,7 +125,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with // load/store. - ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getOperand(3)); + ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); if (MemOpLength == 0) return 0; // Source and destination pointer types are always "i8*" for intrinsic. See @@ -140,9 +140,9 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = - cast<PointerType>(MI->getOperand(2)->getType())->getAddressSpace(); + cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); unsigned DstAddrSp = - cast<PointerType>(MI->getOperand(1)->getType())->getAddressSpace(); + cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); const IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); @@ -154,8 +154,8 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // an i64 load+store, here because this improves the odds that the source or // dest address will be promotable. See if we can find a better type than the // integer datatype. 
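The visitOr change relocating these folds also tightens the patterns: inside the new location A and B are already bound, so the fully general m_Value matches are replaced with m_Specific, which only matches the exact value already captured, and each check issues two match() calls to cover both commutations of the or. The combinators in isolation, as a sketch:

#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is (A | B) or (B | A) for the already-known B,
// capturing the other operand into A.
static bool matchOrWith(Value *V, Value *B, Value *&A) {
  return match(V, m_Or(m_Value(A), m_Specific(B))) ||
         match(V, m_Or(m_Specific(B), m_Value(A)));
}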
- Value *StrippedDest = MI->getOperand(1)->stripPointerCasts(); - if (StrippedDest != MI->getOperand(1)) { + Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); + if (StrippedDest != MI->getArgOperand(0)) { const Type *SrcETy = cast<PointerType>(StrippedDest->getType()) ->getElementType(); if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) { @@ -189,15 +189,15 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { SrcAlign = std::max(SrcAlign, CopyAlign); DstAlign = std::max(DstAlign, CopyAlign); - Value *Src = Builder->CreateBitCast(MI->getOperand(2), NewSrcPtrTy); - Value *Dest = Builder->CreateBitCast(MI->getOperand(1), NewDstPtrTy); + Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); + Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); Instruction *L = new LoadInst(Src, "tmp", MI->isVolatile(), SrcAlign); InsertNewInstBefore(L, *MI); InsertNewInstBefore(new StoreInst(L, Dest, MI->isVolatile(), DstAlign), *MI); // Set the size of the copy to 0, it will be deleted on the next iteration. - MI->setOperand(3, Constant::getNullValue(MemOpLength->getType())); + MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); return MI; } @@ -250,6 +250,8 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (isFreeCall(&CI)) return visitFree(CI); + if (isMalloc(&CI)) + return visitMalloc(CI); // If the caller function is nounwind, mark the call as nounwind, even if the // callee isn't. @@ -261,7 +263,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); if (!II) return visitCallSite(&CI); - + // Intrinsics cannot occur in an invoke, so handle them here instead of in // visitCallSite. if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) { @@ -287,11 +289,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (GVSrc->isConstant()) { Module *M = CI.getParent()->getParent()->getParent(); Intrinsic::ID MemCpyID = Intrinsic::memcpy; - const Type *Tys[3] = { CI.getOperand(1)->getType(), - CI.getOperand(2)->getType(), - CI.getOperand(3)->getType() }; - CI.setCalledFunction( - Intrinsic::getDeclaration(M, MemCpyID, Tys, 3)); + const Type *Tys[3] = { CI.getArgOperand(0)->getType(), + CI.getArgOperand(1)->getType(), + CI.getArgOperand(2)->getType() }; + CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys, 3)); Changed = true; } } @@ -311,7 +312,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Instruction *I = SimplifyMemSet(MSI)) return I; } - + if (Changed) return II; } @@ -322,10 +323,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (!TD) break; const Type *ReturnTy = CI.getType(); - bool Min = (cast<ConstantInt>(II->getOperand(2))->getZExtValue() == 1); + bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); // Get to the real allocated thing and offset as fast as possible. - Value *Op1 = II->getOperand(1)->stripPointerCasts(); + Value *Op1 = II->getArgOperand(0)->stripPointerCasts(); // If we've stripped down to a single global variable that we // can know the size of then just return that. @@ -393,7 +394,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Constant *RetVal = ConstantInt::get(ReturnTy, Size-Offset); return ReplaceInstUsesWith(CI, RetVal); - } // Do not return "I don't know" here. 
Later optimization passes could @@ -402,45 +402,45 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } case Intrinsic::bswap: // bswap(bswap(x)) -> x - if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getOperand(1))) + if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) if (Operand->getIntrinsicID() == Intrinsic::bswap) - return ReplaceInstUsesWith(CI, Operand->getOperand(1)); + return ReplaceInstUsesWith(CI, Operand->getArgOperand(0)); // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) - if (TruncInst *TI = dyn_cast<TruncInst>(II->getOperand(1))) { + if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) { if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0))) if (Operand->getIntrinsicID() == Intrinsic::bswap) { unsigned C = Operand->getType()->getPrimitiveSizeInBits() - TI->getType()->getPrimitiveSizeInBits(); Value *CV = ConstantInt::get(Operand->getType(), C); - Value *V = Builder->CreateLShr(Operand->getOperand(1), CV); + Value *V = Builder->CreateLShr(Operand->getArgOperand(0), CV); return new TruncInst(V, TI->getType()); } } break; case Intrinsic::powi: - if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getOperand(2))) { + if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 if (Power->isZero()) return ReplaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0)); // powi(x, 1) -> x if (Power->isOne()) - return ReplaceInstUsesWith(CI, II->getOperand(1)); + return ReplaceInstUsesWith(CI, II->getArgOperand(0)); // powi(x, -1) -> 1/x if (Power->isAllOnesValue()) return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), - II->getOperand(1)); + II->getArgOperand(0)); } break; case Intrinsic::cttz: { // If all bits below the first known one are known zero, // this value is constant. - const IntegerType *IT = cast<IntegerType>(II->getOperand(1)->getType()); + const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getOperand(1), APInt::getAllOnesValue(BitWidth), + ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); @@ -453,11 +453,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ctlz: { // If all bits above the first known one are known zero, // this value is constant. 
- const IntegerType *IT = cast<IntegerType>(II->getOperand(1)->getType()); + const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getOperand(1), APInt::getAllOnesValue(BitWidth), + ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); @@ -468,8 +468,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; case Intrinsic::uadd_with_overflow: { - Value *LHS = II->getOperand(1), *RHS = II->getOperand(2); - const IntegerType *IT = cast<IntegerType>(II->getOperand(1)->getType()); + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); APInt Mask = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0); @@ -513,19 +513,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // FALL THROUGH uadd into sadd case Intrinsic::sadd_with_overflow: // Canonicalize constants into the RHS. - if (isa<Constant>(II->getOperand(1)) && - !isa<Constant>(II->getOperand(2))) { - Value *LHS = II->getOperand(1); - II->setOperand(1, II->getOperand(2)); - II->setOperand(2, LHS); + if (isa<Constant>(II->getArgOperand(0)) && + !isa<Constant>(II->getArgOperand(1))) { + Value *LHS = II->getArgOperand(0); + II->setArgOperand(0, II->getArgOperand(1)); + II->setArgOperand(1, LHS); return II; } // X + undef -> undef - if (isa<UndefValue>(II->getOperand(2))) + if (isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getOperand(2))) { + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X + 0 -> {X, false} if (RHS->isZero()) { Constant *V[] = { @@ -533,7 +533,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { ConstantInt::getFalse(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); - return InsertValueInst::Create(Struct, II->getOperand(1), 0); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } } break; @@ -541,38 +541,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ssub_with_overflow: // undef - X -> undef // X - undef -> undef - if (isa<UndefValue>(II->getOperand(1)) || - isa<UndefValue>(II->getOperand(2))) + if (isa<UndefValue>(II->getArgOperand(0)) || + isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getOperand(2))) { + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X - 0 -> {X, false} if (RHS->isZero()) { Constant *V[] = { - UndefValue::get(II->getOperand(1)->getType()), + UndefValue::get(II->getArgOperand(0)->getType()), ConstantInt::getFalse(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); - return InsertValueInst::Create(Struct, II->getOperand(1), 0); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } } break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: // Canonicalize constants into the RHS. 
- if (isa<Constant>(II->getOperand(1)) && - !isa<Constant>(II->getOperand(2))) { - Value *LHS = II->getOperand(1); - II->setOperand(1, II->getOperand(2)); - II->setOperand(2, LHS); + if (isa<Constant>(II->getArgOperand(0)) && + !isa<Constant>(II->getArgOperand(1))) { + Value *LHS = II->getArgOperand(0); + II->setArgOperand(0, II->getArgOperand(1)); + II->setArgOperand(1, LHS); return II; } // X * undef -> undef - if (isa<UndefValue>(II->getOperand(2))) + if (isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getOperand(2))) { + if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X*0 -> {0, false} if (RHSI->isZero()) return ReplaceInstUsesWith(CI, Constant::getNullValue(II->getType())); @@ -580,11 +580,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X * 1 -> {X, false} if (RHSI->equalsInt(1)) { Constant *V[] = { - UndefValue::get(II->getOperand(1)->getType()), + UndefValue::get(II->getArgOperand(0)->getType()), ConstantInt::getFalse(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); - return InsertValueInst::Create(Struct, II->getOperand(1), 0); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } } break; @@ -595,8 +595,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_loadu_dq: // Turn PPC lvx -> load if the pointer is known aligned. // Turn X86 loadups -> load if the pointer is known aligned. - if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) { - Value *Ptr = Builder->CreateBitCast(II->getOperand(1), + if (GetOrEnforceKnownAlignment(II->getArgOperand(0), 16) >= 16) { + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); return new LoadInst(Ptr); } @@ -604,22 +604,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. - if (GetOrEnforceKnownAlignment(II->getOperand(2), 16) >= 16) { + if (GetOrEnforceKnownAlignment(II->getArgOperand(1), 16) >= 16) { const Type *OpPtrTy = - PointerType::getUnqual(II->getOperand(1)->getType()); - Value *Ptr = Builder->CreateBitCast(II->getOperand(2), OpPtrTy); - return new StoreInst(II->getOperand(1), Ptr); + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); } break; case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. - if (GetOrEnforceKnownAlignment(II->getOperand(1), 16) >= 16) { + if (GetOrEnforceKnownAlignment(II->getArgOperand(0), 16) >= 16) { const Type *OpPtrTy = - PointerType::getUnqual(II->getOperand(2)->getType()); - Value *Ptr = Builder->CreateBitCast(II->getOperand(1), OpPtrTy); - return new StoreInst(II->getOperand(2), Ptr); + PointerType::getUnqual(II->getArgOperand(1)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); + return new StoreInst(II->getArgOperand(1), Ptr); } break; @@ -627,12 +627,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // These intrinsics only demands the 0th element of its input vector. If // we can simplify the input based on that, do so now. 
unsigned VWidth = - cast<VectorType>(II->getOperand(1)->getType())->getNumElements(); + cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); APInt DemandedElts(VWidth, 1); APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts, + if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, UndefElts)) { - II->setOperand(1, V); + II->setArgOperand(0, V); return II; } break; @@ -640,7 +640,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. - if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) { + if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) { assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!"); // Check that all of the elements are integer constants or undefs. @@ -655,8 +655,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (AllEltsOk) { // Cast the input vectors to byte vectors. - Value *Op0 = Builder->CreateBitCast(II->getOperand(1), Mask->getType()); - Value *Op1 = Builder->CreateBitCast(II->getOperand(2), Mask->getType()); + Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), Mask->getType()); + Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), Mask->getType()); Value *Result = UndefValue::get(Op0->getType()); // Only extract each element once. @@ -689,7 +689,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. - if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) { + if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { if (SS->getIntrinsicID() == Intrinsic::stacksave) { BasicBlock::iterator BI = SS; if (&*++BI == II) @@ -772,13 +772,13 @@ protected: NewInstruction = IC->ReplaceInstUsesWith(*CI, With); } bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const { - if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getOperand(SizeCIOp))) { + if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp - CallInst::ArgOffset))) { if (SizeCI->isAllOnesValue()) return true; if (isString) return SizeCI->getZExtValue() >= - GetStringLength(CI->getOperand(SizeArgOp)); - if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getOperand(SizeArgOp))) + GetStringLength(CI->getArgOperand(SizeArgOp - CallInst::ArgOffset)); + if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getArgOperand(SizeArgOp - CallInst::ArgOffset))) return SizeCI->getZExtValue() >= Arg->getZExtValue(); } return false; @@ -846,7 +846,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), CS.getInstruction()); - // If CS dues not return void then replaceAllUsesWith undef. + // If CS does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. 
if (!CS.getInstruction()->getType()->isVoidTy()) CS.getInstruction()-> @@ -1140,7 +1140,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { IntrinsicInst *Tramp = cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0)); - Function *NestF = cast<Function>(Tramp->getOperand(2)->stripPointerCasts()); + Function *NestF = cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); const PointerType *NestFPTy = cast<PointerType>(NestF->getType()); const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType()); @@ -1181,7 +1181,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { do { if (Idx == NestIdx) { // Add the chain argument and attributes. - Value *NestVal = Tramp->getOperand(3); + Value *NestVal = Tramp->getArgOperand(2); if (NestVal->getType() != NestTy) NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller); NewArgs.push_back(NestVal); diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index b0137c4..505a0bf 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -22,19 +22,18 @@ using namespace PatternMatch; /// X*Scale+Offset. /// static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, - int &Offset) { - assert(Val->getType()->isIntegerTy(32) && "Unexpected allocation size type!"); + uint64_t &Offset) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { Offset = CI->getZExtValue(); Scale = 0; - return ConstantInt::get(Type::getInt32Ty(Val->getContext()), 0); + return ConstantInt::get(Val->getType(), 0); } if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) { if (I->getOpcode() == Instruction::Shl) { // This is a value scaled by '1 << the shift amt'. - Scale = 1U << RHS->getZExtValue(); + Scale = UINT64_C(1) << RHS->getZExtValue(); Offset = 0; return I->getOperand(0); } @@ -100,7 +99,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // See if we can satisfy the modulus by pulling a scale out of the array // size argument. unsigned ArraySizeScale; - int ArrayOffset; + uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); @@ -114,13 +113,13 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, if (Scale == 1) { Amt = NumElements; } else { - Amt = ConstantInt::get(Type::getInt32Ty(CI.getContext()), Scale); + Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale); // Insert before the alloca, not before the cast. 
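The DecomposeSimpleLinearExpr hunk above widens Scale and Offset to 64 bits, and the `1U << shift` → `UINT64_C(1) << shift` change is the load-bearing part: with a 32-bit 1U, a shift amount of 32 or more is undefined behavior, while the 64-bit constant keeps every shift up to 63 well defined. Reduced to a line:

#define __STDC_CONSTANT_MACROS   // C++03 needs this before <stdint.h>
#include <stdint.h>

uint64_t scaleOfShl(unsigned ShAmt) {
  // 1U << 40 would be UB (32-bit left operand); UINT64_C(1) is 64-bit.
  return UINT64_C(1) << ShAmt;   // well defined for ShAmt <= 63
}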
Amt = AllocaBuilder.CreateMul(Amt, NumElements, "tmp"); } - if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { - Value *Off = ConstantInt::get(Type::getInt32Ty(CI.getContext()), + if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { + Value *Off = ConstantInt::get(AI.getArraySize()->getType(), Offset, true); Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp"); } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 861cf92..6c00586 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1423,7 +1423,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, switch (II->getIntrinsicID()) { case Intrinsic::bswap: Worklist.Add(II); - ICI.setOperand(0, II->getOperand(1)); + ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap())); return &ICI; case Intrinsic::ctlz: @@ -1431,7 +1431,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // ctz(A) == bitwidth(a) -> A == 0 and likewise for != if (RHSV == RHS->getType()->getBitWidth()) { Worklist.Add(II); - ICI.setOperand(0, II->getOperand(1)); + ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0)); return &ICI; } @@ -1440,13 +1440,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // popcount(A) == 0 -> A == 0 and likewise for != if (RHS->isZero()) { Worklist.Add(II); - ICI.setOperand(0, II->getOperand(1)); + ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, RHS); return &ICI; } break; default: - break; + break; } } } @@ -1924,35 +1924,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } break; } - case Instruction::Call: - // If we have (malloc != null), and if the malloc has a single use, we - // can assume it is successful and remove the malloc. - if (isMalloc(LHSI) && LHSI->hasOneUse() && - isa<ConstantPointerNull>(RHSC)) { - // Need to explicitly erase malloc call here, instead of adding it to - // Worklist, because it won't get DCE'd from the Worklist since - // isInstructionTriviallyDead() returns false for function calls. - // It is OK to replace LHSI/MallocCall with Undef because the - // instruction that uses it will be erased via Worklist. - if (extractMallocCall(LHSI)) { - LHSI->replaceAllUsesWith(UndefValue::get(LHSI->getType())); - EraseInstFromFunction(*LHSI); - return ReplaceInstUsesWith(I, - ConstantInt::get(Type::getInt1Ty(I.getContext()), - !I.isTrueWhenEqual())); - } - if (CallInst* MallocCall = extractMallocCallFromBitCast(LHSI)) - if (MallocCall->hasOneUse()) { - MallocCall->replaceAllUsesWith( - UndefValue::get(MallocCall->getType())); - EraseInstFromFunction(*MallocCall); - Worklist.Add(LHSI); // The malloc's bitcast use. 
- return ReplaceInstUsesWith(I, - ConstantInt::get(Type::getInt1Ty(I.getContext()), - !I.isTrueWhenEqual())); - } - } - break; case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 if (RHSC->isNullValue() && TD && diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0f2a24f..8933a0b 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -13,6 +13,7 @@ #include "InstCombine.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -22,6 +23,18 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { + // Ensure that the alloca array size argument has type intptr_t, so that + // any casting is exposed early. + if (TD) { + const Type *IntPtrTy = TD->getIntPtrType(AI.getContext()); + if (AI.getArraySize()->getType() != IntPtrTy) { + Value *V = Builder->CreateIntCast(AI.getArraySize(), + IntPtrTy, false); + AI.setOperand(0, V); + return &AI; + } + } + // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 if (AI.isArrayAllocation()) { // Check C != 1 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { @@ -352,10 +365,11 @@ DbgDeclareInst *InstCombiner::hasOneUsePlusDeclare(Value *V) { return 0; for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) { - if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI)) + User *U = *UI; + if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(U)) return DI; - if (isa<BitCastInst>(UI) && UI->hasOneUse()) { - if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(UI->use_begin())) + if (isa<BitCastInst>(U) && U->hasOneUse()) { + if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(U->use_begin())) return DI; } } @@ -511,17 +525,20 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // Determine whether Dest has exactly two predecessors and, if so, compute // the other predecessor. pred_iterator PI = pred_begin(DestBB); + BasicBlock *P = *PI; BasicBlock *OtherBB = 0; - if (*PI != StoreBB) - OtherBB = *PI; - ++PI; - if (PI == pred_end(DestBB)) + + if (P != StoreBB) + OtherBB = P; + + if (++PI == pred_end(DestBB)) return false; - if (*PI != StoreBB) { + P = *PI; + if (P != StoreBB) { if (OtherBB) return false; - OtherBB = *PI; + OtherBB = P; } if (++PI != pred_end(DestBB)) return false; diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 65f0393..f7fc62f 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -230,8 +230,9 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { bool isAddressTaken = false; for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) { - if (isa<LoadInst>(UI)) continue; - if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + User *U = *UI; + if (isa<LoadInst>(U)) continue; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // If storing TO the alloca, then the address isn't taken. 
if (SI->getOperand(1) == AI) continue; } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index c958cde..c44fe9d 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -329,6 +329,37 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1 + // and (X <s 0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1 + // FIXME: Type and constness constraints could be lifted, but we have to + // watch code size carefully. We should consider xor instead of + // sub/add when we decide to do that. + if (const IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) { + if (TrueVal->getType() == Ty) { + if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) { + ConstantInt *C1 = NULL, *C2 = NULL; + if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) { + C1 = dyn_cast<ConstantInt>(TrueVal); + C2 = dyn_cast<ConstantInt>(FalseVal); + } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isNullValue()) { + C1 = dyn_cast<ConstantInt>(FalseVal); + C2 = dyn_cast<ConstantInt>(TrueVal); + } + if (C1 && C2) { + // This shift results in either -1 or 0. + Value *AShr = Builder->CreateAShr(CmpLHS, Ty->getBitWidth()-1); + + // Check if we can express the operation with a single or. + if (C2->isAllOnesValue()) + return ReplaceInstUsesWith(SI, Builder->CreateOr(AShr, C1)); + + Value *And = Builder->CreateAnd(AShr, C2->getValue()-C1->getValue()); + return ReplaceInstUsesWith(SI, Builder->CreateAdd(And, C1)); + } + } + } + } + if (CmpLHS == TrueVal && CmpRHS == FalseVal) { // Transform (X == Y) ? X : Y -> Y if (Pred == ICmpInst::ICMP_EQ) diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 836bda3..e5ce8a6 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -404,7 +404,7 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == Op1C->getZExtValue()){ bool isCtPop = II->getIntrinsicID() == Intrinsic::ctpop; Constant *RHS = ConstantInt::getSigned(Op0->getType(), isCtPop ? -1:0); - Value *Cmp = Builder->CreateICmpEQ(II->getOperand(1), RHS); + Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS); return new ZExtInst(Cmp, II->getType()); } } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index cd41844..adf7a76 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -732,10 +732,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // the right place. 
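The new InstCombineSelect transform above rests on a classic bit trick: for 32-bit X, the arithmetic shift X >>s 31 is 0 when X is non-negative and all-ones when X is negative, so ANDing it with (C2 - C1) and adding C1 selects between the two constants without a branch; when C2 is all-ones the and/add pair collapses to the single 'or' the code special-cases. A standalone sketch, assuming >> on a negative int is an arithmetic shift (which the IR form guarantees and mainstream compilers provide):

    #include <cassert>
    #include <cstdint>

    static int32_t select_form(int32_t x, int32_t c1, int32_t c2) {
      return (x > -1) ? c1 : c2;
    }

    static int32_t branchless_form(int32_t x, int32_t c1, int32_t c2) {
      uint32_t mask = (uint32_t)(x >> 31);           // 0 or 0xFFFFFFFF
      uint32_t diff = (uint32_t)c2 - (uint32_t)c1;   // C2 - C1, wrapping
      return (int32_t)((mask & diff) + (uint32_t)c1);
    }

    int main() {
      const int32_t xs[] = { INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX };
      for (unsigned i = 0; i != 7; ++i)
        assert(select_form(xs[i], 10, 99) == branchless_form(xs[i], 10, 99));
      return 0;
    }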
Instruction *NewVal; if (InputBit > ResultBit) - NewVal = BinaryOperator::CreateLShr(I->getOperand(1), + NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), ConstantInt::get(I->getType(), InputBit-ResultBit)); else - NewVal = BinaryOperator::CreateShl(I->getOperand(1), + NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), ConstantInt::get(I->getType(), ResultBit-InputBit)); NewVal->takeName(I); return InsertNewInstBefore(NewVal, *I); @@ -1052,12 +1052,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::x86_sse2_mul_sd: case Intrinsic::x86_sse2_min_sd: case Intrinsic::x86_sse2_max_sd: - TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts, + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, UndefElts, Depth+1); - if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; } - TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts, + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, UndefElts2, Depth+1); - if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; } + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } // If only the low elt is demanded and this is a scalarizable intrinsic, // scalarize it now. @@ -1069,8 +1069,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::x86_sse2_sub_sd: case Intrinsic::x86_sse2_mul_sd: // TODO: Lower MIN/MAX/ABS/etc - Value *LHS = II->getOperand(1); - Value *RHS = II->getOperand(2); + Value *LHS = II->getArgOperand(0); + Value *RHS = II->getArgOperand(1); // Extract the element as scalars. LHS = InsertNewInstBefore(ExtractElementInst::Create(LHS, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index af9ec5c..af2958f 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -710,8 +710,55 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { return 0; } -Instruction *InstCombiner::visitFree(Instruction &FI) { - Value *Op = FI.getOperand(1); + + +static bool IsOnlyNullComparedAndFreed(const Value &V) { + for (Value::const_use_iterator UI = V.use_begin(), UE = V.use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (isFreeCall(U)) + continue; + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(U)) + if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) + continue; + return false; + } + return true; +} + +Instruction *InstCombiner::visitMalloc(Instruction &MI) { + // If we have a malloc call which is only used in any amount of comparisons + // to null and free calls, delete the calls and replace the comparisons with + // true or false as appropriate. + if (IsOnlyNullComparedAndFreed(MI)) { + for (Value::use_iterator UI = MI.use_begin(), UE = MI.use_end(); + UI != UE;) { + // We can assume that every remaining use is a free call or an icmp eq/ne + // to null, so the cast is safe. + Instruction *I = cast<Instruction>(*UI); + + // Early increment here, as we're about to get rid of the user. + ++UI; + + if (isFreeCall(I)) { + EraseInstFromFunction(*cast<CallInst>(I)); + continue; + } + // Again, the cast is safe. 
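The visitMalloc/IsOnlyNullComparedAndFreed pair added above generalizes the two special cases removed elsewhere in this patch (the malloc-vs-null icmp fold and the malloc-then-free fold): an allocation whose only uses are free calls and equality comparisons against null can be deleted outright, with each comparison folded under the same optimistic assumption the old code made, namely that the allocation succeeded. A hypothetical source-level view of what the transform achieves, not code from the patch:

    #include <cstdlib>

    int was_allocated(std::size_t n) {
      void *p = std::malloc(n);    // used only by the icmp and free below
      int ok = (p != NULL);        // icmp ne null: folds via isFalseWhenEqual()
      std::free(p);                // erased together with the malloc
      return ok;                   // the function reduces to 'return 1'
    }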
+ ICmpInst *C = cast<ICmpInst>(I); + ReplaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), + C->isFalseWhenEqual())); + EraseInstFromFunction(*C); + } + return EraseInstFromFunction(MI); + } + return 0; +} + + + +Instruction *InstCombiner::visitFree(CallInst &FI) { + Value *Op = FI.getArgOperand(0); // free undef -> unreachable. if (isa<UndefValue>(Op)) { @@ -726,23 +773,6 @@ Instruction *InstCombiner::visitFree(Instruction &FI) { if (isa<ConstantPointerNull>(Op)) return EraseInstFromFunction(FI); - // If we have a malloc call whose only use is a free call, delete both. - if (isMalloc(Op)) { - if (CallInst* CI = extractMallocCallFromBitCast(Op)) { - if (Op->hasOneUse() && CI->hasOneUse()) { - EraseInstFromFunction(FI); - EraseInstFromFunction(*CI); - return EraseInstFromFunction(*cast<Instruction>(Op)); - } - } else { - // Op is a call to malloc - if (Op->hasOneUse()) { - EraseInstFromFunction(FI); - return EraseInstFromFunction(*cast<Instruction>(Op)); - } - } - } - return 0; } @@ -896,7 +926,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { // We're extracting from an intrinsic, see if we're the only user, which // allows us to simplify multiple result intrinsics to simpler things that - // just get one value.. + // just get one value. if (II->hasOneUse()) { // Check if we're grabbing the overflow bit or the result of a 'with // overflow' intrinsic. If it's the latter we can remove the intrinsic @@ -905,7 +935,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. - Value *LHS = II->getOperand(1), *RHS = II->getOperand(2); + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); II->replaceAllUsesWith(UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateAdd(LHS, RHS); @@ -914,7 +944,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. - Value *LHS = II->getOperand(1), *RHS = II->getOperand(2); + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); II->replaceAllUsesWith(UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateSub(LHS, RHS); @@ -923,7 +953,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. 
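The visitExtractValueInst folds above drop a *.with.overflow intrinsic when only element 0 of its {result, overflow} pair is consumed, since that element is just the wrapping operation on the two arguments (now fetched with getArgOperand). A standalone analogue for the unsigned-add case:

    #include <cassert>
    #include <cstdint>

    // Models llvm.uadd.with.overflow.i32: {a + b wrapping, carry-out}.
    static uint32_t uadd_with_overflow(uint32_t a, uint32_t b, bool &ovf) {
      uint32_t r = a + b;    // unsigned addition wraps, like the intrinsic
      ovf = r < a;
      return r;
    }

    int main() {
      bool ignored;
      const uint32_t vals[] = { 0u, 1u, 0x7FFFFFFFu, 0xFFFFFFFFu };
      for (unsigned i = 0; i != 4; ++i)
        for (unsigned j = 0; j != 4; ++j)
          // With the overflow bit unused, the call is a plain add.
          assert(uadd_with_overflow(vals[i], vals[j], ignored) ==
                 vals[i] + vals[j]);
      return 0;
    }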
- Value *LHS = II->getOperand(1), *RHS = II->getOperand(2); + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); II->replaceAllUsesWith(UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateMul(LHS, RHS); diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index 5650150..41e3a39 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -143,7 +143,7 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { ProfileInfo::Edge edge = ProfileInfo::getEdge(0,entry); if (!std::binary_search(MST.begin(), MST.end(), edge)) { printEdgeCounter(edge,entry,i); - IncrementCounterInBlock(entry, i, Counters); NumEdgesInserted++; + IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted; Initializer[i++] = (Zero); } else{ Initializer[i++] = (Uncounted); @@ -166,7 +166,7 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,0); if (!std::binary_search(MST.begin(), MST.end(), edge)) { printEdgeCounter(edge,BB,i); - IncrementCounterInBlock(BB, i, Counters); NumEdgesInserted++; + IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted; Initializer[i++] = (Zero); } else{ Initializer[i++] = (Uncounted); @@ -189,11 +189,11 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { if (TI->getNumSuccessors() == 1) { // Insert counter at the start of the block printEdgeCounter(edge,BB,i); - IncrementCounterInBlock(BB, i, Counters); NumEdgesInserted++; + IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted; } else { // Insert counter at the start of the block printEdgeCounter(edge,Succ,i); - IncrementCounterInBlock(Succ, i, Counters); NumEdgesInserted++; + IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted; } Initializer[i++] = (Zero); } else { diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp index 8662a82..1a30e9b 100644 --- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp @@ -61,8 +61,8 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, } Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements); - Instruction *InitCall = CallInst::Create(InitFn, Args.begin(), Args.end(), - "newargc", InsertPos); + CallInst *InitCall = CallInst::Create(InitFn, Args.begin(), Args.end(), + "newargc", InsertPos); // If argc or argv are not available in main, just pass null values in. 
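Nearly every mechanical change in this patch is the same one-slot shift: with the callee stored in operand 0, argument i of a call sits at operand i + CallInst::ArgOffset, and getArgOperand(i)/setArgOperand(i) hide the bias, which is why the ProfilingUtils hunks that follow turn setOperand(2, ...) into setArgOperand(1, ...). A hypothetical minimal model of the indexing, not the real CallInst:

    struct FakeCallInst {
      static const unsigned ArgOffset = 1;   // slot 0 holds the callee
      void *Ops[8];
      void *getOperand(unsigned i) const { return Ops[i]; }
      void *getArgOperand(unsigned i) const { return Ops[i + ArgOffset]; }
      void setArgOperand(unsigned i, void *V) { Ops[i + ArgOffset] = V; }
    };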
Function::arg_iterator AI; @@ -73,10 +73,10 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, if (AI->getType() != ArgVTy) { Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy, false); - InitCall->setOperand(2, + InitCall->setArgOperand(1, CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall)); } else { - InitCall->setOperand(2, AI); + InitCall->setArgOperand(1, AI); } /* FALL THROUGH */ @@ -93,12 +93,12 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, } opcode = CastInst::getCastOpcode(AI, true, Type::getInt32Ty(Context), true); - InitCall->setOperand(1, + InitCall->setArgOperand(0, CastInst::Create(opcode, AI, Type::getInt32Ty(Context), "argc.cast", InitCall)); } else { AI->replaceAllUsesWith(InitCall); - InitCall->setOperand(1, AI); + InitCall->setArgOperand(0, AI); } case 0: break; diff --git a/lib/Transforms/Scalar/ABCD.cpp b/lib/Transforms/Scalar/ABCD.cpp index 6135992..dcf14a6 100644 --- a/lib/Transforms/Scalar/ABCD.cpp +++ b/lib/Transforms/Scalar/ABCD.cpp @@ -230,7 +230,7 @@ class ABCD : public FunctionPass { DenseMapIterator<Value*, MemoizedResultChart> begin = map.begin(); DenseMapIterator<Value*, MemoizedResultChart> end = map.end(); for (; begin != end; ++begin) { - begin->second.clear(); + begin->second.clear(); } map.clear(); } @@ -396,8 +396,8 @@ class ABCD : public FunctionPass { /// this case the method returns true, otherwise false. It also obtains the /// Instruction and ConstantInt from the BinaryOperator and returns it. bool createBinaryOperatorInfo(BinaryOperator *BO, Instruction **I1, - Instruction **I2, ConstantInt **C1, - ConstantInt **C2); + Instruction **I2, ConstantInt **C1, + ConstantInt **C2); /// This method creates a constraint between a Sigma and an Instruction. /// These constraints are created as soon as we find a comparator that uses a diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 5a49841..2d19467 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) { for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { - NumRemoved++; + ++NumRemoved; (*I)->eraseFromParent(); } diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 93e9bfb..272066c 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -548,7 +548,8 @@ protected: CI->eraseFromParent(); } bool isFoldable(unsigned SizeCIOp, unsigned, bool) const { - if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getOperand(SizeCIOp))) + if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp + - CallInst::ArgOffset))) return SizeCI->isAllOnesValue(); return false; } @@ -559,7 +560,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { - bool Min = (cast<ConstantInt>(II->getOperand(2))->getZExtValue() == 1); + bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); const Type *ReturnTy = CI->getType(); Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); CI->replaceAllUsesWith(RetVal); @@ -759,8 +760,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS, } // Compute the constraint code and ConstraintType to use. 
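In the CodeGenPrepare hunk above, llvm.objectsize's extra argument selects which fallback constant to materialize when the object size is unknown: minimum mode yields 0, maximum mode yields all-ones, which is exactly the lowered 'Min ? 0 : -1ULL'. A one-function analogue of that fallback:

    #include <cstdint>

    // Fallback of llvm.objectsize when nothing is known statically:
    // 0 for "minimum remaining bytes", all-ones for the maximum mode.
    static uint64_t objectsize_unknown(bool min) {
      return min ? 0 : ~0ULL;   // matches 'Min ? 0 : -1ULL' in the patch
    }

Consumers treat the all-ones result as "no bound known", which is what the isFoldable helper above tests with isAllOnesValue().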
- TLI->ComputeConstraintToUse(OpInfo, SDValue(), - OpInfo.ConstraintType == TargetLowering::C_Memory); + TLI->ComputeConstraintToUse(OpInfo, SDValue()); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect) { diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 09c01d3..e047e4f 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -56,7 +56,8 @@ namespace { } bool runOnBasicBlock(BasicBlock &BB); - bool handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep); + bool handleFreeWithNonTrivialDependency(const CallInst *F, + MemDepResult Dep); bool handleEndBlock(BasicBlock &BB); bool RemoveUndeadPointers(Value *Ptr, uint64_t killPointerSize, BasicBlock::iterator &BBI, @@ -73,7 +74,6 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addPreserved<DominatorTree>(); - AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -123,14 +123,15 @@ static Value *getPointerOperand(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) - return MI->getOperand(1); - - switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { + return MI->getArgOperand(0); + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { default: assert(false && "Unexpected intrinsic!"); case Intrinsic::init_trampoline: - return I->getOperand(1); + return II->getArgOperand(0); case Intrinsic::lifetime_end: - return I->getOperand(2); + return II->getArgOperand(1); } } @@ -147,12 +148,13 @@ static unsigned getStoreSize(Instruction *I, const TargetData *TD) { if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { Len = MI->getLength(); } else { - switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { default: assert(false && "Unexpected intrinsic!"); case Intrinsic::init_trampoline: return -1u; case Intrinsic::lifetime_end: - Len = I->getOperand(1); + Len = II->getArgOperand(0); break; } } @@ -201,8 +203,8 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (InstDep.isNonLocal()) continue; // Handle frees whose dependencies are non-trivial. - if (isFreeCall(Inst)) { - MadeChange |= handleFreeWithNonTrivialDependency(Inst, InstDep); + if (const CallInst *F = isFreeCall(Inst)) { + MadeChange |= handleFreeWithNonTrivialDependency(F, InstDep); continue; } @@ -218,7 +220,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { isElidable(DepStore)) { // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepStore); - NumFastStores++; + ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop @@ -249,7 +251,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; - NumFastStores++; + ++NumFastStores; MadeChange = true; continue; } @@ -270,7 +272,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; - NumFastStores++; + ++NumFastStores; MadeChange = true; continue; } @@ -287,7 +289,8 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { /// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose /// dependency is a store to a field of that structure. 
-bool DSE::handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep) { +bool DSE::handleFreeWithNonTrivialDependency(const CallInst *F, + MemDepResult Dep) { AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); Instruction *Dependency = Dep.getInst(); @@ -297,13 +300,13 @@ bool DSE::handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep) { Value *DepPointer = getPointerOperand(Dependency)->getUnderlyingObject(); // Check for aliasing. - if (AA.alias(F->getOperand(1), 1, DepPointer, 1) != + if (AA.alias(F->getArgOperand(0), 1, DepPointer, 1) != AliasAnalysis::MustAlias) return false; // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency); - NumFastStores++; + ++NumFastStores; return true; } @@ -349,9 +352,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (deadPointers.count(pointerOperand)) { // DCE instructions only used to calculate that store. Instruction *Dead = BBI; - BBI++; + ++BBI; DeleteDeadInstruction(Dead, &deadPointers); - NumFastStores++; + ++NumFastStores; MadeChange = true; continue; } @@ -371,9 +374,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // However, if this load is unused and not volatile, we can go ahead and // remove it, and not have to worry about it making our pointer undead! if (L->use_empty() && !L->isVolatile()) { - BBI++; + ++BBI; DeleteDeadInstruction(L, &deadPointers); - NumFastOther++; + ++NumFastOther; MadeChange = true; continue; } @@ -391,9 +394,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Dead alloca's can be DCE'd when we reach them if (A->use_empty()) { - BBI++; + ++BBI; DeleteDeadInstruction(A, &deadPointers); - NumFastOther++; + ++NumFastOther; MadeChange = true; } @@ -426,9 +429,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { getPointerSize(*I)); if (A == AliasAnalysis::ModRef) - modRef++; + ++modRef; else - other++; + ++other; if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) dead.push_back(*I); @@ -442,9 +445,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } else if (isInstructionTriviallyDead(BBI)) { // For any non-memory-affecting non-terminators, DCE them as we reach them Instruction *Inst = BBI; - BBI++; + ++BBI; DeleteDeadInstruction(Inst, &deadPointers); - NumFastOther++; + ++NumFastOther; MadeChange = true; continue; } @@ -497,7 +500,7 @@ bool DSE::RemoveUndeadPointers(Value *killPointer, uint64_t killPointerSize, // Remove it! 
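The DSE hunks above lean repeatedly on the erase-while-iterating idiom (++BBI before DeleteDeadInstruction), and the new visitMalloc earlier in this patch does the same with its use iterator: advance past the element first, then delete it, so the loop never increments an invalidated iterator. The shape of the idiom on a std::list, as a sketch:

    #include <list>

    static void drop_even(std::list<int> &L) {
      for (std::list<int>::iterator I = L.begin(), E = L.end(); I != E;) {
        std::list<int>::iterator Cur = I++;   // early increment
        if (*Cur % 2 == 0)
          L.erase(Cur);   // Cur is now invalid; I already points past it
      }
    }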
++BBI; DeleteDeadInstruction(S, &deadPointers); - NumFastStores++; + ++NumFastStores; MadeChange = true; continue; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index ca8ab49..88b6776 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" @@ -271,7 +272,8 @@ Expression ValueTable::create_expression(CallInst* C) { e.function = C->getCalledFunction(); e.opcode = Expression::CALL; - for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end(); + CallSite CS(C); + for (CallInst::op_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) e.varargs.push_back(lookup_or_add(*I)); @@ -447,14 +449,14 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) { if (local_dep.isDef()) { CallInst* local_cdep = cast<CallInst>(local_dep.getInst()); - if (local_cdep->getNumOperands() != C->getNumOperands()) { + if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } - for (unsigned i = 1; i < C->getNumOperands(); ++i) { - uint32_t c_vn = lookup_or_add(C->getOperand(i)); - uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i)); + for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); + uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i)); if (c_vn != cd_vn) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; @@ -504,13 +506,13 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) { return nextValueNumber++; } - if (cdep->getNumOperands() != C->getNumOperands()) { + if (cdep->getNumArgOperands() != C->getNumArgOperands()) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } - for (unsigned i = 1; i < C->getNumOperands(); ++i) { - uint32_t c_vn = lookup_or_add(C->getOperand(i)); - uint32_t cd_vn = lookup_or_add(cdep->getOperand(i)); + for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); + uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i)); if (c_vn != cd_vn) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; @@ -1500,7 +1502,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, MD->invalidateCachedPointerInfo(V); VN.erase(LI); toErase.push_back(LI); - NumGVNLoad++; + ++NumGVNLoad; return true; } @@ -1723,7 +1725,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, MD->invalidateCachedPointerInfo(V); VN.erase(LI); toErase.push_back(LI); - NumPRELoad++; + ++NumPRELoad; return true; } @@ -1784,7 +1786,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { MD->invalidateCachedPointerInfo(AvailVal); VN.erase(L); toErase.push_back(L); - NumGVNLoad++; + ++NumGVNLoad; return true; } @@ -1830,7 +1832,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { MD->invalidateCachedPointerInfo(StoredVal); VN.erase(L); toErase.push_back(L); - NumGVNLoad++; + ++NumGVNLoad; return true; } @@ -1860,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { MD->invalidateCachedPointerInfo(DepLI); VN.erase(L); toErase.push_back(L); - NumGVNLoad++; + ++NumGVNLoad; return true; } @@ -1871,7 +1873,7 @@ bool GVN::processLoad(LoadInst *L, 
SmallVectorImpl<Instruction*> &toErase) { L->replaceAllUsesWith(UndefValue::get(L->getType())); VN.erase(L); toErase.push_back(L); - NumGVNLoad++; + ++NumGVNLoad; return true; } @@ -1882,7 +1884,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { L->replaceAllUsesWith(UndefValue::get(L->getType())); VN.erase(L); toErase.push_back(L); - NumGVNLoad++; + ++NumGVNLoad; return true; } } @@ -2014,7 +2016,7 @@ bool GVN::runOnFunction(Function& F) { BasicBlock *BB = FI; ++FI; bool removedBlock = MergeBlockIntoPredecessor(BB, this); - if (removedBlock) NumGVNBlocks++; + if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; } @@ -2126,27 +2128,28 @@ bool GVN::performPRE(Function &F) { for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); PI != PE; ++PI) { + BasicBlock *P = *PI; // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors // that are not reachable. - if (*PI == CurrentBlock) { + if (P == CurrentBlock) { NumWithout = 2; break; - } else if (!localAvail.count(*PI)) { + } else if (!localAvail.count(P)) { NumWithout = 2; break; } DenseMap<uint32_t, Value*>::iterator predV = - localAvail[*PI]->table.find(ValNo); - if (predV == localAvail[*PI]->table.end()) { - PREPred = *PI; - NumWithout++; + localAvail[P]->table.find(ValNo); + if (predV == localAvail[P]->table.end()) { + PREPred = P; + ++NumWithout; } else if (predV->second == CurInst) { NumWithout = 2; } else { - predMap[*PI] = predV->second; - NumWith++; + predMap[P] = predV->second; + ++NumWith; } } @@ -2201,7 +2204,7 @@ bool GVN::performPRE(Function &F) { PREInstr->setName(CurInst->getName() + ".pre"); predMap[PREPred] = PREInstr; VN.add(PREInstr, ValNo); - NumGVNPRE++; + ++NumGVNPRE; // Update the availability map to include the new instruction. localAvail[PREPred]->table.insert(std::make_pair(ValNo, PREInstr)); @@ -2211,8 +2214,10 @@ bool GVN::performPRE(Function &F) { CurInst->getName() + ".pre-phi", CurrentBlock->begin()); for (pred_iterator PI = pred_begin(CurrentBlock), - PE = pred_end(CurrentBlock); PI != PE; ++PI) - Phi->addIncoming(predMap[*PI], *PI); + PE = pred_end(CurrentBlock); PI != PE; ++PI) { + BasicBlock *P = *PI; + Phi->addIncoming(predMap[P], P); + } VN.add(Phi, ValNo); localAvail[CurrentBlock]->table[ValNo] = Phi; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 36bea67..b5c9dd8 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -467,6 +467,17 @@ void IndVarSimplify::EliminateIVRemainders() { } bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + // If LoopSimplify form is not available, stay out of trouble. Some notes: + // - LSR currently only supports LoopSimplify-form loops. Indvars' + // canonicalization can be a pessimization without LSR to "clean up" + // afterwards. + // - We depend on having a preheader; in particular, + // Loop::getCanonicalInductionVariable only supports loops with preheaders, + // and we're in trouble if we can't find the induction variable even when + // we've manually inserted one. 
+ if (!L->isLoopSimplifyForm()) + return false; + IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); @@ -760,8 +771,9 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { bool UsedInLoop = false; for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; ++UI) { - BasicBlock *UseBB = cast<Instruction>(UI)->getParent(); - if (PHINode *P = dyn_cast<PHINode>(UI)) { + User *U = *UI; + BasicBlock *UseBB = cast<Instruction>(U)->getParent(); + if (PHINode *P = dyn_cast<PHINode>(U)) { unsigned i = PHINode::getIncomingValueNumForOperand(UI.getOperandNo()); UseBB = P->getIncomingBlock(i); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index df05b71..edce14c 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,6 +18,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -288,14 +289,15 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ // Perhaps getConstantOnEdge should be smart enough to do this? for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. - Constant *PredCst = LVI->getConstantOnEdge(V, *PI, BB); + Constant *PredCst = LVI->getConstantOnEdge(V, P, BB); if (PredCst == 0 || (!isa<ConstantInt>(PredCst) && !isa<UndefValue>(PredCst))) continue; - Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), *PI)); + Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), P)); } return !Result.empty(); @@ -345,8 +347,19 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ } for (unsigned i = 0, e = RHSVals.size(); i != e; ++i) if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0) { - Result.push_back(RHSVals[i]); - Result.back().first = InterestingVal; + // If we already inferred a value for this block on the LHS, don't + // re-add it. + bool HasValue = false; + for (unsigned r = 0, e = Result.size(); r != e; ++r) + if (Result[r].second == RHSVals[i].second) { + HasValue = true; + break; + } + + if (!HasValue) { + Result.push_back(RHSVals[i]); + Result.back().first = InterestingVal; + } } return !Result.empty(); } @@ -409,20 +422,21 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ (!isa<Instruction>(Cmp->getOperand(0)) || cast<Instruction>(Cmp->getOperand(0))->getParent() != BB)) { Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); - + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. 
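The ComputeValueKnownInPredecessors fix above stops the 'or'/'and' merge from emitting two entries for one predecessor when both operands infer a value for it; duplicate (value, block) pairs would confuse the threading logic downstream. The dedup is a linear scan, cheap at these sizes; its standalone shape:

    #include <utility>
    #include <vector>

    typedef std::pair<int, const void *> PredValue;  // (constant, pred block)

    static void add_if_new(std::vector<PredValue> &Result,
                           const PredValue &PV) {
      for (std::size_t r = 0, e = Result.size(); r != e; ++r)
        if (Result[r].second == PV.second)
          return;             // this predecessor already has a value
      Result.push_back(PV);
    }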
LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), - RHSCst, *PI, BB); + RHSCst, P, BB); if (Res == LazyValueInfo::Unknown) continue; Constant *ResC = ConstantInt::get(Cmp->getType(), Res); - Result.push_back(std::make_pair(cast<ConstantInt>(ResC), *PI)); + Result.push_back(std::make_pair(cast<ConstantInt>(ResC), P)); } - + return !Result.empty(); } } @@ -538,18 +552,22 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition. pred_iterator PI = pred_begin(BB), E = pred_end(BB); if (isa<BranchInst>(BB->getTerminator())) { - for (; PI != E; ++PI) - if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) + for (; PI != E; ++PI) { + BasicBlock *P = *PI; + if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator())) if (PBI->isConditional() && PBI->getCondition() == Condition && - ProcessBranchOnDuplicateCond(*PI, BB)) + ProcessBranchOnDuplicateCond(P, BB)) return true; + } } else { assert(isa<SwitchInst>(BB->getTerminator()) && "Unknown jump terminator"); - for (; PI != E; ++PI) - if (SwitchInst *PSI = dyn_cast<SwitchInst>((*PI)->getTerminator())) + for (; PI != E; ++PI) { + BasicBlock *P = *PI; + if (SwitchInst *PSI = dyn_cast<SwitchInst>(P->getTerminator())) if (PSI->getCondition() == Condition && - ProcessSwitchOnDuplicateCond(*PI, BB)) + ProcessSwitchOnDuplicateCond(P, BB)) return true; + } } } @@ -569,19 +587,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If we have a comparison, loop over the predecessors to see if there is // a condition with a lexically identical value. pred_iterator PI = pred_begin(BB), E = pred_end(BB); - for (; PI != E; ++PI) - if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) - if (PBI->isConditional() && *PI != BB) { + for (; PI != E; ++PI) { + BasicBlock *P = *PI; + if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator())) + if (PBI->isConditional() && P != BB) { if (CmpInst *CI = dyn_cast<CmpInst>(PBI->getCondition())) { if (CI->getOperand(0) == CondCmp->getOperand(0) && CI->getOperand(1) == CondCmp->getOperand(1) && CI->getPredicate() == CondCmp->getPredicate()) { // TODO: Could handle things like (x != 4) --> (x == 17) - if (ProcessBranchOnDuplicateCond(*PI, BB)) + if (ProcessBranchOnDuplicateCond(P, BB)) return true; } } } + } } } @@ -869,9 +889,15 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Add all the unavailable predecessors to the PredsToSplit list. for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); - PI != PE; ++PI) - if (!AvailablePredSet.count(*PI)) - PredsToSplit.push_back(*PI); + PI != PE; ++PI) { + BasicBlock *P = *PI; + // If the predecessor is an indirect goto, we can't split the edge. + if (isa<IndirectBrInst>(P->getTerminator())) + return false; + + if (!AvailablePredSet.count(P)) + PredsToSplit.push_back(P); + } // Split them out to their own block. UnavailablePred = @@ -903,11 +929,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // have multiple entries here. 
for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { + BasicBlock *P = *PI; AvailablePredsTy::iterator I = std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), - std::make_pair(*PI, (Value*)0)); + std::make_pair(P, (Value*)0)); - assert(I != AvailablePreds.end() && I->first == *PI && + assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); PN->addIncoming(I->second, I->first); diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 48817ab..e4894e9 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -83,7 +83,7 @@ bool LoopDeletion::IsLoopDead(Loop* L, if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) return false; - BI++; + ++BI; } // Make sure that no instructions in the block have potential side-effects. @@ -176,7 +176,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { BasicBlock::iterator BI = exitBlock->begin(); while (PHINode* P = dyn_cast<PHINode>(BI)) { P->replaceUsesOfWith(exitingBlock, preheader); - BI++; + ++BI; } // Update the dominator tree and remove the instructions and blocks that will @@ -226,7 +226,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { LPM.deleteLoopFromQueue(L); Changed = true; - NumDeleted++; + ++NumDeleted; return Changed; } diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp index 101ff5b..31058e5 100644 --- a/lib/Transforms/Scalar/LoopIndexSplit.cpp +++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp @@ -649,7 +649,7 @@ bool LoopIndexSplit::updateLoopIterationSpace() { } } } - NumRestrictBounds++; + ++NumRestrictBounds; return true; } @@ -958,11 +958,11 @@ bool LoopIndexSplit::splitLoop() { continue; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ++BI) { + BI != BE; ++BI) { Instruction *Inst = BI; if (!Inst->isSafeToSpeculativelyExecute() && !isa<PHINode>(Inst) - && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) + && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) return false; } } @@ -1016,13 +1016,13 @@ bool LoopIndexSplit::splitLoop() { BSV = getMax(BSV, IVStartValue, Sign, PHTerm); // [*] Clone Loop - DenseMap<const Value *, Value *> ValueMap; - Loop *BLoop = CloneLoop(L, LPM, LI, ValueMap, this); + ValueMap<const Value *, Value *> VMap; + Loop *BLoop = CloneLoop(L, LPM, LI, VMap, this); Loop *ALoop = L; // [*] ALoop's exiting edge enters BLoop's header. // ALoop's original exit block becomes BLoop's exit block. - PHINode *B_IndVar = cast<PHINode>(ValueMap[IndVar]); + PHINode *B_IndVar = cast<PHINode>(VMap[IndVar]); BasicBlock *A_ExitingBlock = ExitCondition->getParent(); BranchInst *A_ExitInsn = dyn_cast<BranchInst>(A_ExitingBlock->getTerminator()); @@ -1047,7 +1047,7 @@ bool LoopIndexSplit::splitLoop() { for (BasicBlock::iterator BI = ALoop->getHeader()->begin(), BE = ALoop->getHeader()->end(); BI != BE; ++BI) { if (PHINode *PN = dyn_cast<PHINode>(BI)) { - PHINode *PNClone = cast<PHINode>(ValueMap[PN]); + PHINode *PNClone = cast<PHINode>(VMap[PN]); InverseMap[PNClone] = PN; } else break; @@ -1085,11 +1085,11 @@ bool LoopIndexSplit::splitLoop() { // block. Remove incoming PHINode values from ALoop's exiting block. // Add new incoming values from BLoop's incoming exiting value. // Update BLoop exit block's dominator info.. 
- BasicBlock *B_ExitingBlock = cast<BasicBlock>(ValueMap[A_ExitingBlock]); + BasicBlock *B_ExitingBlock = cast<BasicBlock>(VMap[A_ExitingBlock]); for (BasicBlock::iterator BI = B_ExitBlock->begin(), BE = B_ExitBlock->end(); BI != BE; ++BI) { if (PHINode *PN = dyn_cast<PHINode>(BI)) { - PN->addIncoming(ValueMap[PN->getIncomingValueForBlock(A_ExitingBlock)], + PN->addIncoming(VMap[PN->getIncomingValueForBlock(A_ExitingBlock)], B_ExitingBlock); PN->removeIncomingValue(A_ExitingBlock); } else @@ -1131,7 +1131,7 @@ bool LoopIndexSplit::splitLoop() { removeBlocks(A_InactiveBranch, L, A_ActiveBranch); //[*] Eliminate split condition's inactive branch in from BLoop. - BasicBlock *B_SplitCondBlock = cast<BasicBlock>(ValueMap[A_SplitCondBlock]); + BasicBlock *B_SplitCondBlock = cast<BasicBlock>(VMap[A_SplitCondBlock]); BranchInst *B_BR = cast<BranchInst>(B_SplitCondBlock->getTerminator()); BasicBlock *B_InactiveBranch = NULL; BasicBlock *B_ActiveBranch = NULL; @@ -1146,9 +1146,9 @@ bool LoopIndexSplit::splitLoop() { //[*] Move exit condition into split condition block to avoid // executing dead loop iteration. - ICmpInst *B_ExitCondition = cast<ICmpInst>(ValueMap[ExitCondition]); - Instruction *B_IndVarIncrement = cast<Instruction>(ValueMap[IVIncrement]); - ICmpInst *B_SplitCondition = cast<ICmpInst>(ValueMap[SplitCondition]); + ICmpInst *B_ExitCondition = cast<ICmpInst>(VMap[ExitCondition]); + Instruction *B_IndVarIncrement = cast<Instruction>(VMap[IVIncrement]); + ICmpInst *B_SplitCondition = cast<ICmpInst>(VMap[SplitCondition]); moveExitCondition(A_SplitCondBlock, A_ActiveBranch, A_ExitBlock, ExitCondition, cast<ICmpInst>(SplitCondition), IndVar, IVIncrement, @@ -1159,7 +1159,7 @@ bool LoopIndexSplit::splitLoop() { B_SplitCondition, B_IndVar, B_IndVarIncrement, BLoop, EVOpNum); - NumIndexSplit++; + ++NumIndexSplit; return true; } diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 5004483..16c4a15 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -147,7 +147,7 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { continue; // PHI nodes don't count. if (isa<DbgInfoIntrinsic>(OI)) continue; // Debug intrinsics don't count as size. - Size++; + ++Size; } if (Size > MAX_HEADER_SIZE) @@ -263,7 +263,7 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { preserveCanonicalLoopForm(LPM); - NumRotated++; + ++NumRotated; return true; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 86ea3eb..a250a88 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -392,12 +392,13 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } -/// isMulSExtable - Return true if the given add can be sign-extended +/// isMulSExtable - Return true if the given mul can be sign-extended /// without changing its value. 
-static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) { +static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { const Type *WideTy = - IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); - return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy)); + IntegerType::get(SE.getContext(), + SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); + return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); } /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined @@ -413,20 +414,28 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (LHS == RHS) return SE.getConstant(LHS->getType(), 1); - // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some - // folding. - if (RHS->isAllOnesValue()) - return SE.getMulExpr(LHS, RHS); + // Handle a few RHS special cases. + const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); + if (RC) { + const APInt &RA = RC->getValue()->getValue(); + // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do + // some folding. + if (RA.isAllOnesValue()) + return SE.getMulExpr(LHS, RC); + // Handle x /s 1 as x. + if (RA == 1) + return LHS; + } // Check for a division of a constant by a constant. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { - const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); if (!RC) return 0; - if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0) + const APInt &LA = C->getValue()->getValue(); + const APInt &RA = RC->getValue()->getValue(); + if (LA.srem(RA) != 0) return 0; - return SE.getConstant(C->getValue()->getValue() - .sdiv(RC->getValue()->getValue())); + return SE.getConstant(LA.sdiv(RA)); } // Distribute the sdiv over addrec operands, if the addrec doesn't overflow. @@ -440,6 +449,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (!Step) return 0; return SE.getAddRecExpr(Start, Step, AR->getLoop()); } + return 0; } // Distribute the sdiv over add operands, if the add doesn't overflow. @@ -455,10 +465,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } return SE.getAddExpr(Ops); } + return 0; } // Check for a multiply operand that we can pull RHS out of. - if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) { if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { SmallVector<const SCEV *, 4> Ops; bool Found = false; @@ -475,6 +486,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } return Found ? SE.getMulExpr(Ops) : 0; } + return 0; + } // Otherwise we don't know. return 0; @@ -546,7 +559,7 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - if (II->getOperand(1) == OperandVal) + if (II->getArgOperand(0) == OperandVal) isAddress = true; break; } @@ -568,7 +581,7 @@ static const Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getOperand(1)->getType(); + AccessTy = II->getArgOperand(0)->getType(); break; } } @@ -976,6 +989,8 @@ public: void dump() const; }; +} + /// HasFormula - Test whether this use as a formula which has the same /// registers as the given formula. 
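Two LSR fixes above are worth spelling out. isMulSExtable now widens to bitwidth times the operand count because a product of k n-bit values can need up to k*n bits; the old '+1' was the rule for a two-operand add, not a mul. And getExactSDiv gains explicit RHS special cases plus the previously missing 'return 0' fallthroughs; its constant case only folds when the division is exact. A standalone sketch of that constant case, ignoring the INT64_MIN / -1 corner where even the source division overflows:

    #include <cstdint>

    // Returns true and sets Out only when LA is exactly divisible by RA.
    static bool exact_sdiv(int64_t LA, int64_t RA, int64_t &Out) {
      if (RA == 0)  return false;
      if (RA == -1) { Out = -LA; return true; }  // x /s -1 == x * -1
      if (RA == 1)  { Out = LA;  return true; }  // x /s 1  == x
      if (LA % RA != 0) return false;            // inexact: refuse to fold
      Out = LA / RA;
      return true;
    }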
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { @@ -1203,6 +1218,32 @@ static bool isAlwaysFoldable(const SCEV *S, return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); } +namespace { + +/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding +/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. +struct UseMapDenseMapInfo { + static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() { + return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic); + } + + static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() { + return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic); + } + + static unsigned + getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) { + unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first); + Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second)); + return Result; + } + + static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS, + const std::pair<const SCEV *, LSRUse::KindType> &RHS) { + return LHS == RHS; + } +}; + /// FormulaSorter - This class implements an ordering for formulae which sorts /// the by their standalone cost. class FormulaSorter { @@ -1275,7 +1316,9 @@ class LSRInstance { } // Support for sharing of LSRUses between LSRFixups. - typedef DenseMap<const SCEV *, size_t> UseMapTy; + typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, + size_t, + UseMapDenseMapInfo> UseMapTy; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, @@ -1613,8 +1656,11 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { NewRHS = Sel->getOperand(1); else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) NewRHS = Sel->getOperand(2); + else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) + NewRHS = SU->getValue(); else - llvm_unreachable("Max doesn't match expected pattern!"); + // Max doesn't match expected pattern. + return Cond; // Determine the new comparison opcode. It may be signed or unsigned, // and the original comparison may be either equality or inequality. @@ -1805,6 +1851,8 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, NewMaxOffset = NewOffset; } // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) NewAccessTy = Type::getVoidTy(AccessTy->getContext()); @@ -1833,7 +1881,7 @@ LSRInstance::getUse(const SCEV *&Expr, } std::pair<UseMapTy::iterator, bool> P = - UseMap.insert(std::make_pair(Expr, 0)); + UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); if (!P.second) { // A use already existed with this base. 
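The UseMap change above widens the key from the expression alone to the (expression, use kind) pair, so two fixups that share a SCEV but need different kinds no longer get folded into one LSRUse; the new UseMapDenseMapInfo exists because DenseMap requires reserved empty/tombstone key values plus a hash and equality for the pair type. A sketch of the keying with std::map, which needs no such traits (enum spelled from the patch, otherwise hypothetical):

    #include <cstddef>
    #include <map>
    #include <utility>

    enum KindType { Basic, Special, Address, ICmpZero };
    typedef std::map<std::pair<const void *, KindType>, std::size_t> UseMapTy;

    // Returns the existing use index for (Expr, Kind), or registers Next.
    static std::size_t get_use(UseMapTy &M, const void *Expr, KindType Kind,
                               std::size_t Next) {
      std::pair<UseMapTy::iterator, bool> P =
          M.insert(std::make_pair(std::make_pair(Expr, Kind), Next));
      return P.first->second;   // insert() keeps the old entry if present
    }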
size_t LUIdx = P.first->second; @@ -1919,7 +1967,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { Strides.insert(AR->getStepRecurrence(SE)); Worklist.push_back(AR->getStart()); } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end()); + Worklist.append(Add->op_begin(), Add->op_end()); } } while (!Worklist.empty()); } @@ -2086,7 +2134,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { const SCEV *S = Worklist.pop_back_val(); if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) - Worklist.insert(Worklist.end(), N->op_begin(), N->op_end()); + Worklist.append(N->op_begin(), N->op_end()); else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) Worklist.push_back(C->getOperand()); else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { @@ -2095,8 +2143,12 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (!Inserted.insert(U)) continue; const Value *V = U->getValue(); - if (const Instruction *Inst = dyn_cast<Instruction>(V)) + if (const Instruction *Inst = dyn_cast<Instruction>(V)) { + // Look for instructions defined outside the loop. if (L->contains(Inst)) continue; + } else if (isa<UndefValue>(V)) + // Undef doesn't have a live range, so it doesn't matter. + continue; for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { const Instruction *UserInst = dyn_cast<Instruction>(*UI); @@ -2155,20 +2207,23 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// separate registers. If C is non-null, multiply each subexpression by C. static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl<const SCEV *> &Ops, + SmallVectorImpl<const SCEV *> &UninterestingOps, + const Loop *L, ScalarEvolution &SE) { if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) - CollectSubexprs(*I, C, Ops, SE); + CollectSubexprs(*I, C, Ops, UninterestingOps, L, SE); return; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. if (!AR->getStart()->isZero()) { CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), - AR->getLoop()), C, Ops, SE); - CollectSubexprs(AR->getStart(), C, Ops, SE); + AR->getLoop()), + C, Ops, UninterestingOps, L, SE); + CollectSubexprs(AR->getStart(), C, Ops, UninterestingOps, L, SE); return; } } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { @@ -2178,13 +2233,17 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, dyn_cast<SCEVConstant>(Mul->getOperand(0))) { CollectSubexprs(Mul->getOperand(1), C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, - Ops, SE); + Ops, UninterestingOps, L, SE); return; } } - // Otherwise use the value itself. - Ops.push_back(C ? SE.getMulExpr(C, S) : S); + // Otherwise use the value itself. Loop-variant "unknown" values are + // uninteresting; we won't be able to do anything meaningful with them. + if (!C && isa<SCEVUnknown>(S) && !S->isLoopInvariant(L)) + UninterestingOps.push_back(S); + else + Ops.push_back(C ? 
SE.getMulExpr(C, S) : S); } /// GenerateReassociations - Split out subexpressions from adds and the bases of @@ -2198,8 +2257,15 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { const SCEV *BaseReg = Base.BaseRegs[i]; - SmallVector<const SCEV *, 8> AddOps; - CollectSubexprs(BaseReg, 0, AddOps, SE); + SmallVector<const SCEV *, 8> AddOps, UninterestingAddOps; + CollectSubexprs(BaseReg, 0, AddOps, UninterestingAddOps, L, SE); + + // Add any uninteresting values as one register, as we won't be able to + // form any interesting reassociation opportunities with them. They'll + // just have to be added inside the loop no matter what we do. + if (!UninterestingAddOps.empty()) + AddOps.push_back(SE.getAddExpr(UninterestingAddOps)); + if (AddOps.size() == 1) continue; for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), @@ -2212,11 +2278,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, continue; // Collect all operands except *J. - SmallVector<const SCEV *, 8> InnerAddOps; - for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(), - KE = AddOps.end(); K != KE; ++K) - if (K != J) - InnerAddOps.push_back(*K); + SmallVector<const SCEV *, 8> InnerAddOps + ( ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); + InnerAddOps.append + (next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); // Don't leave just a constant behind in a register if the constant could // be folded into an immediate field. @@ -2350,13 +2415,12 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, for (SmallSetVector<int64_t, 8>::const_iterator I = Factors.begin(), E = Factors.end(); I != E; ++I) { int64_t Factor = *I; - Formula F = Base; // Check that the multiplication doesn't overflow. - if (F.AM.BaseOffs == INT64_MIN && Factor == -1) + if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) continue; - F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; - if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs) + int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; + if (NewBaseOffs / Factor != Base.AM.BaseOffs) continue; // Check that multiplying with the use offset doesn't overflow. @@ -2367,6 +2431,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (Offset / Factor != LU.MinOffset) continue; + Formula F = Base; + F.AM.BaseOffs = NewBaseOffs; + // Check that this scale is legal. if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) continue; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index ae7bf40..0c900ff 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -445,7 +445,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { // This is a very ad-hoc heuristic. if (Metrics.NumInsts > Threshold || Metrics.NumBlocks * 5 > Threshold || - Metrics.NeverInline) { + Metrics.containsIndirectBr || Metrics.isRecursive) { DEBUG(dbgs() << "NOT unswitching loop %" << currentLoop->getHeader()->getName() << ", cost too high: " << currentLoop->getBlocks().size() << "\n"); @@ -457,21 +457,21 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { } // RemapInstruction - Convert the instruction operands from referencing the -// current values into those specified by ValueMap. +// current values into those specified by VMap. 
// static inline void RemapInstruction(Instruction *I, - DenseMap<const Value *, Value*> &ValueMap) { + ValueMap<const Value *, Value*> &VMap) { for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { Value *Op = I->getOperand(op); - DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op); - if (It != ValueMap.end()) Op = It->second; + ValueMap<const Value *, Value*>::iterator It = VMap.find(Op); + if (It != VMap.end()) Op = It->second; I->setOperand(op, Op); } } /// CloneLoop - Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. -static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM, +static Loop *CloneLoop(Loop *L, Loop *PL, ValueMap<const Value*, Value*> &VM, LoopInfo *LI, LPPassManager *LPM) { Loop *New = new Loop(); LPM->insertLoop(New, PL); @@ -615,11 +615,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // the loop preheader and exit blocks), keeping track of the mapping between // the instructions and blocks. NewBlocks.reserve(LoopBlocks.size()); - DenseMap<const Value*, Value*> ValueMap; + ValueMap<const Value*, Value*> VMap; for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { - BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], ValueMap, ".us", F); + BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); NewBlocks.push_back(NewBB); - ValueMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. + VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); } @@ -629,7 +629,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, NewBlocks[0], F->end()); // Now we create the new Loop object for the versioned loop. - Loop *NewLoop = CloneLoop(L, L->getParentLoop(), ValueMap, LI, LPM); + Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); Loop *ParentLoop = L->getParentLoop(); if (ParentLoop) { // Make sure to add the cloned preheader and exit blocks to the parent loop @@ -638,7 +638,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, } for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *NewExit = cast<BasicBlock>(ValueMap[ExitBlocks[i]]); + BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); @@ -653,8 +653,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { PN = cast<PHINode>(I); Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); - DenseMap<const Value *, Value*>::iterator It = ValueMap.find(V); - if (It != ValueMap.end()) V = It->second; + ValueMap<const Value *, Value*>::iterator It = VMap.find(V); + if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } } @@ -663,7 +663,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, ValueMap); + RemapInstruction(I, VMap); // Rewrite the original preheader to select between versions of the loop. 
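The LoopUnswitch hunks above swap DenseMap for ValueMap, but the remapping logic itself is unchanged: after cloning the loop body, every operand that has an entry in the clone map is redirected to its clone, while values defined outside the cloned region fall through untouched. A standalone shape of that walk:

    #include <cstddef>
    #include <map>
    #include <vector>

    typedef std::map<const int *, int *> CloneMap;  // original -> clone

    static void remap_operands(std::vector<int *> &Operands, CloneMap &VM) {
      for (std::size_t op = 0, e = Operands.size(); op != e; ++op) {
        CloneMap::iterator It = VM.find(Operands[op]);
        if (It != VM.end())
          Operands[op] = It->second;  // redirect to the cloned value
        // else: defined outside the cloned region; leave it alone
      }
    }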
BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3611b8e..0e566c5 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -632,7 +632,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { // Remove the memcpy MD.removeInstruction(cpy); cpy->eraseFromParent(); - NumMemCpyInstr++; + ++NumMemCpyInstr; return true; } @@ -710,7 +710,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (MD.getDependency(C) == dep) { MD.removeInstruction(M); M->eraseFromParent(); - NumMemCpyInstr++; + ++NumMemCpyInstr; return true; } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 5aca9cdc..98452f5 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -407,13 +407,14 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ - if (!BinaryOperator::isNeg(*UI)) continue; + User *U = *UI; + if (!BinaryOperator::isNeg(U)) continue; // We found one! Now we have to make sure that the definition dominates // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. - BinaryOperator *TheNeg = cast<BinaryOperator>(*UI); + BinaryOperator *TheNeg = cast<BinaryOperator>(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 5ca9ce3..dd445f6 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -926,7 +926,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, DeleteDeadInstructions(); AI->eraseFromParent(); - NumReplaced++; + ++NumReplaced; } /// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, @@ -965,11 +965,11 @@ void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, isSafeGEP(GEPI, AI, GEPOffset, Info); if (!Info.isUnsafe) isSafeForScalarRepl(GEPI, AI, GEPOffset, Info); - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) { + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); if (Length) isSafeMemAccess(AI, Offset, Length->getZExtValue(), 0, - UI.getOperandNo() == 1, Info); + UI.getOperandNo() == CallInst::ArgOffset, Info); else MarkUnsafe(Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { @@ -1272,6 +1272,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // If there is an other pointer, we want to convert it to the same pointer // type as AI has, so we can GEP through it safely. if (OtherPtr) { + unsigned AddrSpace = + cast<PointerType>(OtherPtr->getType())->getAddressSpace(); // Remove bitcasts and all-zero GEPs from OtherPtr. 
This is an // optimization, but it's also required to detect the corner case where @@ -1279,20 +1281,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // OtherPtr may be a bitcast or GEP that currently being rewritten. (This // function is only called for mem intrinsics that access the whole // aggregate, so non-zero GEPs are not an issue here.) - while (1) { - if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr)) { - OtherPtr = BC->getOperand(0); - continue; - } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr)) { - // All zero GEPs are effectively bitcasts. - if (GEP->hasAllZeroIndices()) { - OtherPtr = GEP->getOperand(0); - continue; - } - } - break; - } + OtherPtr = OtherPtr->stripPointerCasts(); + // Copying the alloca to itself is a no-op: just delete it. if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. @@ -1304,15 +1294,13 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, return; } - if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr)) - if (BCE->getOpcode() == Instruction::BitCast) - OtherPtr = BCE->getOperand(0); - // If the pointer is not the right type, insert a bitcast to the right // type. - if (OtherPtr->getType() != AI->getType()) - OtherPtr = new BitCastInst(OtherPtr, AI->getType(), OtherPtr->getName(), - MI); + const Type *NewTy = + PointerType::get(AI->getType()->getElementType(), AddrSpace); + + if (OtherPtr->getType() != NewTy) + OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); } // Process each element of the aggregate. @@ -1373,7 +1361,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // If the stored element is zero (common case), just store a null // constant. Constant *StoreVal; - if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getOperand(2))) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) { if (CI->isZero()) { StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> } else { @@ -1436,7 +1424,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, Value *Ops[] = { SROADest ? EltPtr : OtherElt, // Dest ptr SROADest ? OtherElt : EltPtr, // Src ptr - ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size + ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size // Align ConstantInt::get(Type::getInt32Ty(MI->getContext()), OtherEltAlign), MI->getVolatileCst() @@ -1451,8 +1439,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } else { assert(isa<MemSetInst>(MI)); Value *Ops[] = { - EltPtr, MI->getOperand(2), // Dest, Value, - ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size + EltPtr, MI->getArgOperand(1), // Dest, Value, + ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size Zero, // Align ConstantInt::get(Type::getInt1Ty(MI->getContext()), 0) // isVolatile }; @@ -1655,7 +1643,12 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI); } - ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI); + // Don't create an 'or x, 0' on the first iteration. 
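// (Illustrative note, not part of the patch: ResultVal is built up field by
// field and, earlier in this function, starts out as a null constant; with
// the change below the first field is forwarded directly instead of emitting
// a dead "or SrcField, 0", while later fields are still OR'd into the
// running value.)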
+ if (!isa<Constant>(ResultVal) || + !cast<Constant>(ResultVal)->isNullValue()) + ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI); + else + ResultVal = SrcField; } // Handle tail padding by truncating the result @@ -1794,7 +1787,7 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (isOffset) return false; // If the memintrinsic isn't using the alloca as the dest, reject it. - if (UI.getOperandNo() != 1) return false; + if (UI.getOperandNo() != CallInst::ArgOffset) return false; // If the source of the memcpy/move is not a constant global, reject it. if (!PointsToConstantGlobal(MI->getSource())) diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 9744100..49d93a2 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -137,6 +137,9 @@ static bool MarkAliveBlocks(BasicBlock *BB, // they should be changed to unreachable by passes that can't modify the // CFG. if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + // Don't touch volatile stores. + if (SI->isVolatile()) continue; + Value *Ptr = SI->getOperand(1); if (isa<UndefValue>(Ptr) || diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 7414be7..b1c6191 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -66,6 +66,11 @@ public: this->TD = TD; if (CI->getCalledFunction()) Context = &CI->getCalledFunction()->getContext(); + + // We never change the calling convention. + if (CI->getCallingConv() != llvm::CallingConv::C) + return NULL; + return CallOptimizer(CI->getCalledFunction(), CI, B); } }; @@ -92,6 +97,20 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { return true; } +/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality +/// comparisons with With. +static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality() && IC->getOperand(1) == With) + continue; + // Unknown instruction. + return false; + } + return true; +} + //===----------------------------------------------------------------------===// // String and Memory LibCall Optimizations //===----------------------------------------------------------------------===// @@ -110,8 +129,8 @@ struct StrCatOpt : public LibCallOptimization { return 0; // Extract some information from the instruction - Value *Dst = CI->getOperand(1); - Value *Src = CI->getOperand(2); + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); // See if we can get the length of the input string. 
uint64_t Len = GetStringLength(Src); @@ -162,12 +181,12 @@ struct StrNCatOpt : public StrCatOpt { return 0; // Extract some information from the instruction - Value *Dst = CI->getOperand(1); - Value *Src = CI->getOperand(2); + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); uint64_t Len; // We don't do anything if length is not constant - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else return 0; @@ -207,11 +226,11 @@ struct StrChrOpt : public LibCallOptimization { FT->getParamType(0) != FT->getReturnType()) return 0; - Value *SrcStr = CI->getOperand(1); + Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getOperand(2)); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (CharC == 0) { // These optimizations require TargetData. if (!TD) return 0; @@ -220,7 +239,7 @@ struct StrChrOpt : public LibCallOptimization { if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. return 0; - return EmitMemChr(SrcStr, CI->getOperand(2), // include nul. + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(TD->getIntPtrType(*Context), Len), B, TD); } @@ -260,12 +279,12 @@ struct StrCmpOpt : public LibCallOptimization { // Verify the "strcmp" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || - !FT->getReturnType()->isIntegerTy(32) || + !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(*Context)) return 0; - Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2); + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -308,19 +327,19 @@ struct StrNCmpOpt : public LibCallOptimization { // Verify the "strncmp" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || - !FT->getReturnType()->isIntegerTy(32) || + !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(*Context) || !FT->getParamType(2)->isIntegerTy()) return 0; - Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2); + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); // Get the length argument if it is constant. 
uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Length = LengthArg->getZExtValue(); else return 0; @@ -328,6 +347,9 @@ struct StrNCmpOpt : public LibCallOptimization { if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); + if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD); + std::string Str1, Str2; bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); @@ -365,7 +387,7 @@ struct StrCpyOpt : public LibCallOptimization { FT->getParamType(0) != Type::getInt8PtrTy(*Context)) return 0; - Value *Dst = CI->getOperand(1), *Src = CI->getOperand(2); + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; @@ -381,7 +403,7 @@ struct StrCpyOpt : public LibCallOptimization { if (OptChkCall) EmitMemCpyChk(Dst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getOperand(3), B, TD); + CI->getArgOperand(2), B, TD); else EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len), @@ -402,9 +424,9 @@ struct StrNCpyOpt : public LibCallOptimization { !FT->getParamType(2)->isIntegerTy()) return 0; - Value *Dst = CI->getOperand(1); - Value *Src = CI->getOperand(2); - Value *LenOp = CI->getOperand(3); + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + Value *LenOp = CI->getArgOperand(2); // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); @@ -452,7 +474,7 @@ struct StrLenOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy()) return 0; - Value *Src = CI->getOperand(1); + Value *Src = CI->getArgOperand(0); // Constant folding: strlen("xyz") -> 3 if (uint64_t Len = GetStringLength(Src)) @@ -477,7 +499,7 @@ struct StrToOpt : public LibCallOptimization { !FT->getParamType(1)->isPointerTy()) return 0; - Value *EndPtr = CI->getOperand(2); + Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { CI->setOnlyReadsMemory(); CI->addAttribute(1, Attribute::NoCapture); @@ -500,17 +522,34 @@ struct StrStrOpt : public LibCallOptimization { return 0; // fold strstr(x, x) -> x. - if (CI->getOperand(1) == CI->getOperand(2)) - return B.CreateBitCast(CI->getOperand(1), CI->getType()); + if (CI->getArgOperand(0) == CI->getArgOperand(1)) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 + if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD); + Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + StrLen, B, TD); + for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); + UI != UE; ) { + ICmpInst *Old = cast<ICmpInst>(UI++); + Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, + ConstantInt::getNullValue(StrNCmp->getType()), + "cmp"); + Old->replaceAllUsesWith(Cmp); + Old->eraseFromParent(); + } + return CI; + } // See if either input string is a constant string. std::string SearchStr, ToFindStr; - bool HasStr1 = GetConstantStringInfo(CI->getOperand(1), SearchStr); - bool HasStr2 = GetConstantStringInfo(CI->getOperand(2), ToFindStr); + bool HasStr1 = GetConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = GetConstantStringInfo(CI->getArgOperand(1), ToFindStr); // fold strstr(x, "") -> x. 
if (HasStr2 && ToFindStr.empty()) - return B.CreateBitCast(CI->getOperand(1), CI->getType()); + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // If both strings are known, constant fold it. if (HasStr1 && HasStr2) { @@ -520,14 +559,14 @@ struct StrStrOpt : public LibCallOptimization { return Constant::getNullValue(CI->getType()); // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getOperand(1), B); + Value *Result = CastToCStr(CI->getArgOperand(0), B); Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); return B.CreateBitCast(Result, CI->getType()); } // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) - return B.CreateBitCast(EmitStrChr(CI->getOperand(1), ToFindStr[0], B, TD), + return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD), CI->getType()); return 0; } @@ -545,13 +584,13 @@ struct MemCmpOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy(32)) return 0; - Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2); + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); // Make sure we have a constant length. - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3)); + ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); if (!LenC) return 0; uint64_t Len = LenC->getZExtValue(); @@ -598,9 +637,9 @@ struct MemCpyOpt : public LibCallOptimization { return 0; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) - EmitMemCpy(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), 1, false, B, TD); - return CI->getOperand(1); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); + return CI->getArgOperand(0); } }; @@ -620,9 +659,9 @@ struct MemMoveOpt : public LibCallOptimization { return 0; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) - EmitMemMove(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), 1, false, B, TD); - return CI->getOperand(1); + EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); + return CI->getArgOperand(0); } }; @@ -642,10 +681,10 @@ struct MemSetOpt : public LibCallOptimization { return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getOperand(2), Type::getInt8Ty(*Context), + Value *Val = B.CreateIntCast(CI->getArgOperand(1), Type::getInt8Ty(*Context), false); - EmitMemSet(CI->getOperand(1), Val, CI->getOperand(3), false, B, TD); - return CI->getOperand(1); + EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); + return CI->getArgOperand(0); } }; @@ -666,7 +705,7 @@ struct PowOpt : public LibCallOptimization { !FT->getParamType(0)->isFloatingPointTy()) return 0; - Value *Op1 = CI->getOperand(1), *Op2 = CI->getOperand(2); + Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 return Op1C; @@ -720,18 +759,18 @@ struct Exp2Opt : public LibCallOptimization { !FT->getParamType(0)->isFloatingPointTy()) return 0; - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) LdExpArg = 
B.CreateSExt(OpC->getOperand(0), - Type::getInt32Ty(*Context), "tmp"); + Type::getInt32Ty(*Context), "tmp"); } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) LdExpArg = B.CreateZExt(OpC->getOperand(0), - Type::getInt32Ty(*Context), "tmp"); + Type::getInt32Ty(*Context), "tmp"); } if (LdExpArg) { @@ -772,7 +811,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { return 0; // If this is something like 'floor((double)floatval)', convert to floorf. - FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1)); + FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) return 0; @@ -797,11 +836,11 @@ struct FFSOpt : public LibCallOptimization { // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 1 || - !FT->getReturnType()->isIntegerTy(32) || + !FT->getReturnType()->isIntegerTy(32) || !FT->getParamType(0)->isIntegerTy()) return 0; - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); // Constant fold. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { @@ -821,7 +860,7 @@ struct FFSOpt : public LibCallOptimization { Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); return B.CreateSelect(Cond, V, - ConstantInt::get(Type::getInt32Ty(*Context), 0)); + ConstantInt::get(Type::getInt32Ty(*Context), 0)); } }; @@ -837,7 +876,7 @@ struct IsDigitOpt : public LibCallOptimization { return 0; // isdigit(c) -> (c-'0') <u 10 - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); Op = B.CreateSub(Op, ConstantInt::get(Type::getInt32Ty(*Context), '0'), "isdigittmp"); Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 10), @@ -858,7 +897,7 @@ struct IsAsciiOpt : public LibCallOptimization { return 0; // isascii(c) -> c <u 128 - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 128), "isascii"); return B.CreateZExt(Op, CI->getType()); @@ -877,7 +916,7 @@ struct AbsOpt : public LibCallOptimization { return 0; // abs(x) -> x >s -1 ? x : -x - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos"); @@ -899,7 +938,7 @@ struct ToAsciiOpt : public LibCallOptimization { return 0; // isascii(c) -> c & 0x7f - return B.CreateAnd(CI->getOperand(1), + return B.CreateAnd(CI->getArgOperand(0), ConstantInt::get(CI->getType(),0x7F)); } }; @@ -922,7 +961,7 @@ struct PrintFOpt : public LibCallOptimization { // Check for a fixed format string. std::string FormatStr; - if (!GetConstantStringInfo(CI->getOperand(1), FormatStr)) + if (!GetConstantStringInfo(CI->getArgOperand(0), FormatStr)) return 0; // Empty format string -> noop. @@ -954,20 +993,20 @@ struct PrintFOpt : public LibCallOptimization { } // Optimize specific format strings. 
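// A source-level view of the character-class folds above (an illustrative
// sketch, not from the patch; it assumes the usual int-taking C prototypes,
// with "<u" in the comments above meaning unsigned comparison):
int isdigit_fold(int c) { return (unsigned)(c - '0') < 10u; } // isdigit(c)
int isascii_fold(int c) { return (unsigned)c < 128u; }        // isascii(c)
int toascii_fold(int c) { return c & 0x7f; }                  // toascii(c)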
- // printf("%c", chr) --> putchar(*(i8*)dst) - if (FormatStr == "%c" && CI->getNumOperands() > 2 && - CI->getOperand(2)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getOperand(2), B, TD); + // printf("%c", chr) --> putchar(chr) + if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isIntegerTy()) { + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD); if (CI->use_empty()) return CI; return B.CreateIntCast(Res, CI->getType(), true); } // printf("%s\n", str) --> puts(str) - if (FormatStr == "%s\n" && CI->getNumOperands() > 2 && - CI->getOperand(2)->getType()->isPointerTy() && + if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isPointerTy() && CI->use_empty()) { - EmitPutS(CI->getOperand(2), B, TD); + EmitPutS(CI->getArgOperand(1), B, TD); return CI; } return 0; @@ -988,11 +1027,11 @@ struct SPrintFOpt : public LibCallOptimization { // Check for a fixed format string. std::string FormatStr; - if (!GetConstantStringInfo(CI->getOperand(2), FormatStr)) + if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // If we just have a format string (nothing else crazy) transform it. - if (CI->getNumOperands() == 3) { + if (CI->getNumArgOperands() == 2) { // Make sure there's no % in the constant array. We could try to handle // %% -> % in the future if we cared. for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) @@ -1003,7 +1042,7 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - EmitMemCpy(CI->getOperand(1), CI->getOperand(2), // Copy the nul byte. + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), // Copy the nul byte. ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()+1), 1, false, B, TD); return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -1011,16 +1050,17 @@ struct SPrintFOpt : public LibCallOptimization { // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4) + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) return 0; // Decode the second character of the format string. 
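// (Illustrative summary, not from the patch, of the two cases decoded below,
// in source terms:
//   sprintf(dst, "%c", chr);  // --> dst[0] = chr; dst[1] = '\0'; returns 1
//   sprintf(dst, "%s", str);  // --> memcpy(dst, str, strlen(str)+1);
//                             //     returns strlen(str)
// only the "%s" case needs TargetData, for the intptr-sized copy length.)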
if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getOperand(3)->getType()->isIntegerTy()) return 0; - Value *V = B.CreateTrunc(CI->getOperand(3), + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *V = B.CreateTrunc(CI->getArgOperand(2), Type::getInt8Ty(*Context), "char"); - Value *Ptr = CastToCStr(CI->getOperand(1), B); + Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::getInt32Ty(*Context), 1), "nul"); @@ -1034,13 +1074,13 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getOperand(3)->getType()->isPointerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; - Value *Len = EmitStrLen(CI->getOperand(3), B, TD); + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD); Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - EmitMemCpy(CI->getOperand(1), CI->getOperand(3), IncLen, 1, false, B, TD); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1, false, B, TD); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -1064,8 +1104,8 @@ struct FWriteOpt : public LibCallOptimization { return 0; // Get the element size and count. - ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getOperand(2)); - ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getOperand(3)); + ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); if (!SizeC || !CountC) return 0; uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); @@ -1075,8 +1115,8 @@ struct FWriteOpt : public LibCallOptimization { // If this is writing one byte, turn it into fputc. if (Bytes == 1) { // fwrite(S,1,1,F) -> fputc(S[0],F) - Value *Char = B.CreateLoad(CastToCStr(CI->getOperand(1), B), "char"); - EmitFPutC(Char, CI->getOperand(4), B, TD); + Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); + EmitFPutC(Char, CI->getArgOperand(3), B, TD); return ConstantInt::get(CI->getType(), 1); } @@ -1100,11 +1140,11 @@ struct FPutsOpt : public LibCallOptimization { return 0; // fputs(s,F) --> fwrite(s,1,strlen(s),F) - uint64_t Len = GetStringLength(CI->getOperand(1)); + uint64_t Len = GetStringLength(CI->getArgOperand(0)); if (!Len) return 0; - EmitFWrite(CI->getOperand(1), + EmitFWrite(CI->getArgOperand(0), ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getOperand(2), B, TD); + CI->getArgOperand(1), B, TD); return CI; // Known to have no uses (see above). } }; @@ -1123,11 +1163,11 @@ struct FPrintFOpt : public LibCallOptimization { // All the optimizations depend on the format string. std::string FormatStr; - if (!GetConstantStringInfo(CI->getOperand(2), FormatStr)) + if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) - if (CI->getNumOperands() == 3) { + if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') // Could handle %% -> % if we cared. return 0; // We found a format specifier. @@ -1135,31 +1175,32 @@ struct FPrintFOpt : public LibCallOptimization { // These optimizations require TargetData. 
if (!TD) return 0; - EmitFWrite(CI->getOperand(2), + EmitFWrite(CI->getArgOperand(1), ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()), - CI->getOperand(1), B, TD); + CI->getArgOperand(0), B, TD); return ConstantInt::get(CI->getType(), FormatStr.size()); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4) + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) return 0; // Decode the second character of the format string. if (FormatStr[1] == 'c') { - // fprintf(F, "%c", chr) --> *(i8*)dst = chr - if (!CI->getOperand(3)->getType()->isIntegerTy()) return 0; - EmitFPutC(CI->getOperand(3), CI->getOperand(1), B, TD); + // fprintf(F, "%c", chr) --> fputc(chr, F) + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); return ConstantInt::get(CI->getType(), 1); } if (FormatStr[1] == 's') { - // fprintf(F, "%s", str) -> fputs(str, F) - if (!CI->getOperand(3)->getType()->isPointerTy() || !CI->use_empty()) + // fprintf(F, "%s", str) --> fputs(str, F) + if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) return 0; - EmitFPutS(CI->getOperand(3), CI->getOperand(1), B, TD); + EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); return CI; } return 0; diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp index 2306a77..9208238 100644 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ b/lib/Transforms/Scalar/TailDuplication.cpp @@ -206,12 +206,13 @@ static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, // there is only one other pred, get it, otherwise we can't handle it. PI = pred_begin(DstBlock); PE = pred_end(DstBlock); BasicBlock *DstOtherPred = 0; - if (*PI == SrcBlock) { + BasicBlock *P = *PI; + if (P == SrcBlock) { if (++PI == PE) return 0; DstOtherPred = *PI; if (++PI != PE) return 0; } else { - DstOtherPred = *PI; + DstOtherPred = P; if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 5ad5de2..01c8e5d 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -16,9 +16,9 @@ // transformation from taking place, though currently the analysis cannot // support moving any really useful instructions (only dead ones). // 2. This pass transforms functions that are prevented from being tail -// recursive by an associative expression to use an accumulator variable, -// thus compiling the typical naive factorial or 'fib' implementation into -// efficient code. +// recursive by an associative and commutative expression to use an +// accumulator variable, thus compiling the typical naive factorial or +// 'fib' implementation into efficient code. // 3. TRE is performed if the function returns void, if the return // returns the result returned by the call, or if the function returns a // run-time constant on all exits from the function. 
It is possible, though @@ -60,6 +60,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" @@ -252,7 +253,7 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { // If we are passing this argument into call as the corresponding // argument operand, then the argument is dynamically constant. // Otherwise, we cannot transform this function safely. - if (CI->getOperand(ArgNo+1) == Arg) + if (CI->getArgOperand(ArgNo) == Arg) return true; } @@ -269,16 +270,16 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { } // getCommonReturnValue - Check to see if the function containing the specified -// return instruction and tail call consistently returns the same -// runtime-constant value at all exit points. If so, return the returned value. +// tail call consistently returns the same runtime-constant value at all exit +// points except for IgnoreRI. If so, return the returned value. // -static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) { - Function *F = TheRI->getParent()->getParent(); +static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { + Function *F = CI->getParent()->getParent(); Value *ReturnedValue = 0; for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) - if (RI != TheRI) { + if (RI != IgnoreRI) { Value *RetOp = RI->getOperand(0); // We can only perform this transformation if the value returned is @@ -301,9 +302,9 @@ static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) { /// Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { - if (!I->isAssociative()) return 0; + if (!I->isAssociative() || !I->isCommutative()) return 0; assert(I->getNumOperands() == 2 && - "Associative operations should have 2 args!"); + "Associative/commutative operations should have 2 args!"); // Exactly one operand should be the result of the call instruction... if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || @@ -368,11 +369,16 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, return false; } - // If we are introducing accumulator recursion to eliminate associative - // operations after the call instruction, this variable contains the initial - // value for the accumulator. If this value is set, we actually perform - // accumulator recursion elimination instead of simple tail recursion - // elimination. + // If we are introducing accumulator recursion to eliminate operations after + // the call instruction that are both associative and commutative, the initial + // value for the accumulator is placed in this variable. If this value is set + // then we actually perform accumulator recursion elimination instead of + // simple tail recursion elimination. If the operation is an LLVM instruction + // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then + // we are handling the case when the return instruction returns a constant C + // which is different to the constant returned by other return instructions + // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a + // special case of accumulator recursion, the operation being "return C". 
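// A worked illustration of the comment above (assumed C source, not from the
// patch). The classic non-tail-recursive factorial
//
//   int fact(int n) { return n <= 1 ? 1 : fact(n - 1) * n; }
//
// multiplies after the recursive call, so plain TRE is blocked; because the
// multiply is associative and commutative, the pass accumulates the pending
// products instead, producing in effect:
//
//   int fact(int n) {
//     int accumulator = 1;   // AccumulatorRecursionEliminationInitVal
//     for (; n > 1; --n)
//       accumulator *= n;    // the accumulator recursion instruction
//     return accumulator;
//   }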
Value *AccumulatorRecursionEliminationInitVal = 0; Instruction *AccumulatorRecursionInstr = 0; @@ -383,9 +389,9 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI) if (!CanMoveAboveCall(BBI, CI)) { // If we can't move the instruction above the call, it might be because it - // is an associative operation that could be tranformed using accumulator - // recursion elimination. Check to see if this is the case, and if so, - // remember the initial accumulator value for later. + // is an associative and commutative operation that could be transformed + // using accumulator recursion elimination. Check to see if this is the + // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = CanTransformAccumulatorRecursion(BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction @@ -403,8 +409,18 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI && !isa<UndefValue>(Ret->getReturnValue()) && AccumulatorRecursionEliminationInitVal == 0 && - !getCommonReturnValue(Ret, CI)) - return false; + !getCommonReturnValue(0, CI)) { + // One case remains that we are able to handle: the current return + // instruction returns a constant, and all other return instructions + // return a different constant. + if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret)) + return false; // Current return instruction does not return a constant. + // Check that all other return instructions return a common constant. If + // so, record it in AccumulatorRecursionEliminationInitVal. + AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI); + if (!AccumulatorRecursionEliminationInitVal) + return false; + } // OK! We can transform this tail call. If this is the first one found, // create the new entry block, allowing us to branch back to the old entry. @@ -453,8 +469,8 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, // Ok, now that we know we have a pseudo-entry block WITH all of the // required PHI nodes, add entries into the PHI node for the actual // parameters passed into the tail-recursive call. - for (unsigned i = 0, e = CI->getNumOperands()-1; i != e; ++i) - ArgumentPHIs[i]->addIncoming(CI->getOperand(i+1), BB); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) + ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB); // If we are introducing an accumulator variable to eliminate the recursion, // do so now. Note that we _know_ that no subsequent tail recursion @@ -464,8 +480,9 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (AccumulatorRecursionEliminationInitVal) { Instruction *AccRecInstr = AccumulatorRecursionInstr; // Start by inserting a new PHI node for the accumulator. - PHINode *AccPN = PHINode::Create(AccRecInstr->getType(), "accumulator.tr", - OldEntry->begin()); + PHINode *AccPN = + PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), + "accumulator.tr", OldEntry->begin()); // Loop over all of the predecessors of the tail recursion block. For the // real entry into the function we seed the PHI with the initial value, @@ -475,20 +492,27 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, // it will not show up as a predecessor.
for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry); PI != PE; ++PI) { - if (*PI == &F->getEntryBlock()) - AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, *PI); + BasicBlock *P = *PI; + if (P == &F->getEntryBlock()) + AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P); else - AccPN->addIncoming(AccPN, *PI); + AccPN->addIncoming(AccPN, P); } - // Add an incoming argument for the current block, which is computed by our - // associative accumulator instruction. - AccPN->addIncoming(AccRecInstr, BB); - - // Next, rewrite the accumulator recursion instruction so that it does not - // use the result of the call anymore, instead, use the PHI node we just - // inserted. - AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN); + if (AccRecInstr) { + // Add an incoming argument for the current block, which is computed by + // our associative and commutative accumulator instruction. + AccPN->addIncoming(AccRecInstr, BB); + + // Next, rewrite the accumulator recursion instruction so that it does not + // use the result of the call anymore, instead, use the PHI node we just + // inserted. + AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN); + } else { + // Add an incoming argument for the current block, which is just the + // constant returned by the current return instruction. + AccPN->addIncoming(Ret->getReturnValue(), BB); + } // Finally, rewrite any return instructions in the program to return the PHI // node instead of the "initval" that they do currently. This loop will diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index ea9d1c1..4d64c85 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -381,29 +381,28 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetLowering &TLI) { std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); - - unsigned ArgNo = 1; // ArgNo - The operand of the CallInst. + + unsigned ArgNo = 0; // The argument of the CallInst. for (unsigned i = 0, e = Constraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo OpInfo(Constraints[i]); - + // Compute the value type for each operand. switch (OpInfo.Type) { case InlineAsm::isOutput: if (OpInfo.isIndirect) - OpInfo.CallOperandVal = CI->getOperand(ArgNo++); + OpInfo.CallOperandVal = CI->getArgOperand(ArgNo++); break; case InlineAsm::isInput: - OpInfo.CallOperandVal = CI->getOperand(ArgNo++); + OpInfo.CallOperandVal = CI->getArgOperand(ArgNo++); break; case InlineAsm::isClobber: // Nothing to do. break; } - + // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, SDValue(), - OpInfo.ConstraintType == TargetLowering::C_Memory); - + TLI.ComputeConstraintToUse(OpInfo, SDValue()); + // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! if (OpInfo.CallOperandVal == OpVal && @@ -411,7 +410,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, !OpInfo.isIndirect)) return false; } - + return true; } @@ -450,7 +449,7 @@ static bool FindAllMemoryUses(Instruction *I, if (CallInst *CI = dyn_cast<CallInst>(U)) { InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); - if (IA == 0) return true; + if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. 
if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 2f1ae00..ec625b4 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -558,121 +558,3 @@ void llvm::FindFunctionBackedges(const Function &F, } - - - -/// AreEquivalentAddressValues - Test if A and B will obviously have the same -/// value. This includes recognizing that %t0 and %t1 will have the same -/// value in code like this: -/// %t0 = getelementptr \@a, 0, 3 -/// store i32 0, i32* %t0 -/// %t1 = getelementptr \@a, 0, 3 -/// %t2 = load i32* %t1 -/// -static bool AreEquivalentAddressValues(const Value *A, const Value *B) { - // Test if the values are trivially equivalent. - if (A == B) return true; - - // Test if the values come from identical arithmetic instructions. - // Use isIdenticalToWhenDefined instead of isIdenticalTo because - // this function is only used when one address use dominates the - // other, which means that they'll always either have the same - // value or one of them will have an undefined value. - if (isa<BinaryOperator>(A) || isa<CastInst>(A) || - isa<PHINode>(A) || isa<GetElementPtrInst>(A)) - if (const Instruction *BI = dyn_cast<Instruction>(B)) - if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) - return true; - - // Otherwise they may not be equivalent. - return false; -} - -/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the -/// instruction before ScanFrom) checking to see if we have the value at the -/// memory address *Ptr locally available within a small number of instructions. -/// If the value is available, return it. -/// -/// If not, return the iterator for the last validated instruction that the -/// value would be live through. If we scanned the entire block and didn't find -/// something that invalidates *Ptr or provides it, ScanFrom would be left at -/// begin() and this returns null. ScanFrom could also be left -/// -/// MaxInstsToScan specifies the maximum instructions to scan in the block. If -/// it is set to 0, it will scan the whole block. You can also optionally -/// specify an alias analysis implementation, which makes this more precise. -Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, - BasicBlock::iterator &ScanFrom, - unsigned MaxInstsToScan, - AliasAnalysis *AA) { - if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; - - // If we're using alias analysis to disambiguate get the size of *Ptr. - unsigned AccessSize = 0; - if (AA) { - const Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType(); - AccessSize = AA->getTypeStoreSize(AccessTy); - } - - while (ScanFrom != ScanBB->begin()) { - // We must ignore debug info directives when counting (otherwise they - // would affect codegen). - Instruction *Inst = --ScanFrom; - if (isa<DbgInfoIntrinsic>(Inst)) - continue; - - // Restore ScanFrom to expected value in case next test succeeds - ScanFrom++; - - // Don't scan huge blocks. - if (MaxInstsToScan-- == 0) return 0; - - --ScanFrom; - // If this is a load of Ptr, the loaded value is available. - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - if (AreEquivalentAddressValues(LI->getOperand(0), Ptr)) - return LI; - - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If this is a store through Ptr, the value is available! 
- if (AreEquivalentAddressValues(SI->getOperand(1), Ptr)) - return SI->getOperand(0); - - // If Ptr is an alloca and this is a store to a different alloca, ignore - // the store. This is a trivial form of alias analysis that is important - // for reg2mem'd code. - if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) && - (isa<AllocaInst>(SI->getOperand(1)) || - isa<GlobalVariable>(SI->getOperand(1)))) - continue; - - // If we have alias analysis and it says the store won't modify the loaded - // value, ignore the store. - if (AA && - (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0) - continue; - - // Otherwise the store that may or may not alias the pointer, bail out. - ++ScanFrom; - return 0; - } - - // If this is some other instruction that may clobber Ptr, bail out. - if (Inst->mayWriteToMemory()) { - // If alias analysis claims that it really won't modify the load, - // ignore it. - if (AA && - (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0) - continue; - - // May modify the pointer, bail out. - ++ScanFrom; - return 0; - } - } - - // Got to the start of the block, we didn't find it, but are done for this - // block. - return 0; -} - diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 8c25ad1..26f53c0 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -106,11 +106,12 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, // If AllowIdenticalEdges is true, then we allow this edge to be considered // non-critical iff all preds come from TI's block. while (I != E) { - if (*I != FirstPred) + const BasicBlock *P = *I; + if (P != FirstPred) return true; // Note: leave this as is until no one ever compiles with either gcc 4.0.1 // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207 - E = pred_end(*I); + E = pred_end(P); ++I; } return false; @@ -277,11 +278,13 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, OtherPreds.push_back(PN->getIncomingBlock(i)); } else { for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); - I != E; ++I) - if (*I != NewBB) - OtherPreds.push_back(*I); + I != E; ++I) { + BasicBlock *P = *I; + if (P != NewBB) + OtherPreds.push_back(P); + } } - + bool NewBBDominatesDestBB = true; // Should we update DominatorTree information? @@ -400,11 +403,13 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, bool HasPredOutsideOfLoop = false; BasicBlock *Exit = ExitBlocks[i]; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); - I != E; ++I) - if (TIL->contains(*I)) - Preds.push_back(*I); + I != E; ++I) { + BasicBlock *P = *I; + if (TIL->contains(P)) + Preds.push_back(P); else HasPredOutsideOfLoop = true; + } // If there are any preds not in the loop, we'll need to split // the edges. The Preds.empty() check is needed because a block // may appear multiple times in the list. We can't use diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 767fa3a..7a9d007 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -69,6 +69,31 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, return CI; } +/// EmitStrNCmp - Emit a call to the strncmp function to the builder. 
+Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, + IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[3]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | + Attribute::NoUnwind); + + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3), + B.getInt32Ty(), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), NULL); + CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B), + CastToCStr(Ptr2, B), Len, "strncmp"); + + if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + /// EmitStrCpy - Emit a call to the strcpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, @@ -112,10 +137,10 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, Value *llvm::EmitMemCpy(Value *Dst, Value *Src, Value *Len, unsigned Align, bool isVolatile, IRBuilder<> &B, const TargetData *TD) { Module *M = B.GetInsertBlock()->getParent()->getParent(); - const Type *ArgTys[3] = { Dst->getType(), Src->getType(), Len->getType() }; - Value *MemCpy = Intrinsic::getDeclaration(M, Intrinsic::memcpy, ArgTys, 3); Dst = CastToCStr(Dst, B); Src = CastToCStr(Src, B); + const Type *ArgTys[3] = { Dst->getType(), Src->getType(), Len->getType() }; + Value *MemCpy = Intrinsic::getDeclaration(M, Intrinsic::memcpy, ArgTys, 3); return B.CreateCall5(MemCpy, Dst, Src, Len, ConstantInt::get(B.getInt32Ty(), Align), ConstantInt::get(B.getInt1Ty(), isVolatile)); @@ -395,11 +420,11 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(2) != TD->getIntPtrType(Context) || FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - - if (isFoldable(4, 3, false)) { - EmitMemCpy(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), + + if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1, false, B, TD); - replaceCall(CI->getOperand(1)); + replaceCall(CI->getArgOperand(0)); return true; } return false; @@ -418,11 +443,11 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(2) != TD->getIntPtrType(Context) || FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - - if (isFoldable(4, 3, false)) { - EmitMemMove(CI->getOperand(1), CI->getOperand(2), CI->getOperand(3), + + if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1, false, B, TD); - replaceCall(CI->getOperand(1)); + replaceCall(CI->getArgOperand(0)); return true; } return false; @@ -436,12 +461,12 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(2) != TD->getIntPtrType(Context) || FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - - if (isFoldable(4, 3, false)) { - Value *Val = B.CreateIntCast(CI->getOperand(2), B.getInt8Ty(), + + if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - EmitMemSet(CI->getOperand(1), Val, CI->getOperand(3), false, B, 
TD); - replaceCall(CI->getOperand(1)); + EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); + replaceCall(CI->getArgOperand(0)); return true; } return false; @@ -462,8 +487,8 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. - if (isFoldable(3, 2, true)) { - Value *Ret = EmitStrCpy(CI->getOperand(1), CI->getOperand(2), B, TD, + if (isFoldable(2 + CallInst::ArgOffset, 1 + CallInst::ArgOffset, true)) { + Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, Name.substr(2, 6)); replaceCall(Ret); return true; @@ -479,10 +504,10 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { !FT->getParamType(2)->isIntegerTy() || FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - - if (isFoldable(4, 3, false)) { - Value *Ret = EmitStrNCpy(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), B, TD, Name.substr(2, 7)); + + if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TD, Name.substr(2, 7)); replaceCall(Ret); return true; } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 6d4fe4b..1dcfd57 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -32,7 +32,7 @@ using namespace llvm; // CloneBasicBlock - See comments in Cloning.h BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, - DenseMap<const Value*, Value*> &ValueMap, + ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, ClonedCodeInfo *CodeInfo) { BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); @@ -47,7 +47,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - ValueMap[II] = NewInst; // Add instruction map to value. + VMap[II] = NewInst; // Add instruction map to value. hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { @@ -72,7 +72,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, // ArgMap values. // void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - DenseMap<const Value*, Value*> &ValueMap, + ValueToValueMapTy &VMap, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); @@ -80,17 +80,17 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, #ifndef NDEBUG for (Function::const_arg_iterator I = OldFunc->arg_begin(), E = OldFunc->arg_end(); I != E; ++I) - assert(ValueMap.count(I) && "No mapping from source argument specified!"); + assert(VMap.count(I) && "No mapping from source argument specified!"); #endif // Clone any attributes. if (NewFunc->arg_size() == OldFunc->arg_size()) NewFunc->copyAttributesFrom(OldFunc); else { - //Some arguments were deleted with the ValueMap. Copy arguments one by one + //Some arguments were deleted with the VMap. 
Copy arguments one by one for (Function::const_arg_iterator I = OldFunc->arg_begin(), E = OldFunc->arg_end(); I != E; ++I) - if (Argument* Anew = dyn_cast<Argument>(ValueMap[I])) + if (Argument* Anew = dyn_cast<Argument>(VMap[I])) Anew->addAttr( OldFunc->getAttributes() .getParamAttributes(I->getArgNo() + 1)); NewFunc->setAttributes(NewFunc->getAttributes() @@ -111,43 +111,43 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, const BasicBlock &BB = *BI; // Create a new basic block and copy instructions into it! - BasicBlock *CBB = CloneBasicBlock(&BB, ValueMap, NameSuffix, NewFunc, + BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo); - ValueMap[&BB] = CBB; // Add basic block mapping. + VMap[&BB] = CBB; // Add basic block mapping. if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator())) Returns.push_back(RI); } // Loop over all of the instructions in the function, fixing up operand - // references as we go. This uses ValueMap to do all the hard work. + // references as we go. This uses VMap to do all the hard work. // - for (Function::iterator BB = cast<BasicBlock>(ValueMap[OldFunc->begin()]), + for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), BE = NewFunc->end(); BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) - RemapInstruction(II, ValueMap); + RemapInstruction(II, VMap); } /// CloneFunction - Return a copy of the specified function, but without /// embedding the function into another module. Also, any references specified -/// in the ValueMap are changed to refer to their mapped value instead of the -/// original one. If any of the arguments to the function are in the ValueMap, -/// the arguments are deleted from the resultant function. The ValueMap is +/// in the VMap are changed to refer to their mapped value instead of the +/// original one. If any of the arguments to the function are in the VMap, +/// the arguments are deleted from the resultant function. The VMap is /// updated to include mappings from all of the instructions and basicblocks in /// the function from their old to new values. /// Function *llvm::CloneFunction(const Function *F, - DenseMap<const Value*, Value*> &ValueMap, + ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo) { std::vector<const Type*> ArgTypes; // The user might be deleting arguments to the function by specifying them in - // the ValueMap. If so, we need to not add the arguments to the arg ty vector + // the VMap. If so, we need to not add the arguments to the arg ty vector // for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - if (ValueMap.count(I) == 0) // Haven't mapped the argument to anything yet? + if (VMap.count(I) == 0) // Haven't mapped the argument to anything yet? ArgTypes.push_back(I->getType()); // Create a new function type... @@ -161,13 +161,13 @@ Function *llvm::CloneFunction(const Function *F, Function::arg_iterator DestI = NewF->arg_begin(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - if (ValueMap.count(I) == 0) { // Is this argument preserved? + if (VMap.count(I) == 0) { // Is this argument preserved? DestI->setName(I->getName()); // Copy the name over... - ValueMap[I] = DestI++; // Add mapping to ValueMap + VMap[I] = DestI++; // Add mapping to VMap } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. 
- CloneFunctionInto(NewF, F, ValueMap, Returns, "", CodeInfo); + CloneFunctionInto(NewF, F, VMap, Returns, "", CodeInfo); return NewF; } @@ -179,19 +179,19 @@ namespace { struct PruningFunctionCloner { Function *NewFunc; const Function *OldFunc; - DenseMap<const Value*, Value*> &ValueMap; + ValueToValueMapTy &VMap; SmallVectorImpl<ReturnInst*> &Returns; const char *NameSuffix; ClonedCodeInfo *CodeInfo; const TargetData *TD; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, - DenseMap<const Value*, Value*> &valueMap, + ValueToValueMapTy &valueMap, SmallVectorImpl<ReturnInst*> &returns, const char *nameSuffix, ClonedCodeInfo *codeInfo, const TargetData *td) - : NewFunc(newFunc), OldFunc(oldFunc), ValueMap(valueMap), Returns(returns), + : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { } @@ -202,7 +202,7 @@ namespace { public: /// ConstantFoldMappedInstruction - Constant fold the specified instruction, - /// mapping its operands through ValueMap if they are available. + /// mapping its operands through VMap if they are available. Constant *ConstantFoldMappedInstruction(const Instruction *I); }; } @@ -211,7 +211,7 @@ namespace { /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone){ - Value *&BBEntry = ValueMap[BB]; + Value *&BBEntry = VMap[BB]; // Have we already cloned this block? if (BBEntry) return; @@ -230,7 +230,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If this instruction constant folds, don't bother cloning the instruction, // instead, just add the constant to the value map. if (Constant *C = ConstantFoldMappedInstruction(II)) { - ValueMap[II] = C; + VMap[II] = C; continue; } @@ -238,7 +238,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - ValueMap[II] = NewInst; // Add instruction map to value. + VMap[II] = NewInst; // Add instruction map to value. hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { @@ -258,12 +258,12 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); // Or is a known constant in the caller... if (Cond == 0) - Cond = dyn_cast_or_null<ConstantInt>(ValueMap[BI->getCondition()]); + Cond = dyn_cast_or_null<ConstantInt>(VMap[BI->getCondition()]); // Constant fold to uncond branch! if (Cond) { BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue()); - ValueMap[OldTI] = BranchInst::Create(Dest, NewBB); + VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; } @@ -272,10 +272,10 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If switching on a value known constant in the caller. ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition()); if (Cond == 0) // Or known constant after constant prop in the callee... - Cond = dyn_cast_or_null<ConstantInt>(ValueMap[SI->getCondition()]); + Cond = dyn_cast_or_null<ConstantInt>(VMap[SI->getCondition()]); if (Cond) { // Constant fold to uncond branch! 
BasicBlock *Dest = SI->getSuccessor(SI->findCaseValue(Cond)); - ValueMap[OldTI] = BranchInst::Create(Dest, NewBB); + VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; } @@ -286,7 +286,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (OldTI->hasName()) NewInst->setName(OldTI->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - ValueMap[OldTI] = NewInst; // Add instruction map to value. + VMap[OldTI] = NewInst; // Add instruction map to value. // Recursively clone any reachable successor blocks. const TerminatorInst *TI = BB->getTerminator(); @@ -307,13 +307,13 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, } /// ConstantFoldMappedInstruction - Constant fold the specified instruction, -/// mapping its operands through ValueMap if they are available. +/// mapping its operands through VMap if they are available. Constant *PruningFunctionCloner:: ConstantFoldMappedInstruction(const Instruction *I) { SmallVector<Constant*, 8> Ops; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i), - ValueMap))) + VMap))) Ops.push_back(Op); else return 0; // All operands not constant! @@ -363,7 +363,7 @@ static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD) { /// dead. Since this doesn't produce an exact copy of the input, it can't be /// used for things like CloneFunction or CloneModule. void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, - DenseMap<const Value*, Value*> &ValueMap, + ValueToValueMapTy &VMap, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, @@ -374,10 +374,10 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, #ifndef NDEBUG for (Function::const_arg_iterator II = OldFunc->arg_begin(), E = OldFunc->arg_end(); II != E; ++II) - assert(ValueMap.count(II) && "No mapping from source argument specified!"); + assert(VMap.count(II) && "No mapping from source argument specified!"); #endif - PruningFunctionCloner PFC(NewFunc, OldFunc, ValueMap, Returns, + PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, Returns, NameSuffix, CodeInfo, TD); // Clone the entry block, and anything recursively reachable from it. @@ -397,14 +397,14 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, SmallVector<const PHINode*, 16> PHIToResolve; for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); BI != BE; ++BI) { - BasicBlock *NewBB = cast_or_null<BasicBlock>(ValueMap[BI]); + BasicBlock *NewBB = cast_or_null<BasicBlock>(VMap[BI]); if (NewBB == 0) continue; // Dead block. // Add the new block to the new function. NewFunc->getBasicBlockList().push_back(NewBB); // Loop over all of the instructions in the block, fixing up operand - // references as we go. This uses ValueMap to do all the hard work. + // references as we go. This uses VMap to do all the hard work. 
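The "hard work" the comment refers to is a single mechanical pass over every operand; condensed here from RemapInstruction (ValueMapper.cpp, further down in this patch):

  // Rewrite each operand of I to its clone, if one is recorded in VMap.
  // Globals and constants map to themselves, so they pass through untouched.
  static void remapOperands(Instruction *I, ValueToValueMapTy &VMap) {
    for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op)
      if (Value *Mapped = MapValue(*op, VMap))
        *op = Mapped;
  }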
// BasicBlock::iterator I = NewBB->begin(); @@ -455,7 +455,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, I->setMetadata(DbgKind, 0); } } - RemapInstruction(I, ValueMap); + RemapInstruction(I, VMap); } } @@ -465,19 +465,19 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, const PHINode *OPN = PHIToResolve[phino]; unsigned NumPreds = OPN->getNumIncomingValues(); const BasicBlock *OldBB = OPN->getParent(); - BasicBlock *NewBB = cast<BasicBlock>(ValueMap[OldBB]); + BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]); // Map operands for blocks that are live and remove operands for blocks // that are dead. for (; phino != PHIToResolve.size() && PHIToResolve[phino]->getParent() == OldBB; ++phino) { OPN = PHIToResolve[phino]; - PHINode *PN = cast<PHINode>(ValueMap[OPN]); + PHINode *PN = cast<PHINode>(VMap[OPN]); for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) { if (BasicBlock *MappedBlock = - cast_or_null<BasicBlock>(ValueMap[PN->getIncomingBlock(pred)])) { + cast_or_null<BasicBlock>(VMap[PN->getIncomingBlock(pred)])) { Value *InVal = MapValue(PN->getIncomingValue(pred), - ValueMap); + VMap); assert(InVal && "Unknown input value?"); PN->setIncomingValue(pred, InVal); PN->setIncomingBlock(pred, MappedBlock); @@ -531,15 +531,15 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, while ((PN = dyn_cast<PHINode>(I++))) { Value *NV = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NV); - assert(ValueMap[OldI] == PN && "ValueMap mismatch"); - ValueMap[OldI] = NV; + assert(VMap[OldI] == PN && "VMap mismatch"); + VMap[OldI] = NV; PN->eraseFromParent(); ++OldI; } } // NOTE: We cannot eliminate single entry phi nodes here, because of - // ValueMap. Single entry phi nodes can have multiple ValueMap entries - // pointing at them. Thus, deleting one would require scanning the ValueMap + // VMap. Single entry phi nodes can have multiple VMap entries + // pointing at them. Thus, deleting one would require scanning the VMap // to update any entries in it that would require that. This would be // really slow. } @@ -548,14 +548,14 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // and zap unconditional fall-through branches. This happen all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator I = cast<BasicBlock>(ValueMap[&OldFunc->getEntryBlock()]); + Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); while (I != NewFunc->end()) { BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } // Note that we can't eliminate uncond branches if the destination has // single-entry PHI nodes. Eliminating the single-entry phi nodes would - // require scanning the ValueMap to update any entries that point to the phi + // require scanning the VMap to update any entries that point to the phi // node. 
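To make the scanning cost mentioned in these notes concrete, a hand-written illustration with hypothetical values (not from the patch): constant folding during cloning can collapse several old values onto one new node, so the map's values, unlike its keys, need not be unique.

  // Two source values end up mapped to the same cloned PHI:
  static void illustrateAliasedEntries(ValueToValueMapTy &VMap,
                                       const Value *OldPhi,
                                       const Value *OldFoldedCast,
                                       PHINode *ClonedPhi) {
    VMap[OldPhi] = ClonedPhi;        // the PHI's own mapping
    VMap[OldFoldedCast] = ClonedPhi; // a folded cast collapsed onto it
    // Erasing ClonedPhi would leave the second entry dangling unless the
    // whole map were scanned -- exactly why the cloner declines to
    // eliminate single-entry PHIs here.
  }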
BasicBlock *Dest = BI->getSuccessor(0); if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) { diff --git a/lib/Transforms/Utils/CloneLoop.cpp b/lib/Transforms/Utils/CloneLoop.cpp index 38928dc..551b630 100644 --- a/lib/Transforms/Utils/CloneLoop.cpp +++ b/lib/Transforms/Utils/CloneLoop.cpp @@ -15,7 +15,6 @@ #include "llvm/BasicBlock.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/ADT/DenseMap.h" using namespace llvm; @@ -23,13 +22,13 @@ using namespace llvm; /// CloneDominatorInfo - Clone basicblock's dominator tree and, if available, /// dominance info. It is expected that basic block is already cloned. static void CloneDominatorInfo(BasicBlock *BB, - DenseMap<const Value *, Value *> &ValueMap, + ValueMap<const Value *, Value *> &VMap, DominatorTree *DT, DominanceFrontier *DF) { assert (DT && "DominatorTree is not available"); - DenseMap<const Value *, Value*>::iterator BI = ValueMap.find(BB); - assert (BI != ValueMap.end() && "BasicBlock clone is missing"); + ValueMap<const Value *, Value*>::iterator BI = VMap.find(BB); + assert (BI != VMap.end() && "BasicBlock clone is missing"); BasicBlock *NewBB = cast<BasicBlock>(BI->second); // NewBB already got dominator info. @@ -43,11 +42,11 @@ static void CloneDominatorInfo(BasicBlock *BB, // NewBB's dominator is either BB's dominator or BB's dominator's clone. BasicBlock *NewBBDom = BBDom; - DenseMap<const Value *, Value*>::iterator BBDomI = ValueMap.find(BBDom); - if (BBDomI != ValueMap.end()) { + ValueMap<const Value *, Value*>::iterator BBDomI = VMap.find(BBDom); + if (BBDomI != VMap.end()) { NewBBDom = cast<BasicBlock>(BBDomI->second); if (!DT->getNode(NewBBDom)) - CloneDominatorInfo(BBDom, ValueMap, DT, DF); + CloneDominatorInfo(BBDom, VMap, DT, DF); } DT->addNewBlock(NewBB, NewBBDom); @@ -60,8 +59,8 @@ static void CloneDominatorInfo(BasicBlock *BB, for (DominanceFrontier::DomSetType::iterator I = S.begin(), E = S.end(); I != E; ++I) { BasicBlock *DB = *I; - DenseMap<const Value*, Value*>::iterator IDM = ValueMap.find(DB); - if (IDM != ValueMap.end()) + ValueMap<const Value*, Value*>::iterator IDM = VMap.find(DB); + if (IDM != VMap.end()) NewDFSet.insert(cast<BasicBlock>(IDM->second)); else NewDFSet.insert(DB); @@ -71,10 +70,10 @@ static void CloneDominatorInfo(BasicBlock *BB, } } -/// CloneLoop - Clone Loop. Clone dominator info. Populate ValueMap +/// CloneLoop - Clone Loop. Clone dominator info. Populate VMap /// using old blocks to new blocks mapping. 
Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, - DenseMap<const Value *, Value *> &ValueMap, Pass *P) { + ValueMap<const Value *, Value *> &VMap, Pass *P) { DominatorTree *DT = NULL; DominanceFrontier *DF = NULL; @@ -104,8 +103,8 @@ Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) { BasicBlock *BB = *I; - BasicBlock *NewBB = CloneBasicBlock(BB, ValueMap, ".clone"); - ValueMap[BB] = NewBB; + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".clone"); + VMap[BB] = NewBB; if (P) LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L); NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); @@ -117,7 +116,7 @@ Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) { BasicBlock *BB = *I; - CloneDominatorInfo(BB, ValueMap, DT, DF); + CloneDominatorInfo(BB, VMap, DT, DF); } // Process sub loops @@ -125,7 +124,7 @@ Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, LoopNest.push_back(*I); } while (!LoopNest.empty()); - // Remap instructions to reference operands from ValueMap. + // Remap instructions to reference operands from VMap. for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(), NBE = NewBlocks.end(); NBItr != NBE; ++NBItr) { BasicBlock *NB = *NBItr; @@ -135,8 +134,8 @@ Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, for (unsigned index = 0, num_ops = Insn->getNumOperands(); index != num_ops; ++index) { Value *Op = Insn->getOperand(index); - DenseMap<const Value *, Value *>::iterator OpItr = ValueMap.find(Op); - if (OpItr != ValueMap.end()) + ValueMap<const Value *, Value *>::iterator OpItr = VMap.find(Op); + if (OpItr != VMap.end()) Insn->setOperand(index, OpItr->second); } } diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index b87c082..fc603d2 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -28,12 +28,12 @@ using namespace llvm; Module *llvm::CloneModule(const Module *M) { // Create the value map that maps things from the old module over to the new // module. - DenseMap<const Value*, Value*> ValueMap; - return CloneModule(M, ValueMap); + ValueToValueMapTy VMap; + return CloneModule(M, VMap); } Module *llvm::CloneModule(const Module *M, - DenseMap<const Value*, Value*> &ValueMap) { + ValueToValueMapTy &VMap) { // First off, we need to create the new module... Module *New = new Module(M->getModuleIdentifier(), M->getContext()); New->setDataLayout(M->getDataLayout()); @@ -51,7 +51,7 @@ Module *llvm::CloneModule(const Module *M, New->addLibrary(*I); // Loop over all of the global variables, making corresponding globals in the - // new module. Here we add them to the ValueMap and to the new Module. We + // new module. Here we add them to the VMap and to the new Module. We // don't worry about attributes or initializers, they will come later. 
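The comment above describes the first half of a two-phase structure; a condensed sketch of the whole shape, mirroring this file's own code:

  static void cloneGlobalsTwoPhase(const Module *M, Module *New,
                                   ValueToValueMapTy &VMap) {
    // Phase 1: bare declarations only, so every global gets a VMap entry.
    for (Module::const_global_iterator I = M->global_begin(),
           E = M->global_end(); I != E; ++I)
      VMap[I] = new GlobalVariable(*New, I->getType()->getElementType(),
                                   false, GlobalValue::ExternalLinkage,
                                   0, I->getName());
    // Phase 2: initializers can now reference any global -- even
    // cyclically -- because MapValue resolves each one through VMap.
    for (Module::const_global_iterator I = M->global_begin(),
           E = M->global_end(); I != E; ++I)
      if (I->hasInitializer())
        cast<GlobalVariable>(VMap[I])->setInitializer(
          cast<Constant>(MapValue(I->getInitializer(), VMap)));
  }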
// for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); @@ -62,7 +62,7 @@ Module *llvm::CloneModule(const Module *M, GlobalValue::ExternalLinkage, 0, I->getName()); GV->setAlignment(I->getAlignment()); - ValueMap[I] = GV; + VMap[I] = GV; } // Loop over the functions in the module, making external functions as before @@ -71,13 +71,13 @@ Module *llvm::CloneModule(const Module *M, Function::Create(cast<FunctionType>(I->getType()->getElementType()), GlobalValue::ExternalLinkage, I->getName(), New); NF->copyAttributesFrom(I); - ValueMap[I] = NF; + VMap[I] = NF; } // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) - ValueMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage, + VMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage, I->getName(), NULL, New); // Now that all of the things that global variable initializer can refer to @@ -86,10 +86,10 @@ Module *llvm::CloneModule(const Module *M, // for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { - GlobalVariable *GV = cast<GlobalVariable>(ValueMap[I]); + GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); if (I->hasInitializer()) GV->setInitializer(cast<Constant>(MapValue(I->getInitializer(), - ValueMap))); + VMap))); GV->setLinkage(I->getLinkage()); GV->setThreadLocal(I->isThreadLocal()); GV->setConstant(I->isConstant()); @@ -98,17 +98,17 @@ Module *llvm::CloneModule(const Module *M, // Similarly, copy over function bodies now... // for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *F = cast<Function>(ValueMap[I]); + Function *F = cast<Function>(VMap[I]); if (!I->isDeclaration()) { Function::arg_iterator DestI = F->arg_begin(); for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); ++J) { DestI->setName(J->getName()); - ValueMap[J] = DestI++; + VMap[J] = DestI++; } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, I, ValueMap, Returns); + CloneFunctionInto(F, I, VMap, Returns); } F->setLinkage(I->getLinkage()); @@ -117,11 +117,37 @@ Module *llvm::CloneModule(const Module *M, // And aliases for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - GlobalAlias *GA = cast<GlobalAlias>(ValueMap[I]); + GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); GA->setLinkage(I->getLinkage()); if (const Constant* C = I->getAliasee()) - GA->setAliasee(cast<Constant>(MapValue(C, ValueMap))); + GA->setAliasee(cast<Constant>(MapValue(C, VMap))); } - + + // And named metadata.... + for (Module::const_named_metadata_iterator I = M->named_metadata_begin(), + E = M->named_metadata_end(); I != E; ++I) { + const NamedMDNode &NMD = *I; + SmallVector<MDNode*, 4> MDs; + for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) + MDs.push_back(cast<MDNode>(MapValue(NMD.getOperand(i), VMap))); + NamedMDNode::Create(New->getContext(), NMD.getName(), + MDs.data(), MDs.size(), New); + } + + // Update metadata attach with instructions. 
+ for (Module::iterator MI = New->begin(), ME = New->end(); MI != ME; ++MI) + for (Function::iterator FI = MI->begin(), FE = MI->end(); + FI != FE; ++FI) + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); + BI != BE; ++BI) { + SmallVector<std::pair<unsigned, MDNode *>, 4 > MDs; + BI->getAllMetadata(MDs); + for (SmallVector<std::pair<unsigned, MDNode *>, 4>::iterator + MDI = MDs.begin(), MDE = MDs.end(); MDI != MDE; ++MDI) { + Value *MappedValue = MapValue(MDI->second, VMap); + if (MDI->second != MappedValue && MappedValue) + BI->setMetadata(MDI->first, cast<MDNode>(MappedValue)); + } + } return New; } diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index c908b4a..8e82a02 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -35,7 +35,7 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, I.eraseFromParent(); return 0; } - + // Create a stack slot to hold the value. AllocaInst *Slot; if (AllocaPoint) { @@ -46,7 +46,7 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem", F->getEntryBlock().begin()); } - + // Change all of the users of the instruction to read from the stack slot // instead. while (!I.use_empty()) { @@ -67,7 +67,7 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, Value *&V = Loads[PN->getIncomingBlock(i)]; if (V == 0) { // Insert the load into the predecessor block - V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, + V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, PN->getIncomingBlock(i)->getTerminator()); } PN->setIncomingValue(i, V); @@ -110,8 +110,8 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, /// The phi node is deleted and it returns the pointer to the alloca inserted. AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { if (P->use_empty()) { - P->eraseFromParent(); - return 0; + P->eraseFromParent(); + return 0; } // Create a stack slot to hold the value. @@ -124,23 +124,23 @@ AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", F->getEntryBlock().begin()); } - + // Iterate over each operand, insert store in each predecessor. for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) { - assert(II->getParent() != P->getIncomingBlock(i) && + assert(II->getParent() != P->getIncomingBlock(i) && "Invoke edge not supported yet"); II=II; } - new StoreInst(P->getIncomingValue(i), Slot, + new StoreInst(P->getIncomingValue(i), Slot, P->getIncomingBlock(i)->getTerminator()); } - + // Insert load in place of the phi and replace all uses. Value *V = new LoadInst(Slot, P->getName()+".reload", P); P->replaceAllUsesWith(V); - + // Delete phi. P->eraseFromParent(); - + return Slot; } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 91390bc..598e7d2 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -63,7 +63,8 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Next, create the new invoke instruction, inserting it at the end // of the old basic block. 
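The first hunk below swaps raw operand arithmetic for CallSite's argument view, the same getOperand(N+1) -> getArgOperand(N) migration applied throughout this patch. The motivation, sketched (CI is a stand-in call instruction): argument indexing should not depend on where the callee sits in the raw operand list, which differs between call and invoke and was in flux at the time.

  static void walkArguments(CallInst *CI) {
    Value *Callee = CI->getCalledValue(); // position-independent callee
    (void)Callee;
    // A CallSite presents one argument range for both calls and invokes;
    // the callee and, for invokes, the destination blocks are excluded.
    ImmutableCallSite CS(CI);
    unsigned ArgNo = 0;
    for (ImmutableCallSite::arg_iterator I = CS.arg_begin(),
           E = CS.arg_end(); I != E; ++I, ++ArgNo)
      assert(*I == CI->getArgOperand(ArgNo) && "the two views agree");
  }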
- SmallVector<Value*, 8> InvokeArgs(CI->op_begin()+1, CI->op_end()); + ImmutableCallSite CS(CI); + SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest, InvokeArgs.begin(), InvokeArgs.end(), @@ -169,7 +170,7 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, /// some edges of the callgraph may remain. static void UpdateCallGraphAfterInlining(CallSite CS, Function::iterator FirstNewBlock, - DenseMap<const Value*, Value*> &ValueMap, + ValueMap<const Value*, Value*> &VMap, InlineFunctionInfo &IFI) { CallGraph &CG = *IFI.CG; const Function *Caller = CS.getInstruction()->getParent()->getParent(); @@ -192,9 +193,9 @@ static void UpdateCallGraphAfterInlining(CallSite CS, for (; I != E; ++I) { const Value *OrigCall = I->first; - DenseMap<const Value*, Value*>::iterator VMI = ValueMap.find(OrigCall); + ValueMap<const Value*, Value*>::iterator VMI = VMap.find(OrigCall); // Only copy the edge if the call was inlined! - if (VMI == ValueMap.end() || VMI->second == 0) + if (VMI == VMap.end() || VMI->second == 0) continue; // If the call was inlined, but then constant folded, there is no edge to @@ -285,8 +286,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { ClonedCodeInfo InlinedFunctionInfo; Function::iterator FirstNewBlock; - { // Scope to destroy ValueMap after cloning. - DenseMap<const Value*, Value*> ValueMap; + { // Scope to destroy VMap after cloning. + ValueMap<const Value*, Value*> VMap; assert(CalledFunc->arg_size() == CS.arg_size() && "No varargs calls can be inlined!"); @@ -351,16 +352,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // Uses of the argument in the function should use our new alloca // instead. ActualArg = NewAlloca; + + // Calls that we inline may use the new alloca, so we need to clear + // their 'tail' flags. + MustClearTailCallFlags = true; } - ValueMap[I] = ActualArg; + VMap[I] = ActualArg; } // We want the inliner to prune the code as it copies. We would LOVE to // have no dead or constant instructions leftover after inlining occurs // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. - CloneAndPruneFunctionInto(Caller, CalledFunc, ValueMap, Returns, ".i", + CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, Returns, ".i", &InlinedFunctionInfo, IFI.TD, TheCall); // Remember the first block that is newly cloned over. @@ -368,7 +373,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // Update the callgraph if requested. 
if (IFI.CG) - UpdateCallGraphAfterInlining(CS, FirstNewBlock, ValueMap, IFI); + UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI); } // If there are any alloca instructions in the block that used to be the entry diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index df6e603..e90c30b 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -190,14 +190,15 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) { - BasicBlock *UserBB = cast<Instruction>(*UI)->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(*UI)) + User *U = *UI; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(U)) UserBB = PN->getIncomingBlock(UI); if (InstBB != UserBB && !inLoop(UserBB)) UsesToRewrite.push_back(&UI.getUse()); } - + // If there are no uses outside the loop, exit with no change. if (UsesToRewrite.empty()) return false; diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index d03f7a6..0b48a8f 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -35,111 +35,6 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// Local analysis. -// - -/// getUnderlyingObjectWithOffset - Strip off up to MaxLookup GEPs and -/// bitcasts to get back to the underlying object being addressed, keeping -/// track of the offset in bytes from the GEPs relative to the result. -/// This is closely related to Value::getUnderlyingObject but is located -/// here to avoid making VMCore depend on TargetData. -static Value *getUnderlyingObjectWithOffset(Value *V, const TargetData *TD, - uint64_t &ByteOffset, - unsigned MaxLookup = 6) { - if (!V->getType()->isPointerTy()) - return V; - for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { - if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - if (!GEP->hasAllConstantIndices()) - return V; - SmallVector<Value*, 8> Indices(GEP->op_begin() + 1, GEP->op_end()); - ByteOffset += TD->getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); - V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast) { - V = cast<Operator>(V)->getOperand(0); - } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { - if (GA->mayBeOverridden()) - return V; - V = GA->getAliasee(); - } else { - return V; - } - assert(V->getType()->isPointerTy() && "Unexpected operand type!"); - } - return V; -} - -/// isSafeToLoadUnconditionally - Return true if we know that executing a load -/// from this value cannot trap. If it is not obviously safe to load from the -/// specified pointer, we do a quick local scan of the basic block containing -/// ScanFrom, to determine if the address is already accessed. -bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, - unsigned Align, const TargetData *TD) { - uint64_t ByteOffset = 0; - Value *Base = V; - if (TD) - Base = getUnderlyingObjectWithOffset(V, TD, ByteOffset); - - const Type *BaseType = 0; - unsigned BaseAlign = 0; - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { - // An alloca is safe to load from as load as it is suitably aligned. - BaseType = AI->getAllocatedType(); - BaseAlign = AI->getAlignment(); - } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(Base)) { - // Global variables are safe to load from but their size cannot be - // guaranteed if they are overridden. 
- if (!isa<GlobalAlias>(GV) && !GV->mayBeOverridden()) { - BaseType = GV->getType()->getElementType(); - BaseAlign = GV->getAlignment(); - } - } - - if (BaseType && BaseType->isSized()) { - if (TD && BaseAlign == 0) - BaseAlign = TD->getPrefTypeAlignment(BaseType); - - if (Align <= BaseAlign) { - if (!TD) - return true; // Loading directly from an alloca or global is OK. - - // Check if the load is within the bounds of the underlying object. - const PointerType *AddrTy = cast<PointerType>(V->getType()); - uint64_t LoadSize = TD->getTypeStoreSize(AddrTy->getElementType()); - if (ByteOffset + LoadSize <= TD->getTypeAllocSize(BaseType) && - (Align == 0 || (ByteOffset % Align) == 0)) - return true; - } - } - - // Otherwise, be a little bit aggressive by scanning the local block where we - // want to check to see if the pointer is already being loaded or stored - // from/to. If so, the previous load or store would have already trapped, - // so there is no harm doing an extra load (also, CSE will later eliminate - // the load entirely). - BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin(); - - while (BBI != E) { - --BBI; - - // If we see a free or a call which may write to memory (i.e. which might do - // a free) the pointer could be marked invalid. - if (isa<CallInst>(BBI) && BBI->mayWriteToMemory() && - !isa<DbgInfoIntrinsic>(BBI)) - return false; - - if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { - if (LI->getOperand(0) == V) return true; - } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { - if (SI->getOperand(1) == V) return true; - } - } - return false; -} - - -//===----------------------------------------------------------------------===// // Local constant propagation. // @@ -537,9 +432,11 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { // Use that list to make another list of common predecessors of BB and Succ BlockSet CommonPreds; for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ); - PI != PE; ++PI) - if (BBPreds.count(*PI)) - CommonPreds.insert(*PI); + PI != PE; ++PI) { + BasicBlock *P = *PI; + if (BBPreds.count(P)) + CommonPreds.insert(P); + } // Shortcut, if there are no common predecessors, merging is always safe if (CommonPreds.empty()) diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 1ef3c32..4f4edf3 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -142,9 +142,11 @@ ReprocessLoop: if (*BB == L->getHeader()) continue; SmallPtrSet<BasicBlock *, 4> BadPreds; - for (pred_iterator PI = pred_begin(*BB), PE = pred_end(*BB); PI != PE; ++PI) - if (!L->contains(*PI)) - BadPreds.insert(*PI); + for (pred_iterator PI = pred_begin(*BB), PE = pred_end(*BB); PI != PE; ++PI){ + BasicBlock *P = *PI; + if (!L->contains(P)) + BadPreds.insert(P); + } // Delete each unique out-of-loop (and thus dead) predecessor. for (SmallPtrSet<BasicBlock *, 4>::iterator I = BadPreds.begin(), @@ -192,7 +194,7 @@ ReprocessLoop: if (!Preheader) { Preheader = InsertPreheaderForLoop(L); if (Preheader) { - NumInserted++; + ++NumInserted; Changed = true; } } @@ -215,7 +217,7 @@ ReprocessLoop: // allowed. if (!L->contains(*PI)) { if (RewriteLoopExitBlock(L, ExitBlock)) { - NumInserted++; + ++NumInserted; Changed = true; } break; @@ -244,7 +246,7 @@ ReprocessLoop: // loop header. 
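The predecessor loops above and below all receive one mechanical change, shown here in miniature: dereference the iterator once into a named local instead of repeatedly through *PI (the NumInserted++ -> ++NumInserted tweaks are the same flavor of cleanup).

  static void collectOutOfLoopPreds(Loop *L, BasicBlock *BB,
                                    SmallPtrSet<BasicBlock*, 4> &BadPreds) {
    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
         PI != PE; ++PI) {
      BasicBlock *P = *PI;       // single point of dereference
      if (!L->contains(P))
        BadPreds.insert(P);
    }
  }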
LoopLatch = InsertUniqueBackedgeBlock(L, Preheader); if (LoopLatch) { - NumInserted++; + ++NumInserted; Changed = true; } } @@ -353,16 +355,18 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) - if (!L->contains(*PI)) { // Coming in from outside the loop? + PI != PE; ++PI) { + BasicBlock *P = *PI; + if (!L->contains(P)) { // Coming in from outside the loop? // If the loop is branched to from an indirect branch, we won't // be able to fully transform the loop, because it prohibits // edge splitting. - if (isa<IndirectBrInst>((*PI)->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return 0; // Keep track of it. - OutsideBlocks.push_back(*PI); + OutsideBlocks.push_back(P); } + } // Split out the loop pre-header. BasicBlock *NewBB = @@ -385,13 +389,15 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { /// outside of the loop. BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { SmallVector<BasicBlock*, 8> LoopBlocks; - for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) - if (L->contains(*I)) { + for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { + BasicBlock *P = *I; + if (L->contains(P)) { // Don't do this if the loop is exited via an indirect branch. - if (isa<IndirectBrInst>((*I)->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return 0; - LoopBlocks.push_back(*I); + LoopBlocks.push_back(P); } + } assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], @@ -559,10 +565,11 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { // Determine which blocks should stay in L and which should be moved out to // the Outer loop now. std::set<BasicBlock*> BlocksInL; - for (pred_iterator PI = pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) - if (DT->dominates(Header, *PI)) - AddBlockAndPredsToSet(*PI, Header, BlocksInL); - + for (pred_iterator PI=pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) { + BasicBlock *P = *PI; + if (DT->dominates(Header, P)) + AddBlockAndPredsToSet(P, Header, BlocksInL); + } // Scan all of the loop children of L, moving them to OuterLoop if they are // not part of the inner loop. @@ -610,8 +617,10 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { // Figure out which basic blocks contain back-edges to the loop header. std::vector<BasicBlock*> BackedgeBlocks; - for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I) - if (*I != Preheader) BackedgeBlocks.push_back(*I); + for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){ + BasicBlock *P = *I; + if (P != Preheader) BackedgeBlocks.push_back(P); + } // Create and insert the new backedge block... 
BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 84fd1eb..e0e07e7 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -37,13 +37,13 @@ STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); /// RemapInstruction - Convert the instruction operands from referencing the -/// current values into those specified by ValueMap. +/// current values into those specified by VMap. static inline void RemapInstruction(Instruction *I, - DenseMap<const Value *, Value*> &ValueMap) { + ValueMap<const Value *, Value*> &VMap) { for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { Value *Op = I->getOperand(op); - DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op); - if (It != ValueMap.end()) + ValueMap<const Value *, Value*>::iterator It = VMap.find(Op); + if (It != VMap.end()) I->setOperand(op, It->second); } } @@ -183,7 +183,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) // For the first iteration of the loop, we should use the precloned values for // PHI nodes. Insert associations now. - typedef DenseMap<const Value*, Value*> ValueToValueMapTy; + typedef ValueMap<const Value*, Value*> ValueToValueMapTy; ValueToValueMapTy LastValueMap; std::vector<PHINode*> OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { @@ -205,26 +205,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(), E = LoopBlocks.end(); BB != E; ++BB) { - ValueToValueMapTy ValueMap; - BasicBlock *New = CloneBasicBlock(*BB, ValueMap, "." + Twine(It)); + ValueToValueMapTy VMap; + BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); // Loop over all of the PHI nodes in the block, changing them to use the // incoming values from the previous block. if (*BB == Header) for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { - PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]); + PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; - ValueMap[OrigPHINode[i]] = InVal; + VMap[OrigPHINode[i]] = InVal; New->getInstList().erase(NewPHI); } // Update our running map of newest clones LastValueMap[*BB] = New; - for (ValueToValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end(); + for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); VI != VE; ++VI) LastValueMap[VI->first] = VI->second; diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index 0ed8c72..2696e69 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -45,6 +45,7 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" @@ -62,10 +63,7 @@ static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support", namespace { class LowerInvoke : public FunctionPass { // Used for both models. 
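How the unroller hunks above chain their maps, condensed into one sketch: each iteration clones from the original body into a fresh VMap, then folds that map into LastValueMap so the next iteration resolves loop-carried values to the newest clone.

  static BasicBlock *cloneOneIteration(BasicBlock *BB, unsigned It,
                                       ValueToValueMapTy &LastValueMap) {
    ValueToValueMapTy VMap;                 // this iteration's clones only
    BasicBlock *New = CloneBasicBlock(BB, VMap, "." + Twine(It));
    LastValueMap[BB] = New;
    for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
         VI != VE; ++VI)
      LastValueMap[VI->first] = VI->second; // newest clone wins
    return New;
  }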
- Constant *WriteFn; Constant *AbortFn; - Value *AbortMessage; - unsigned AbortMessageLength; // Used for expensive EH support. const Type *JBLinkTy; @@ -92,10 +90,8 @@ namespace { } private: - void createAbortMessage(Module *M); - void writeAbortMessage(Instruction *IB); bool insertCheapEHSupport(Function &F); - void splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes); + void splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*>&Invokes); void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, AllocaInst *InvokeNum, AllocaInst *StackPtr, SwitchInst *CatchSwitch); @@ -123,7 +119,6 @@ FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, bool LowerInvoke::doInitialization(Module &M) { const Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); - AbortMessage = 0; if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; @@ -175,68 +170,14 @@ bool LowerInvoke::doInitialization(Module &M) { // We need the 'write' and 'abort' functions for both models. AbortFn = M.getOrInsertFunction("abort", Type::getVoidTy(M.getContext()), (Type *)0); -#if 0 // "write" is Unix-specific.. code is going away soon anyway. - WriteFn = M.getOrInsertFunction("write", Type::VoidTy, Type::Int32Ty, - VoidPtrTy, Type::Int32Ty, (Type *)0); -#else - WriteFn = 0; -#endif return true; } -void LowerInvoke::createAbortMessage(Module *M) { - if (useExpensiveEHSupport) { - // The abort message for expensive EH support tells the user that the - // program 'unwound' without an 'invoke' instruction. - Constant *Msg = - ConstantArray::get(M->getContext(), - "ERROR: Exception thrown, but not caught!\n"); - AbortMessageLength = Msg->getNumOperands()-1; // don't include \0 - - GlobalVariable *MsgGV = new GlobalVariable(*M, Msg->getType(), true, - GlobalValue::InternalLinkage, - Msg, "abortmsg"); - std::vector<Constant*> GEPIdx(2, - Constant::getNullValue(Type::getInt32Ty(M->getContext()))); - AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2); - } else { - // The abort message for cheap EH support tells the user that EH is not - // enabled. - Constant *Msg = - ConstantArray::get(M->getContext(), - "Exception handler needed, but not enabled." - "Recompile program with -enable-correct-eh-support.\n"); - AbortMessageLength = Msg->getNumOperands()-1; // don't include \0 - - GlobalVariable *MsgGV = new GlobalVariable(*M, Msg->getType(), true, - GlobalValue::InternalLinkage, - Msg, "abortmsg"); - std::vector<Constant*> GEPIdx(2, Constant::getNullValue( - Type::getInt32Ty(M->getContext()))); - AbortMessage = ConstantExpr::getGetElementPtr(MsgGV, &GEPIdx[0], 2); - } -} - - -void LowerInvoke::writeAbortMessage(Instruction *IB) { -#if 0 - if (AbortMessage == 0) - createAbortMessage(IB->getParent()->getParent()->getParent()); - - // These are the arguments we WANT... - Value* Args[3]; - Args[0] = ConstantInt::get(Type::Int32Ty, 2); - Args[1] = AbortMessage; - Args[2] = ConstantInt::get(Type::Int32Ty, AbortMessageLength); - (new CallInst(WriteFn, Args, 3, "", IB))->setTailCall(); -#endif -} - bool LowerInvoke::insertCheapEHSupport(Function &F) { bool Changed = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - std::vector<Value*> CallArgs(II->op_begin(), II->op_end() - 3); + SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3); // Insert a normal call instruction... 
CallInst *NewCall = CallInst::Create(II->getCalledValue(), CallArgs.begin(), CallArgs.end(), @@ -257,9 +198,6 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { ++NumInvokes; Changed = true; } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - // Insert a new call to write(2, AbortMessage, AbortMessageLength); - writeAbortMessage(UI); - // Insert a call to abort() CallInst::Create(AbortFn, "", UI)->setTailCall(); @@ -320,7 +258,7 @@ void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, CatchSwitch->addCase(InvokeNoC, II->getUnwindDest()); // Insert a normal call instruction. - std::vector<Value*> CallArgs(II->op_begin(), II->op_end() - 3); + SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), CallArgs.begin(), CallArgs.end(), "", II); @@ -349,7 +287,7 @@ static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) { // across the unwind edge. This process also splits all critical edges // coming out of invoke's. void LowerInvoke:: -splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) { +splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { // First step, split all critical edges from invoke instructions. for (unsigned i = 0, e = Invokes.size(); i != e; ++i) { InvokeInst *II = Invokes[i]; @@ -371,16 +309,33 @@ splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) { ++AfterAllocaInsertPt; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { - // This is always a no-op cast because we're casting AI to AI->getType() so - // src and destination types are identical. BitCast is the only possibility. - CastInst *NC = new BitCastInst( - AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); - AI->replaceAllUsesWith(NC); - // Normally its is forbidden to replace a CastInst's operand because it - // could cause the opcode to reflect an illegal conversion. However, we're - // replacing it here with the same value it was constructed with to simply - // make NC its user. - NC->setOperand(0, AI); + const Type *Ty = AI->getType(); + // Aggregate types can't be cast, but are legal argument types, so we have + // to handle them differently. We use an extract/insert pair as a + // lightweight method to achieve the same goal. + if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { + Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt); + Instruction *NI = InsertValueInst::Create(AI, EI, 0); + NI->insertAfter(EI); + AI->replaceAllUsesWith(NI); + // Set the operand of the instructions back to the AllocaInst. + EI->setOperand(0, AI); + NI->setOperand(0, AI); + } else { + // This is always a no-op cast because we're casting AI to AI->getType() + // so src and destination types are identical. BitCast is the only + // possibility. + CastInst *NC = new BitCastInst( + AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); + AI->replaceAllUsesWith(NC); + // Set the operand of the cast instruction back to the AllocaInst. + // Normally it's forbidden to replace a CastInst's operand because it + // could cause the opcode to reflect an illegal conversion. However, + // we're replacing it here with the same value it was constructed with. + // We do this because the above replaceAllUsesWith() clobbered the + // operand, but we want this one to remain. + NC->setOperand(0, AI); + } } // Finally, scan the code looking for instructions with bad live ranges. 
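The bitcast branch above hinges on a deliberately self-referential no-op, isolated here as a sketch: give a value a same-typed stand-in user, redirect every real use to it, then point the stand-in back at the original.

  static CastInst *makeNoOpStandIn(Value *V, Instruction *InsertPt) {
    CastInst *NC = new BitCastInst(V, V->getType(), V->getName() + ".tmp",
                                   InsertPt);
    V->replaceAllUsesWith(NC); // this also rewrote NC's own operand...
    NC->setOperand(0, V);      // ...so restore it; NC is now V's only user
    return NC;
  }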
@@ -402,7 +357,7 @@ splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) { continue; // Avoid iterator invalidation by copying users to a temporary vector. - std::vector<Instruction*> Users; + SmallVector<Instruction*,16> Users; for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) { Instruction *User = cast<Instruction>(*UI); @@ -452,9 +407,9 @@ splitLiveRangesLiveAcrossInvokes(std::vector<InvokeInst*> &Invokes) { } bool LowerInvoke::insertExpensiveEHSupport(Function &F) { - std::vector<ReturnInst*> Returns; - std::vector<UnwindInst*> Unwinds; - std::vector<InvokeInst*> Invokes; + SmallVector<ReturnInst*,16> Returns; + SmallVector<UnwindInst*,16> Unwinds; + SmallVector<InvokeInst*,16> Invokes; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { @@ -502,12 +457,11 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin()); - std::vector<Value*> Idx; - Idx.push_back(Constant::getNullValue(Type::getInt32Ty(F.getContext()))); - Idx.push_back(ConstantInt::get(Type::getInt32Ty(F.getContext()), 1)); - OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(), + Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), + ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) }; + OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2], "OldBuf", - EntryBB->getTerminator()); + EntryBB->getTerminator()); // Copy the JBListHead to the alloca. Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true, @@ -552,7 +506,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { "setjmp.cont"); Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0); - Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx.begin(), Idx.end(), + Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2], "TheJmpBuf", EntryBB->getTerminator()); JmpBufPtr = new BitCastInst(JmpBufPtr, @@ -605,24 +559,20 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Create the block to do the longjmp. // Get a pointer to the jmpbuf and longjmp. - std::vector<Value*> Idx; - Idx.push_back(Constant::getNullValue(Type::getInt32Ty(F.getContext()))); - Idx.push_back(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)); - Idx[0] = GetElementPtrInst::Create(BufPtr, Idx.begin(), Idx.end(), "JmpBuf", + Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), + ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) }; + Idx[0] = GetElementPtrInst::Create(BufPtr, &Idx[0], &Idx[2], "JmpBuf", UnwindBlock); Idx[0] = new BitCastInst(Idx[0], Type::getInt8PtrTy(F.getContext()), "tmp", UnwindBlock); Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - CallInst::Create(LongJmpFn, Idx.begin(), Idx.end(), "", UnwindBlock); + CallInst::Create(LongJmpFn, &Idx[0], &Idx[2], "", UnwindBlock); new UnreachableInst(F.getContext(), UnwindBlock); // Set up the term block ("throw without a catch"). 
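The Idx rewrites above all adopt one small idiom, shown in isolation (Ctx, Ptr, and InsertPt are stand-ins): a fixed-size stack array with a [&Idx[0], &Idx[2]) pointer range replaces a heap-allocating std::vector for two constant GEP indices.

  static Value *gepConstantPair(LLVMContext &Ctx, Value *Ptr,
                                Instruction *InsertPt) {
    Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(Ctx)),
                     ConstantInt::get(Type::getInt32Ty(Ctx), 1) };
    return GetElementPtrInst::Create(Ptr, &Idx[0], &Idx[2], "gep", InsertPt);
  }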
new UnreachableInst(F.getContext(), TermBlock); - // Insert a new call to write(2, AbortMessage, AbortMessageLength); - writeAbortMessage(TermBlock->getTerminator()); - // Insert a call to abort() CallInst::Create(AbortFn, "", TermBlock->getTerminator())->setTailCall(); diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 13f0a28..c0de193 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -69,11 +69,12 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { // Only allow direct and non-volatile loads and stores... for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE; ++UI) // Loop over all of the uses of the alloca - if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + UI != UE; ++UI) { // Loop over all of the uses of the alloca + const User *U = *UI; + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { if (LI->isVolatile()) return false; - } else if (const StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI) return false; // Don't allow a store OF the AI, only INTO the AI. if (SI->isVolatile()) @@ -81,6 +82,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { } else { return false; } + } return true; } @@ -603,9 +605,8 @@ ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, // To determine liveness, we must iterate through the predecessors of blocks // where the def is live. Blocks are added to the worklist if we need to // check their predecessors. Start with all the using blocks. - SmallVector<BasicBlock*, 64> LiveInBlockWorklist; - LiveInBlockWorklist.insert(LiveInBlockWorklist.end(), - Info.UsingBlocks.begin(), Info.UsingBlocks.end()); + SmallVector<BasicBlock*, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(), + Info.UsingBlocks.end()); // If any of the using blocks is also a definition block, check to see if the // definition occurs before or after the use. If it happens before the use, @@ -897,6 +898,9 @@ void PromoteMem2Reg::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, // Propagate any debug metadata from the store onto the dbg.value. if (MDNode *SIMD = SI->getMetadata("dbg")) DbgVal->setMetadata("dbg", SIMD); + // Otherwise propagate debug metadata from dbg.declare. + else if (MDNode *MD = DDI->getMetadata("dbg")) + DbgVal->setMetadata("dbg", MD); } // QueuePhiNode - queues a phi-node to be added to a basic-block for a specific diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 9f2209d..fd3ed3e 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1513,17 +1513,19 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // Okay, we're going to insert the PHI node. Since PBI is not the only // predecessor, compute the PHI'd conditional value for all of the preds. // Any predecessor where the condition is not computable we keep symbolic. 
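Restating the isAllocaPromotable loop above as a standalone predicate: an alloca is promotable to a register only when every user is a direct, non-volatile load, or a non-volatile store into it; anything else -- including a store of the alloca's own address -- disqualifies it.

  static bool isPromotableSketch(const AllocaInst *AI) {
    for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
         UI != UE; ++UI) {
      const User *U = *UI;
      if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
        if (LI->isVolatile()) return false;
      } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getOperand(0) == AI) return false; // address would escape
        if (SI->isVolatile()) return false;
      } else {
        return false; // GEPs, casts, calls, etc. all block promotion
      }
    }
    return true;
  }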
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - if ((PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) && + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; + if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI && PBI->isConditional() && PBI->getCondition() == BI->getCondition() && PBI->getSuccessor(0) != PBI->getSuccessor(1)) { bool CondIsTrue = PBI->getSuccessor(0) == BB; NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()), - CondIsTrue), *PI); + CondIsTrue), P); } else { - NewPN->addIncoming(BI->getCondition(), *PI); + NewPN->addIncoming(BI->getCondition(), P); } + } BI->setCondition(NewPN); return true; @@ -1697,10 +1699,11 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { SmallVector<BasicBlock*, 8> UncondBranchPreds; SmallVector<BranchInst*, 8> CondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - TerminatorInst *PTI = (*PI)->getTerminator(); + BasicBlock *P = *PI; + TerminatorInst *PTI = P->getTerminator(); if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) { if (BI->isUnconditional()) - UncondBranchPreds.push_back(*PI); + UncondBranchPreds.push_back(P); else CondBranchPreds.push_back(BI); } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 87ce631..3f6a90c 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -28,7 +28,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) { // DenseMap. This includes any recursive calls to MapValue. // Global values and non-function-local metadata do not need to be seeded into - // the ValueMap if they are using the identity mapping. + // the VM if they are using the identity mapping. if (isa<GlobalValue>(V) || isa<InlineAsm>(V) || isa<MDString>(V) || (isa<MDNode>(V) && !cast<MDNode>(V)->isFunctionLocal())) return VMSlot = const_cast<Value*>(V); @@ -45,7 +45,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) { if (isa<ConstantInt>(C) || isa<ConstantFP>(C) || isa<ConstantPointerNull>(C) || isa<ConstantAggregateZero>(C) || - isa<UndefValue>(C) || isa<MDString>(C)) + isa<UndefValue>(C)) return VMSlot = C; // Primitive constants map directly if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) { @@ -125,11 +125,11 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) { } /// RemapInstruction - Convert the instruction operands from referencing the -/// current values into those specified by ValueMap. +/// current values into those specified by VMap. 
/// -void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &ValueMap) { +void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap) { for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { - Value *V = MapValue(*op, ValueMap); + Value *V = MapValue(*op, VMap); assert(V && "Referenced value not in value map!"); *op = V; } diff --git a/lib/Transforms/Utils/ValueMapper.h b/lib/Transforms/Utils/ValueMapper.h index d61c24c..f4ff643 100644 --- a/lib/Transforms/Utils/ValueMapper.h +++ b/lib/Transforms/Utils/ValueMapper.h @@ -15,12 +15,12 @@ #ifndef VALUEMAPPER_H #define VALUEMAPPER_H -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/ValueMap.h" namespace llvm { class Value; class Instruction; - typedef DenseMap<const Value *, Value *> ValueToValueMapTy; + typedef ValueMap<const Value *, Value *> ValueToValueMapTy; Value *MapValue(const Value *V, ValueToValueMapTy &VM); void RemapInstruction(Instruction *I, ValueToValueMapTy &VM); diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp index e48c026..7a471ef 100644 --- a/lib/VMCore/AsmWriter.cpp +++ b/lib/VMCore/AsmWriter.cpp @@ -70,8 +70,7 @@ static const Module *getModuleFromVal(const Value *V) { // PrintEscapedString - Print each character of the specified string, escaping // it if it is not printable or if it is an escape char. -static void PrintEscapedString(const StringRef &Name, - raw_ostream &Out) { +static void PrintEscapedString(StringRef Name, raw_ostream &Out) { for (unsigned i = 0, e = Name.size(); i != e; ++i) { unsigned char C = Name[i]; if (isprint(C) && C != '\\' && C != '"') @@ -1419,6 +1418,9 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT, case GlobalValue::ExternalLinkage: break; case GlobalValue::PrivateLinkage: Out << "private "; break; case GlobalValue::LinkerPrivateLinkage: Out << "linker_private "; break; + case GlobalValue::LinkerPrivateWeakLinkage: + Out << "linker_private_weak "; + break; case GlobalValue::InternalLinkage: Out << "internal "; break; case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce "; break; case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break; @@ -1469,8 +1471,11 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { writeOperand(GV->getInitializer(), false); } - if (GV->hasSection()) - Out << ", section \"" << GV->getSection() << '"'; + if (GV->hasSection()) { + Out << ", section \""; + PrintEscapedString(GV->getSection(), Out); + Out << '"'; + } if (GV->getAlignment()) Out << ", align " << GV->getAlignment(); @@ -1628,8 +1633,11 @@ void AssemblyWriter::printFunction(const Function *F) { Attributes FnAttrs = Attrs.getFnAttributes(); if (FnAttrs != Attribute::None) Out << ' ' << Attribute::getAsString(Attrs.getFnAttributes()); - if (F->hasSection()) - Out << " section \"" << F->getSection() << '"'; + if (F->hasSection()) { + Out << " section \""; + PrintEscapedString(F->getSection(), Out); + Out << '"'; + } if (F->getAlignment()) Out << " align " << F->getAlignment(); if (F->hasGC()) @@ -1854,6 +1862,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { default: Out << " cc" << CI->getCallingConv(); break; } + Operand = CI->getCalledValue(); const PointerType *PTy = cast<PointerType>(Operand->getType()); const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); const Type *RetTy = FTy->getReturnType(); @@ -1877,10 +1886,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(Operand, true); } Out << '('; - for (unsigned op = 1, Eop = I.getNumOperands(); op < Eop; 
++op) { - if (op > 1) + for (unsigned op = 0, Eop = CI->getNumArgOperands(); op < Eop; ++op) { + if (op > 0) Out << ", "; - writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op)); + writeParamOperand(CI->getArgOperand(op), PAL.getParamAttributes(op + 1)); } Out << ')'; if (PAL.getFnAttributes() != Attribute::None) @@ -1925,10 +1934,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(Operand, true); } Out << '('; - for (unsigned op = 0, Eop = I.getNumOperands() - 3; op < Eop; ++op) { + for (unsigned op = 0, Eop = II->getNumArgOperands(); op < Eop; ++op) { if (op) Out << ", "; - writeParamOperand(I.getOperand(op), PAL.getParamAttributes(op + 1)); + writeParamOperand(II->getArgOperand(op), PAL.getParamAttributes(op + 1)); } Out << ')'; @@ -2027,7 +2036,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } static void WriteMDNodeComment(const MDNode *Node, - formatted_raw_ostream &Out) { + formatted_raw_ostream &Out) { if (Node->getNumOperands() < 1) return; ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Node->getOperand(0)); diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 0144210..dc39024 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -18,6 +18,7 @@ #include "llvm/Module.h" #include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/IRBuilder.h" #include <cstring> @@ -314,7 +315,8 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) { void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); LLVMContext &C = CI->getContext(); - + ImmutableCallSite CS(CI); + assert(F && "CallInst has no function associated with it."); if (!NewFn) { @@ -344,11 +346,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (isLoadH || isLoadL || isMovL || isMovSD || isShufPD || isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) { std::vector<Constant*> Idxs; - Value *Op0 = CI->getOperand(1); + Value *Op0 = CI->getArgOperand(0); ShuffleVectorInst *SI = NULL; if (isLoadH || isLoadL) { Value *Op1 = UndefValue::get(Op0->getType()); - Value *Addr = new BitCastInst(CI->getOperand(2), + Value *Addr = new BitCastInst(CI->getArgOperand(1), Type::getDoublePtrTy(C), "upgraded.", CI); Value *Load = new LoadInst(Addr, "upgraded.", false, 8, CI); @@ -381,7 +383,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { SI = new ShuffleVectorInst(ZeroV, Op0, Mask, "upgraded.", CI); } else if (isMovSD || isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) { - Value *Op1 = CI->getOperand(2); + Value *Op1 = CI->getArgOperand(1); if (isMovSD) { Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), 2)); Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), 1)); @@ -395,8 +397,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Mask = ConstantVector::get(Idxs); SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI); } else if (isShufPD) { - Value *Op1 = CI->getOperand(2); - unsigned MaskVal = cast<ConstantInt>(CI->getOperand(3))->getZExtValue(); + Value *Op1 = CI->getArgOperand(1); + unsigned MaskVal = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue(); Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), MaskVal & 1)); Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), ((MaskVal >> 1) & 1)+2)); @@ -416,8 +418,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { 
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 0144210..dc39024 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Module.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CallSite.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/IRBuilder.h"
 #include <cstring>
@@ -314,7 +315,8 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   Function *F = CI->getCalledFunction();
   LLVMContext &C = CI->getContext();
-
+  ImmutableCallSite CS(CI);
+
   assert(F && "CallInst has no function associated with it.");
 
   if (!NewFn) {
@@ -344,11 +346,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     if (isLoadH || isLoadL || isMovL || isMovSD || isShufPD ||
         isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) {
       std::vector<Constant*> Idxs;
-      Value *Op0 = CI->getOperand(1);
+      Value *Op0 = CI->getArgOperand(0);
       ShuffleVectorInst *SI = NULL;
       if (isLoadH || isLoadL) {
         Value *Op1 = UndefValue::get(Op0->getType());
-        Value *Addr = new BitCastInst(CI->getOperand(2),
+        Value *Addr = new BitCastInst(CI->getArgOperand(1),
                                   Type::getDoublePtrTy(C),
                                   "upgraded.", CI);
         Value *Load = new LoadInst(Addr, "upgraded.", false, 8, CI);
@@ -381,7 +383,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         SI = new ShuffleVectorInst(ZeroV, Op0, Mask, "upgraded.", CI);
       } else if (isMovSD ||
                  isUnpckhPD || isUnpcklPD || isPunpckhQPD || isPunpcklQPD) {
-        Value *Op1 = CI->getOperand(2);
+        Value *Op1 = CI->getArgOperand(1);
         if (isMovSD) {
           Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), 2));
           Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
@@ -395,8 +397,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         Value *Mask = ConstantVector::get(Idxs);
         SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI);
       } else if (isShufPD) {
-        Value *Op1 = CI->getOperand(2);
-        unsigned MaskVal = cast<ConstantInt>(CI->getOperand(3))->getZExtValue();
+        Value *Op1 = CI->getArgOperand(1);
+        unsigned MaskVal = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
         Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), MaskVal & 1));
         Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C),
                                         ((MaskVal >> 1) & 1)+2));
@@ -416,8 +418,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       CI->eraseFromParent();
     } else if (F->getName() == "llvm.x86.sse41.pmulld") {
       // Upgrade this set of intrinsics into vector multiplies.
-      Instruction *Mul = BinaryOperator::CreateMul(CI->getOperand(1),
-                                                   CI->getOperand(2),
+      Instruction *Mul = BinaryOperator::CreateMul(CI->getArgOperand(0),
+                                                   CI->getArgOperand(1),
                                                    CI->getName(),
                                                    CI);
       // Fix up all the uses with our new multiply.
@@ -427,9 +429,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       // Remove upgraded multiply.
       CI->eraseFromParent();
     } else if (F->getName() == "llvm.x86.ssse3.palign.r") {
-      Value *Op1 = CI->getOperand(1);
-      Value *Op2 = CI->getOperand(2);
-      Value *Op3 = CI->getOperand(3);
+      Value *Op1 = CI->getArgOperand(0);
+      Value *Op2 = CI->getArgOperand(1);
+      Value *Op3 = CI->getArgOperand(2);
       unsigned shiftVal = cast<ConstantInt>(Op3)->getZExtValue();
       Value *Rep;
       IRBuilder<> Builder(C);
@@ -483,9 +485,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
 
       CI->eraseFromParent();
     } else if (F->getName() == "llvm.x86.ssse3.palign.r.128") {
-      Value *Op1 = CI->getOperand(1);
-      Value *Op2 = CI->getOperand(2);
-      Value *Op3 = CI->getOperand(3);
+      Value *Op1 = CI->getArgOperand(0);
+      Value *Op2 = CI->getArgOperand(1);
+      Value *Op3 = CI->getArgOperand(2);
       unsigned shiftVal = cast<ConstantInt>(Op3)->getZExtValue();
       Value *Rep;
       IRBuilder<> Builder(C);
@@ -556,10 +558,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::x86_mmx_psrl_w: {
     Value *Operands[2];
 
-    Operands[0] = CI->getOperand(1);
+    Operands[0] = CI->getArgOperand(0);
 
     // Cast the second parameter to the correct type.
-    BitCastInst *BC = new BitCastInst(CI->getOperand(2),
+    BitCastInst *BC = new BitCastInst(CI->getArgOperand(1),
                                       NewFn->getFunctionType()->getParamType(1),
                                       "upgraded.", CI);
     Operands[1] = BC;
@@ -583,9 +585,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::ctlz:
   case Intrinsic::ctpop:
   case Intrinsic::cttz: {
-    //  Build a small vector of the 1..(N-1) operands, which are the
-    //  parameters.
-    SmallVector<Value*, 8> Operands(CI->op_begin()+1, CI->op_end());
+    //  Build a small vector of the original arguments.
+    SmallVector<Value*, 8> Operands(CS.arg_begin(), CS.arg_end());
 
     // Construct a new CallInst
     CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
@@ -620,7 +621,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::eh_selector:
   case Intrinsic::eh_typeid_for: {
     // Only the return type changed.
-    SmallVector<Value*, 8> Operands(CI->op_begin() + 1, CI->op_end());
+    SmallVector<Value*, 8> Operands(CS.arg_begin(), CS.arg_end());
     CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
                                        "upgraded." + CI->getName(), CI);
     NewCI->setTailCall(CI->isTailCall());
@@ -643,8 +644,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::memset: {
     // Add isVolatile
     const llvm::Type *I1Ty = llvm::Type::getInt1Ty(CI->getContext());
-    Value *Operands[5] = { CI->getOperand(1), CI->getOperand(2),
-                           CI->getOperand(3), CI->getOperand(4),
+    Value *Operands[5] = { CI->getArgOperand(0), CI->getArgOperand(1),
+                           CI->getArgOperand(2), CI->getArgOperand(3),
                            llvm::ConstantInt::get(I1Ty, 0) };
     CallInst *NewCI = CallInst::Create(NewFn, Operands, Operands+5,
                                        CI->getName(), CI);
@@ -726,7 +727,8 @@ void llvm::CheckDebugInfoIntrinsics(Module *M) {
   if (Function *Declare = M->getFunction("llvm.dbg.declare")) {
     if (!Declare->use_empty()) {
       DbgDeclareInst *DDI = cast<DbgDeclareInst>(Declare->use_back());
-      if (!isa<MDNode>(DDI->getOperand(1)) ||!isa<MDNode>(DDI->getOperand(2))) {
+      if (!isa<MDNode>(DDI->getArgOperand(0)) ||
+          !isa<MDNode>(DDI->getArgOperand(1))) {
         while (!Declare->use_empty()) {
           CallInst *CI = cast<CallInst>(Declare->use_back());
           CI->eraseFromParent();
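The upgrade paths above move from CI->getOperand(i+1) to CI->getArgOperand(i), and to CS.arg_begin()/CS.arg_end() when a whole argument range is needed. A small usage sketch along the same lines (the helper name is ours):

    #include "llvm/Instructions.h"
    #include "llvm/Support/CallSite.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Copy the arguments of a call, and only the arguments: the call-site
    // iterator range excludes the callee regardless of operand layout.
    static void collectArgs(CallInst *CI, SmallVectorImpl<Value*> &Args) {
      ImmutableCallSite CS(CI);
      Args.append(CS.arg_begin(), CS.arg_end());
    }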
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index 549977c..3567266 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -658,7 +658,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
         }
       }
       // Handle an offsetof-like expression.
-      if (Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()){
+      if (Ty->isStructTy() || Ty->isArrayTy()) {
         if (Constant *C = getFoldedOffsetOf(Ty, CE->getOperand(2),
                                             DestTy, false))
           return C;
@@ -1817,8 +1817,15 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
       return Constant::getAllOnesValue(ResultTy);
 
   // Handle some degenerate cases first
-  if (isa<UndefValue>(C1) || isa<UndefValue>(C2))
+  if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
+    // For EQ and NE, we can always pick a value for the undef to make the
+    // predicate pass or fail, so we can return undef.
+    if (ICmpInst::isEquality(ICmpInst::Predicate(pred)))
+      return UndefValue::get(ResultTy);
+    // Otherwise, pick the same value as the non-undef operand, and fold
+    // it to true or false.
     return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(pred));
+  }
 
   // No compile-time operations on this type yet.
   if (C1->getType()->isPPC_FP128Ty())
@@ -2194,7 +2201,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
       }
 
       NewIndices.push_back(Combined);
-      NewIndices.insert(NewIndices.end(), Idxs+1, Idxs+NumIdx);
+      NewIndices.append(Idxs+1, Idxs+NumIdx);
       return (inBounds && cast<GEPOperator>(CE)->isInBounds()) ?
         ConstantExpr::getInBoundsGetElementPtr(CE->getOperand(0),
                                                &NewIndices[0],
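The new undef-compare branch above is self-contained enough to restate on its own. A sketch of the same fold, mirroring the hunk (the helper name is ours):

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    using namespace llvm;

    // icmp eq/ne with an undef operand folds to undef, since undef may be
    // chosen to make the comparison either succeed or fail; relational
    // predicates keep the older "true when equal" constant fold.
    static Constant *foldCmpWithUndef(CmpInst::Predicate Pred,
                                      const Type *ResultTy) {
      if (ICmpInst::isEquality(Pred))
        return UndefValue::get(ResultTy);
      return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(Pred));
    }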
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index bbf1375..ca1a399 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -1058,6 +1058,8 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
     return LLVMPrivateLinkage;
   case GlobalValue::LinkerPrivateLinkage:
     return LLVMLinkerPrivateLinkage;
+  case GlobalValue::LinkerPrivateWeakLinkage:
+    return LLVMLinkerPrivateWeakLinkage;
   case GlobalValue::DLLImportLinkage:
     return LLVMDLLImportLinkage;
   case GlobalValue::DLLExportLinkage:
@@ -1108,6 +1110,9 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
   case LLVMLinkerPrivateLinkage:
     GV->setLinkage(GlobalValue::LinkerPrivateLinkage);
     break;
+  case LLVMLinkerPrivateWeakLinkage:
+    GV->setLinkage(GlobalValue::LinkerPrivateWeakLinkage);
+    break;
   case LLVMDLLImportLinkage:
     GV->setLinkage(GlobalValue::DLLImportLinkage);
     break;
@@ -2205,15 +2210,14 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile(
 
 LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
                                          char **OutMessage) {
-  MemoryBuffer *MB = MemoryBuffer::getSTDIN();
-  if (!MB->getBufferSize()) {
-    delete MB;
-    *OutMessage = strdup("stdin is empty.");
-    return 1;
+  std::string Error;
+  if (MemoryBuffer *MB = MemoryBuffer::getSTDIN(&Error)) {
+    *OutMemBuf = wrap(MB);
+    return 0;
   }
 
-  *OutMemBuf = wrap(MB);
-  return 0;
+  *OutMessage = strdup(Error.c_str());
+  return 1;
 }
 
 void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) {
diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp
index a37fe07..9792ada 100644
--- a/lib/VMCore/Instruction.cpp
+++ b/lib/VMCore/Instruction.cpp
@@ -286,9 +286,10 @@ bool Instruction::isUsedOutsideOfBlock(const BasicBlock *BB) const {
   for (const_use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
     // PHI nodes use values in the corresponding predecessor block.  For other
     // instructions, just check to see whether the parent of the use matches up.
-    const PHINode *PN = dyn_cast<PHINode>(*UI);
+    const User *U = *UI;
+    const PHINode *PN = dyn_cast<PHINode>(U);
     if (PN == 0) {
-      if (cast<Instruction>(*UI)->getParent() != BB)
+      if (cast<Instruction>(U)->getParent() != BB)
         return true;
       continue;
     }
@@ -401,12 +402,20 @@ bool Instruction::isSafeToSpeculativelyExecute() const {
       return false;
     // Note that it is not safe to speculate into a malloc'd region because
    // malloc may return null.
+    // It's also not safe to follow a bitcast, for example:
+    //   bitcast i8* (alloca i8) to i32*
+    //   would result in a 4-byte load from a 1-byte alloca.
+    Value *Op0 = getOperand(0);
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0)) {
+      // TODO: it's safe to do this for any GEP with constant indices that
+      // compute inside the allocated type, but not for any inbounds gep.
+      if (GEP->hasAllZeroIndices())
+        Op0 = GEP->getPointerOperand();
+    }
-    if (isa<AllocaInst>(getOperand(0)))
+    if (isa<AllocaInst>(Op0))
       return true;
     if (GlobalVariable *GV = dyn_cast<GlobalVariable>(getOperand(0)))
       return !GV->hasExternalWeakLinkage();
-    // FIXME: Handle cases involving GEPs.  We have to be careful because
-    // a load of a out-of-bounds GEP has undefined behavior.
     return false;
   }
   case Call:
@@ -421,6 +430,7 @@ bool Instruction::isSafeToSpeculativelyExecute() const {
   case Store:
   case Ret:
   case Br:
+  case IndirectBr:
   case Switch:
   case Unwind:
   case Unreachable:
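The speculation change above is the interesting one in this file: a load may only be hoisted past a GEP when the GEP provably stays at the start of the object; all-zero-index GEPs (the bitcast-like case) qualify, anything else could widen or move the access out of bounds. The screen, restated in isolation (illustrative only, not the author's code):

    #include "llvm/Instructions.h"
    #include "llvm/Operator.h"
    using namespace llvm;

    // True when P is an alloca, possibly behind a GEP whose indices are all
    // zero. A non-zero-index GEP is rejected: speculating a load through it
    // could read past the end of the allocation.
    static bool basePointerIsAlloca(Value *P) {
      if (GEPOperator *GEP = dyn_cast<GEPOperator>(P))
        if (GEP->hasAllZeroIndices())
          P = GEP->getPointerOperand();
      return isa<AllocaInst>(P);
    }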
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index f64b220..c13696f 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -33,7 +33,9 @@ using namespace llvm;
 User::op_iterator CallSite::getCallee() const {
   Instruction *II(getInstruction());
   return isCall()
-    ? cast<CallInst>(II)->op_begin()
+    ? (CallInst::ArgOffset
+       ? cast</*FIXME: CallInst*/User>(II)->op_begin()
+       : cast</*FIXME: CallInst*/User>(II)->op_end() - 1)
     : cast<InvokeInst>(II)->op_end() - 3; // Skip BB, BB, Function
 }
 
@@ -231,8 +233,7 @@ CallInst::~CallInst() {
 
 void CallInst::init(Value *Func, Value* const *Params, unsigned NumParams) {
   assert(NumOperands == NumParams+1 && "NumOperands not set up?");
-  Use *OL = OperandList;
-  OL[0] = Func;
+  Op<ArgOffset -1>() = Func;
 
   const FunctionType *FTy =
     cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
@@ -245,16 +246,15 @@ void CallInst::init(Value *Func, Value* const *Params, unsigned NumParams) {
     assert((i >= FTy->getNumParams() ||
             FTy->getParamType(i) == Params[i]->getType()) &&
           "Calling a function with a bad signature!");
-    OL[i+1] = Params[i];
+    OperandList[i + ArgOffset] = Params[i];
   }
 }
 
 void CallInst::init(Value *Func, Value *Actual1, Value *Actual2) {
   assert(NumOperands == 3 && "NumOperands not set up?");
-  Use *OL = OperandList;
-  OL[0] = Func;
-  OL[1] = Actual1;
-  OL[2] = Actual2;
+  Op<ArgOffset -1>() = Func;
+  Op<ArgOffset + 0>() = Actual1;
+  Op<ArgOffset + 1>() = Actual2;
 
   const FunctionType *FTy =
     cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
@@ -273,9 +273,8 @@ void CallInst::init(Value *Func, Value *Actual1, Value *Actual2) {
 
 void CallInst::init(Value *Func, Value *Actual) {
   assert(NumOperands == 2 && "NumOperands not set up?");
-  Use *OL = OperandList;
-  OL[0] = Func;
-  OL[1] = Actual;
+  Op<ArgOffset -1>() = Func;
+  Op<ArgOffset + 0>() = Actual;
 
   const FunctionType *FTy =
     cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
@@ -291,8 +290,7 @@ void CallInst::init(Value *Func, Value *Actual) {
 
 void CallInst::init(Value *Func) {
   assert(NumOperands == 1 && "NumOperands not set up?");
-  Use *OL = OperandList;
-  OL[0] = Func;
+  Op<ArgOffset -1>() = Func;
 
   const FunctionType *FTy =
     cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
@@ -473,9 +471,10 @@ static Instruction *createMalloc(Instruction *InsertBefore,
 Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
                                     const Type *IntPtrTy, const Type *AllocTy,
                                     Value *AllocSize, Value *ArraySize,
+                                    Function * MallocF,
                                     const Twine &Name) {
   return createMalloc(InsertBefore, NULL, IntPtrTy, AllocTy, AllocSize,
-                      ArraySize, NULL, Name);
+                      ArraySize, MallocF, Name);
 }
 
 /// CreateMalloc - Generate the IR for a call to malloc:
@@ -527,8 +526,8 @@ static Instruction* createFree(Value* Source, Instruction *InsertBefore,
 }
 
 /// CreateFree - Generate the IR for a call to the builtin free function.
-void CallInst::CreateFree(Value* Source, Instruction *InsertBefore) {
-  createFree(Source, InsertBefore, NULL);
+Instruction * CallInst::CreateFree(Value* Source, Instruction *InsertBefore) {
+  return createFree(Source, InsertBefore, NULL);
 }
 
 /// CreateFree - Generate the IR for a call to the builtin free function.
@@ -828,8 +827,8 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
   else {
     assert(!isa<BasicBlock>(Amt) &&
           "Passed basic block into allocation size parameter! Use other ctor");
-    assert(Amt->getType()->isIntegerTy(32) &&
-           "Allocation array size is not a 32-bit integer!");
+    assert(Amt->getType()->isIntegerTy() &&
+           "Allocation array size is not an integer!");
  }
  return Amt;
 }
@@ -1456,7 +1455,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, const unsigned *Idx,
   Op<0>() = Agg;
   Op<1>() = Val;
 
-  Indices.insert(Indices.end(), Idx, Idx + NumIdx);
+  Indices.append(Idx, Idx + NumIdx);
   setName(Name);
 }
 
@@ -1509,7 +1508,7 @@ void ExtractValueInst::init(const unsigned *Idx, unsigned NumIdx,
                             const Twine &Name) {
   assert(NumOperands == 1 && "NumOperands not initialized?");
 
-  Indices.insert(Indices.end(), Idx, Idx + NumIdx);
+  Indices.append(Idx, Idx + NumIdx);
   setName(Name);
 }
 
@@ -1911,9 +1910,12 @@ bool CastInst::isLosslessCast() const {
 /// # bitcast i32* %x to i8*
 /// # bitcast <2 x i32> %x to <4 x i16>
 /// # ptrtoint i32* %x to i32     ; on 32-bit platforms only
-/// @brief Determine if a cast is a no-op.
-bool CastInst::isNoopCast(const Type *IntPtrTy) const {
-  switch (getOpcode()) {
+/// @brief Determine if the described cast is a no-op.
+bool CastInst::isNoopCast(Instruction::CastOps Opcode,
+                          const Type *SrcTy,
+                          const Type *DestTy,
+                          const Type *IntPtrTy) {
+  switch (Opcode) {
     default:
       assert(!"Invalid CastOp");
     case Instruction::Trunc:
@@ -1930,13 +1932,18 @@ bool CastInst::isNoopCast(const Type *IntPtrTy) const {
       return true;  // BitCast never modifies bits.
     case Instruction::PtrToInt:
       return IntPtrTy->getScalarSizeInBits() ==
-             getType()->getScalarSizeInBits();
+             DestTy->getScalarSizeInBits();
     case Instruction::IntToPtr:
       return IntPtrTy->getScalarSizeInBits() ==
-             getOperand(0)->getType()->getScalarSizeInBits();
+             SrcTy->getScalarSizeInBits();
   }
 }
 
+/// @brief Determine if a cast is a no-op.
+bool CastInst::isNoopCast(const Type *IntPtrTy) const {
+  return isNoopCast(getOpcode(), getOperand(0)->getType(), getType(), IntPtrTy);
+}
+
 /// This function determines if a pair of casts can be eliminated and what
 /// opcode should be used in the elimination. This assumes that there are two
 /// instructions like this:
@@ -1999,6 +2006,14 @@ unsigned CastInst::isEliminableCastPair(
     { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr      |
     {  5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast      -+
   };
+
+  // If either of the casts are a bitcast from scalar to vector, disallow the
+  // merging.
+  if ((firstOp == Instruction::BitCast &&
+       isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) ||
+      (secondOp == Instruction::BitCast &&
+       isa<VectorType>(MidTy) != isa<VectorType>(DstTy)))
+    return 0; // Disallowed
 
   int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
                             [secondOp-Instruction::CastOpsBegin];
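The ArgOffset arithmetic above encodes a transitional operand layout for CallInst: with ArgOffset == 1 the callee is operand 0 and argument i is operand i + 1, while an ArgOffset == 0 build stores arguments first and the callee in the final slot (Op<ArgOffset - 1>() then wraps around to the end). Assuming ArgOffset remains visible to clients, the argument accessor reduces to this sketch:

    #include "llvm/Instructions.h"
    using namespace llvm;

    // Spelled-out equivalent of CI->getArgOperand(i) under either layout;
    // real code should use the accessor and never index operands directly.
    static Value *argOperand(CallInst *CI, unsigned i) {
      return CI->getOperand(i + CallInst::ArgOffset);
    }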
diff --git a/lib/VMCore/IntrinsicInst.cpp b/lib/VMCore/IntrinsicInst.cpp
index c37d5b0..ac8ec20 100644
--- a/lib/VMCore/IntrinsicInst.cpp
+++ b/lib/VMCore/IntrinsicInst.cpp
@@ -54,7 +54,7 @@ Value *DbgInfoIntrinsic::StripCast(Value *C) {
 ///
 
 Value *DbgDeclareInst::getAddress() const {
-  if (MDNode* MD = cast_or_null<MDNode>(getOperand(1)))
+  if (MDNode* MD = cast_or_null<MDNode>(getArgOperand(0)))
     return MD->getOperand(0);
   else
     return NULL;
@@ -65,9 +65,9 @@ Value *DbgDeclareInst::getAddress() const {
 ///
 
 const Value *DbgValueInst::getValue() const {
-  return cast<MDNode>(getOperand(1))->getOperand(0);
+  return cast<MDNode>(getArgOperand(0))->getOperand(0);
 }
 
 Value *DbgValueInst::getValue() {
-  return cast<MDNode>(getOperand(1))->getOperand(0);
+  return cast<MDNode>(getArgOperand(0))->getOperand(0);
 }
diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp
index b894ea3..1d3a058 100644
--- a/lib/VMCore/Metadata.cpp
+++ b/lib/VMCore/Metadata.cpp
@@ -133,6 +133,7 @@ static const Function *getFunctionForValue(Value *V) {
 static const Function *assertLocalFunction(const MDNode *N) {
   if (!N->isFunctionLocal()) return 0;
 
+  // FIXME: This does not handle cyclic function local metadata.
   const Function *F = 0, *NewF = 0;
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     if (Value *V = N->getOperand(i)) {
diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp
index 94840f0..38a51df 100644
--- a/lib/VMCore/Module.cpp
+++ b/lib/VMCore/Module.cpp
@@ -17,6 +17,7 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/GVMaterializer.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/LeakDetector.h"
@@ -311,9 +312,11 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const {
 
 /// getNamedMetadata - Return the first NamedMDNode in the module with the
 /// specified name. This method returns null if a NamedMDNode with the
-//// specified name is not found.
-NamedMDNode *Module::getNamedMetadata(StringRef Name) const {
-  return NamedMDSymTab->lookup(Name);
+/// specified name is not found.
+NamedMDNode *Module::getNamedMetadata(const Twine &Name) const {
+  SmallString<256> NameData;
+  StringRef NameRef = Name.toStringRef(NameData);
+  return NamedMDSymTab->lookup(NameRef);
 }
 
 /// getOrInsertNamedMetadata - Return the first named MDNode in the module
diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp
index a60877d..efd98af 100644
--- a/lib/VMCore/Pass.cpp
+++ b/lib/VMCore/Pass.cpp
@@ -35,6 +35,15 @@ using namespace llvm;
 // Pass Implementation
 //
 
+Pass::Pass(PassKind K, intptr_t pid) : Resolver(0), PassID(pid), Kind(K) {
+  assert(pid && "pid cannot be 0");
+}
+
+Pass::Pass(PassKind K, const void *pid)
+  : Resolver(0), PassID((intptr_t)pid), Kind(K) {
+  assert(pid && "pid cannot be 0");
+}
+
 // Force out-of-line virtual method.
 Pass::~Pass() {
   delete Resolver;
@@ -92,6 +101,23 @@ void Pass::verifyAnalysis() const {
   // By default, don't do anything.
 }
 
+void *Pass::getAdjustedAnalysisPointer(const PassInfo *) {
+  return this;
+}
+
+ImmutablePass *Pass::getAsImmutablePass() {
+  return 0;
+}
+
+PMDataManager *Pass::getAsPMDataManager() {
+  return 0;
+}
+
+void Pass::setResolver(AnalysisResolver *AR) {
+  assert(!Resolver && "Resolver is already set");
+  Resolver = AR;
+}
+
 // print - Print out the internal state of the pass.  This is called by Analyze
 // to print out the contents of an analysis.  Otherwise it is not necessary to
 // implement this method.
@@ -364,6 +390,14 @@ void PassInfo::unregisterPass() {
   getPassRegistrar()->UnregisterPass(*this);
 }
 
+Pass *PassInfo::createPass() const {
+  assert((!isAnalysisGroup() || NormalCtor) &&
+         "No default implementation found for analysis group!");
+  assert(NormalCtor &&
+         "Cannot call createPass on PassInfo without default ctor!");
+  return NormalCtor();
+}
+
 //===----------------------------------------------------------------------===//
 //                  Analysis Group Implementation Code
 //===----------------------------------------------------------------------===//
@@ -467,4 +501,15 @@ void AnalysisUsage::setPreservesCFG() {
   GetCFGOnlyPasses(Preserved).enumeratePasses();
 }
 
+AnalysisUsage &AnalysisUsage::addRequiredID(AnalysisID ID) {
+  assert(ID && "Pass class not registered!");
+  Required.push_back(ID);
+  return *this;
+}
+AnalysisUsage &AnalysisUsage::addRequiredTransitiveID(AnalysisID ID) {
+  assert(ID && "Pass class not registered!");
+  Required.push_back(ID);
+  RequiredTransitive.push_back(ID);
+  return *this;
+}
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index a56938c..296b0d1 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -1147,6 +1147,11 @@ void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
   llvm_unreachable("Unable to schedule pass");
 }
 
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, const PassInfo *PI, Function &F) {
+  assert(0 && "Unable to find on the fly pass");
+  return NULL;
+}
+
 // Destructor
 PMDataManager::~PMDataManager() {
   for (SmallVector<Pass *, 8>::iterator I = PassVector.begin(),
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index 645dd5a..585edf0 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -322,7 +322,13 @@ void Value::replaceAllUsesWith(Value *New) {
 Value *Value::stripPointerCasts() {
   if (!getType()->isPointerTy())
     return this;
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+
   Value *V = this;
+  Visited.insert(V);
   do {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
       if (!GEP->hasAllZeroIndices())
@@ -338,7 +344,9 @@ Value *Value::stripPointerCasts() {
       return V;
     }
     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (1);
+  } while (Visited.insert(V));
+
+  return V;
 }
 
 Value *Value::getUnderlyingObject(unsigned MaxLookup) {
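The Visited set added above closes a real termination hole: stripPointerCasts can be called on an instruction in an unreachable block, where a cast or GEP chain may feed itself, and the old while (1) loop then never exited. The defensive pattern on its own (a generic sketch, not the patch itself):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Instruction.h"
    #include "llvm/Operator.h"
    using namespace llvm;

    // Walk through no-op bitcasts, stopping on revisit: SmallPtrSet::insert
    // returns false for an already-seen value, which breaks the cycles that
    // only unreachable code can form.
    static Value *followBitCasts(Value *V) {
      SmallPtrSet<Value*, 4> Visited;
      Visited.insert(V);
      while (Operator *Op = dyn_cast<Operator>(V)) {
        if (Op->getOpcode() != Instruction::BitCast)
          break;
        V = Op->getOperand(0);
        if (!Visited.insert(V))
          break;
      }
      return V;
    }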
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 75988cc..f97699d 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -85,7 +85,8 @@ namespace {  // Anonymous namespace for class
 
       for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
         if (I->empty() || !I->back().isTerminator()) {
-          dbgs() << "Basic Block does not have terminator!\n";
+          dbgs() << "Basic Block in function '" << F.getName()
+                 << "' does not have terminator!\n";
           WriteAsOperand(dbgs(), I, true);
           dbgs() << "\n";
           Broken = true;
@@ -1356,7 +1357,7 @@ void Verifier::visitLoadInst(LoadInst &LI) {
 
 void Verifier::visitStoreInst(StoreInst &SI) {
   const PointerType *PTy = dyn_cast<PointerType>(SI.getOperand(1)->getType());
-  Assert1(PTy, "Load operand must be a pointer.", &SI);
+  Assert1(PTy, "Store operand must be a pointer.", &SI);
   const Type *ElTy = PTy->getElementType();
   Assert2(ElTy == SI.getOperand(0)->getType(),
           "Stored value type does not match pointer operand type!",
@@ -1371,8 +1372,8 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
           &AI);
   Assert1(PTy->getElementType()->isSized(),
           "Cannot allocate unsized type",
           &AI);
-  Assert1(AI.getArraySize()->getType()->isIntegerTy(32),
-          "Alloca array size must be i32", &AI);
+  Assert1(AI.getArraySize()->getType()->isIntegerTy(),
+          "Alloca array size must have integer type", &AI);
   visitInstruction(AI);
 }
 
@@ -1453,7 +1454,7 @@ void Verifier::visitInstruction(Instruction &I) {
     if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
       // Check to make sure that the "address of" an intrinsic function is never
       // taken.
-      Assert1(!F->isIntrinsic() || (i == 0 && isa<CallInst>(I)),
+      Assert1(!F->isIntrinsic() || (i + 1 == e && isa<CallInst>(I)),
              "Cannot take the address of an intrinsic!", &I);
       Assert1(F->getParent() == Mod, "Referencing function in another module!",
              &I);
@@ -1536,7 +1537,8 @@ void Verifier::visitInstruction(Instruction &I) {
               "Instruction does not dominate all uses!", Op, &I);
     }
   } else if (isa<InlineAsm>(I.getOperand(i))) {
-    Assert1((i == 0 && isa<CallInst>(I)) || (i + 3 == e && isa<InvokeInst>(I)),
+    Assert1((i + 1 == e && isa<CallInst>(I)) ||
+            (i + 3 == e && isa<InvokeInst>(I)),
            "Cannot take the address of an inline asm!", &I);
   }
 }
@@ -1628,24 +1630,24 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
 
   // If the intrinsic takes MDNode arguments, verify that they are either global
  // or are local to *this* function.
-  for (unsigned i = 1, e = CI.getNumOperands(); i != e; ++i)
-    if (MDNode *MD = dyn_cast<MDNode>(CI.getOperand(i)))
+  for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i)
+    if (MDNode *MD = dyn_cast<MDNode>(CI.getArgOperand(i)))
       visitMDNode(*MD, CI.getParent()->getParent());
 
   switch (ID) {
   default:
     break;
   case Intrinsic::dbg_declare: {  // llvm.dbg.declare
-    Assert1(CI.getOperand(1) && isa<MDNode>(CI.getOperand(1)),
+    Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
                 "invalid llvm.dbg.declare intrinsic call 1", &CI);
-    MDNode *MD = cast<MDNode>(CI.getOperand(1));
+    MDNode *MD = cast<MDNode>(CI.getArgOperand(0));
     Assert1(MD->getNumOperands() == 1,
                 "invalid llvm.dbg.declare intrinsic call 2", &CI);
   } break;
   case Intrinsic::memcpy:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-    Assert1(isa<ConstantInt>(CI.getOperand(4)),
+    Assert1(isa<ConstantInt>(CI.getArgOperand(3)),
             "alignment argument of memory intrinsics must be a constant int",
             &CI);
     break;
@@ -1654,10 +1656,10 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   case Intrinsic::gcread:
     if (ID == Intrinsic::gcroot) {
       AllocaInst *AI =
-        dyn_cast<AllocaInst>(CI.getOperand(1)->stripPointerCasts());
+        dyn_cast<AllocaInst>(CI.getArgOperand(0)->stripPointerCasts());
       Assert1(AI && AI->getType()->getElementType()->isPointerTy(),
               "llvm.gcroot parameter #1 must be a pointer alloca.", &CI);
-      Assert1(isa<Constant>(CI.getOperand(2)),
+      Assert1(isa<Constant>(CI.getArgOperand(1)),
               "llvm.gcroot parameter #2 must be a constant.", &CI);
     }
 
@@ -1665,32 +1667,32 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     Assert1(CI.getParent()->getParent()->hasGC(),
             "Enclosing function does not use GC.", &CI);
     break;
   case Intrinsic::init_trampoline:
-    Assert1(isa<Function>(CI.getOperand(2)->stripPointerCasts()),
+    Assert1(isa<Function>(CI.getArgOperand(1)->stripPointerCasts()),
             "llvm.init_trampoline parameter #2 must resolve to a function.",
             &CI);
     break;
   case Intrinsic::prefetch:
-    Assert1(isa<ConstantInt>(CI.getOperand(2)) &&
-            isa<ConstantInt>(CI.getOperand(3)) &&
-            cast<ConstantInt>(CI.getOperand(2))->getZExtValue() < 2 &&
-            cast<ConstantInt>(CI.getOperand(3))->getZExtValue() < 4,
+    Assert1(isa<ConstantInt>(CI.getArgOperand(1)) &&
+            isa<ConstantInt>(CI.getArgOperand(2)) &&
+            cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue() < 2 &&
+            cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue() < 4,
             "invalid arguments to llvm.prefetch",
            &CI);
     break;
   case Intrinsic::stackprotector:
-    Assert1(isa<AllocaInst>(CI.getOperand(2)->stripPointerCasts()),
+    Assert1(isa<AllocaInst>(CI.getArgOperand(1)->stripPointerCasts()),
            "llvm.stackprotector parameter #2 must resolve to an alloca.",
            &CI);
     break;
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
   case Intrinsic::invariant_start:
-    Assert1(isa<ConstantInt>(CI.getOperand(1)),
+    Assert1(isa<ConstantInt>(CI.getArgOperand(0)),
            "size argument of memory use markers must be a constant integer",
            &CI);
     break;
   case Intrinsic::invariant_end:
-    Assert1(isa<ConstantInt>(CI.getOperand(2)),
+    Assert1(isa<ConstantInt>(CI.getArgOperand(1)),
            "llvm.invariant.end parameter #2 must be a constant integer", &CI);
     break;
   }
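One consequence of the relaxed verifier rule above ("Alloca array size must have integer type"): a dynamically sized alloca may now carry, say, an i64 element count, which the matching getAISize change in Instructions.cpp also accepts. A hedged sketch (the helper name is ours):

    #include "llvm/DerivedTypes.h"
    #include "llvm/Instructions.h"
    using namespace llvm;

    // Builds 'alloca i32, i64 %n'. Under the old rule the verifier insisted
    // the array size be exactly i32; any integer width now passes.
    static AllocaInst *createVLA(LLVMContext &C, Value *N64,
                                 Instruction *InsertBefore) {
      return new AllocaInst(Type::getInt32Ty(C), N64, "vla", InsertBefore);
    }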